From 18b35ea2d30354e300488bdf35c66da86f637bcb Mon Sep 17 00:00:00 2001
From: zjm <1746163329@qq.com>
Date: Thu, 30 May 2024 15:50:46 +0800
Subject: [PATCH] ddpm

---
 .../conf/km_re1000_rs256_conditional.yaml     |  64 ++
 ...e1000_rs256_sparse_recons_conditional.yaml |  50 ++
 examples/ddpm/denoise_step.py                 | 201 +++++++
 examples/ddpm/functions.py                    | 557 ++++++++++++++++++
 examples/ddpm/main.py                         | 117 ++++
 examples/ddpm/utils.py                        | 223 +++++++
 ppsci/arch/__init__.py                        |   3 +
 ppsci/arch/ddpm.py                            | 411 +++++++++++++
 8 files changed, 1626 insertions(+)
 create mode 100644 examples/ddpm/conf/km_re1000_rs256_conditional.yaml
 create mode 100644 examples/ddpm/conf/kmflow_re1000_rs256_sparse_recons_conditional.yaml
 create mode 100644 examples/ddpm/denoise_step.py
 create mode 100644 examples/ddpm/functions.py
 create mode 100644 examples/ddpm/main.py
 create mode 100644 examples/ddpm/utils.py
 create mode 100644 ppsci/arch/ddpm.py

diff --git a/examples/ddpm/conf/km_re1000_rs256_conditional.yaml b/examples/ddpm/conf/km_re1000_rs256_conditional.yaml
new file mode 100644
index 0000000000..a929e8cf6e
--- /dev/null
+++ b/examples/ddpm/conf/km_re1000_rs256_conditional.yaml
@@ -0,0 +1,64 @@
+mode: train
+train:
+    seed: 1234
+    output_dir: 'experiments/km256/'
+    doc: 'weights/km256/'
+    timesteps: 1000
+    log_path: ''
+    verbose: 'info'
+    ni: false
+    comment: ''
+    resume_training: false
+data:
+    dataset: "kolmogorov flow"
+    data_dir: "/home/aistudio/data/data264003/train_data.npy"
+    stat_path: "/home/aistudio/data/data264003/km256_stats.npz"
+    image_size: 256
+    channels: 3
+    logit_transform: false
+    uniform_dequantization: false
+    gaussian_dequantization: false
+    random_flip: false
+    rescaled: false
+    num_workers: 0
+
+model:
+    type: "conditional"
+    in_channels: 3
+    out_ch: 3
+    ch: 64
+    ch_mult: [1, 1, 1, 2]
+    num_res_blocks: 1
+    attn_resolutions: [16, ]
+    dropout: 0.1
+    var_type: fixedlarge
+    ema_rate: 0.9999
+    ema: True
+    resamp_with_conv: True
+    ckpt_path: '/home/aistudio/data/data264003/init_ckpt.pdparams'
+
+diffusion:
+    beta_schedule: linear
+    beta_start: 0.0001
+    beta_end: 0.02
+    num_diffusion_timesteps: 1000 # Might need to be changed to 500 later to match SDEdit
+
+training:
+    batch_size: 1
+    n_epochs: 20 # 300 epoch for about 12 hours
+    n_iters: 200000
+    snapshot_freq: 20000
+    validation_freq: 2000
+
+sampling:
+    batch_size: 32
+    last_only: True
+
+optim:
+    weight_decay: 0.000
+    optimizer: "Adam"
+    lr: 0.0002
+    beta1: 0.9
+    amsgrad: false
+    eps: 0.00000001
+    grad_clip: 1.0
diff --git a/examples/ddpm/conf/kmflow_re1000_rs256_sparse_recons_conditional.yaml b/examples/ddpm/conf/kmflow_re1000_rs256_sparse_recons_conditional.yaml
new file mode 100644
index 0000000000..4ddc0f3ee3
--- /dev/null
+++ b/examples/ddpm/conf/kmflow_re1000_rs256_sparse_recons_conditional.yaml
@@ -0,0 +1,50 @@
+log_dir: "./experiments/kmflow_re1000_rs256_ddim_recons_conditional_log"
+mode: eval
+place: 'cuda'
+data:
+    dataset: "kmflow"
+    category: "kmflow"
+    image_size: 256
+    channels: 3
+    num_workers: 0
+    data_dir: "/home/aistudio/data/data264003/kf_2d_re1000_256_40seed.npy"
+    sample_data_dir: "/home/aistudio/data/data264003/kmflow_sampled_data_irregnew.npz"
+    stat_path: "/home/aistudio/data/data264003/km256_stats.npz"
+    data_kw: 'u3232'
+    smoothing: True
+    smoothing_scale: 7
+
+model:
+    type: "conditional"
+    in_channels: 3
+    out_ch: 3
+    ch: 64
+    ch_mult: [ 1, 1, 1, 2 ]
+    num_res_blocks: 1
+    attn_resolutions: [ 16, ]
+    dropout: 0.0
+    var_type: fixedlarge
+    ema_rate: 0.9999
+    ema: True
+    resamp_with_conv: True
+    ckpt_path: "/home/aistudio/data/data264003/latest.pdparams"
+
+diffusion:
+    beta_schedule: linear
+    beta_start: 0.0001
+    beta_end: 0.02
+    num_diffusion_timesteps: 1000
+eval:
+    repeat_run: 1
+    sample_step: 1
+    t: 240
+    reverse_steps: 30
+    seed: 1234
+
+
+sampling:
+    batch_size: 20
+    last_only: True
+    guidance_weight: 0.
+    log_loss: True
+    dump_arr: True
diff --git a/examples/ddpm/denoise_step.py b/examples/ddpm/denoise_step.py
new file mode 100644
index 0000000000..a03c9cdd7b
--- /dev/null
+++ b/examples/ddpm/denoise_step.py
@@ -0,0 +1,201 @@
+import paddle
+
+
+def compute_alpha(beta, t):
+    beta = paddle.concat(x=[paddle.zeros(shape=[1]).to("gpu"), beta], axis=0)
+    beta_sub = 1 - beta
+
+    # 2. 计算累积乘积
+    cumprod_beta = paddle.cumprod(beta_sub, dim=0)
+
+    # 3. 根据t + 1的值选择特定的元素
+    # 注意：Paddle中使用gather函数来实现类似的索引选择功能
+    # 假设t是一个标量，我们需要将其转换为Tensor，如果t已经是Tensor则不需要转换
+    if not isinstance(t, paddle.Tensor):
+        t = paddle.to_tensor(t, dtype="int64")
+    selected = paddle.gather(cumprod_beta, index=t + 1, axis=0)
+
+    # 4. 改变张量的形状
+    # PaddlePaddle使用reshape函数来改变形状
+    a = paddle.reshape(selected, shape=[-1, 1, 1, 1])
+    #     """Class Method: *.view, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually"""
+    # >>>>>>    a = (1 - beta).cumprod(dim=0).index_select(axis=0, index=t + 1).view(-1,
+    #         1, 1, 1)
+    return a
+
+
+def ddim_steps(x, seq, model, b, **kwargs):
+    n = x.shape[0]
+    seq_next = [-1] + list(seq[:-1])
+    x0_preds = []
+    xs = [x]
+    dx_func = kwargs.get("dx_func", None)
+    clamp_func = kwargs.get("clamp_func", None)
+    cache = kwargs.get("cache", False)
+    logger = kwargs.get("logger", None)
+    if logger is not None:
+        logger.update(x=xs[-1])
+    for i, j in zip(reversed(seq), reversed(seq_next)):
+        with paddle.no_grad():
+            t = (paddle.ones(shape=n) * i).to(x.place)
+            next_t = (paddle.ones(shape=n) * j).to(x.place)
+            at = compute_alpha(b, t.astype(dtype="int64"))
+            at_next = compute_alpha(b, next_t.astype(dtype="int64"))
+            xt = xs[-1].to("cuda")
+            et = model(xt, t)
+            x0_t = (xt - et * (1 - at).sqrt()) / at.sqrt()
+            x0_preds.append(x0_t.to("cpu"))
+            c2 = (1 - at_next).sqrt()
+        if dx_func is not None:
+            dx = dx_func(xt)
+        else:
+            dx = 0
+        with paddle.no_grad():
+            xt_next = at_next.sqrt() * x0_t + c2 * et - dx
+            if clamp_func is not None:
+                xt_next = clamp_func(xt_next)
+            xs.append(xt_next.to("cpu"))
+        if logger is not None:
+            logger.update(x=xs[-1])
+        if not cache:
+            xs = xs[-1:]
+            x0_preds = x0_preds[-1:]
+    return xs, x0_preds
+
+
+def ddpm_steps(x, seq, model, b, **kwargs):
+    n = x.shape[0]
+    seq_next = [-1] + list(seq[:-1])
+    xs = [x]
+    x0_preds = []
+    betas = b
+    dx_func = kwargs.get("dx_func", None)
+    cache = kwargs.get("cache", False)
+    clamp_func = kwargs.get("clamp_func", None)
+    for i, j in zip(reversed(seq), reversed(seq_next)):
+        with paddle.no_grad():
+            t = (paddle.ones(shape=n) * i).to(x.place)
+            next_t = (paddle.ones(shape=n) * j).to(x.place)
+            at = compute_alpha(betas, t.astype(dtype="int64"))
+            atm1 = compute_alpha(betas, next_t.astype(dtype="int64"))
+            beta_t = 1 - at / atm1
+            x = xs[-1].to("cuda")
+            output = model(x, t.astype(dtype="float32"))
+            e = output
+            x0_from_e = (1.0 / at).sqrt() * x - (1.0 / at - 1).sqrt() * e
+            x0_from_e = paddle.clip(x=x0_from_e, min=-1, max=1)
+            x0_preds.append(x0_from_e.to("cpu"))
+            mean_eps = (
+                atm1.sqrt() * beta_t * x0_from_e + (1 - beta_t).sqrt() * (1 - atm1) * x
+            ) / (1.0 - at)
+            mean = mean_eps
+            noise = paddle.randn(shape=x.shape, dtype=x.dtype)
+            mask = 1 - (t == 0).astype(dtype="float32")
+            """Class Method: *.view, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually"""
+            # >>>>>>            mask = mask.view(-1, 1, 1, 1)
+            mask = paddle.reshape(mask, shape=[-1, 1, 1, 1])
+            logvar = beta_t.log()
+        if dx_func is not None:
+            dx = dx_func(x)
+        else:
+            dx = 0
+        with paddle.no_grad():
+            sample = mean + mask * paddle.exp(x=0.5 * logvar) * noise - dx
+            if clamp_func is not None:
+                sample = clamp_func(sample)
+            xs.append(sample.to("cpu"))
+        if not cache:
+            xs = xs[-1:]
+            x0_preds = x0_preds[-1:]
+    return xs, x0_preds
+
+
+def guided_ddpm_steps(x, seq, model, b, **kwargs):
+    n = x.shape[0]
+    seq_next = [-1] + list(seq[:-1])
+    xs = [x]
+    x0_preds = []
+    betas = b
+    dx_func = kwargs.get("dx_func", None)
+    if dx_func is None:
+        raise ValueError("dx_func is required for guided denoising")
+    clamp_func = kwargs.get("clamp_func", None)
+    cache = kwargs.get("cache", False)
+    w = kwargs.get("w", 3.0)
+    for i, j in zip(reversed(seq), reversed(seq_next)):
+        with paddle.no_grad():
+            t = (paddle.ones(shape=n) * i).to(x.place)
+            next_t = (paddle.ones(shape=n) * j).to(x.place)
+            at = compute_alpha(betas, t.astype(dtype="int64"))
+            atm1 = compute_alpha(betas, next_t.astype(dtype="int64"))
+            beta_t = 1 - at / atm1
+            x = xs[-1].to("cuda")
+        dx = dx_func(x)
+        with paddle.no_grad():
+            output = (w + 1) * model(x, t.astype(dtype="float32"), dx) - w * model(
+                x, t.astype(dtype="float32")
+            )
+            e = output
+            x0_from_e = (1.0 / at).sqrt() * x - (1.0 / at - 1).sqrt() * e
+            x0_from_e = paddle.clip(x=x0_from_e, min=-1, max=1)
+            x0_preds.append(x0_from_e.to("cpu"))
+            mean_eps = (
+                atm1.sqrt() * beta_t * x0_from_e + (1 - beta_t).sqrt() * (1 - atm1) * x
+            ) / (1.0 - at)
+            mean = mean_eps
+            noise = paddle.randn(shape=x.shape, dtype=x.dtype)
+            mask = 1 - (t == 0).astype(dtype="float32")
+            """Class Method: *.view, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually"""
+            # >>>>>>            mask = mask.view(-1, 1, 1, 1)
+            mask = paddle.reshape(mask, shape=[-1, 1, 1, 1])
+            logvar = beta_t.log()
+        with paddle.no_grad():
+            sample = mean + mask * paddle.exp(x=0.5 * logvar) * noise - dx
+            if clamp_func is not None:
+                sample = clamp_func(sample)
+            xs.append(sample.to("cpu"))
+        if not cache:
+            xs = xs[-1:]
+            x0_preds = x0_preds[-1:]
+    return xs, x0_preds
+
+
+def guided_ddim_steps(x, seq, model, b, **kwargs):
+    n = x.shape[0]
+    seq_next = [-1] + list(seq[:-1])
+    x0_preds = []
+    xs = [x]
+    dx_func = kwargs.get("dx_func", None)
+    if dx_func is None:
+        raise ValueError("dx_func is required for guided denoising")
+    clamp_func = kwargs.get("clamp_func", None)
+    cache = kwargs.get("cache", False)
+    w = kwargs.get("w", 3.0)
+    logger = kwargs.get("logger", None)
+    if logger is not None:
+        xs[-1] = paddle.to_tensor(xs[-1], place=paddle.CUDAPlace(0))  # 将张量转移到 CUDA 设备上
+        logger.update(x=xs[-1])
+    for i, j in zip(reversed(seq), reversed(seq_next)):
+        with paddle.no_grad():
+            t = (paddle.ones(shape=n) * i).to("gpu")
+            next_t = (paddle.ones(shape=n) * j).to("gpu")
+            at = compute_alpha(b, t.astype(dtype="int64"))
+            at_next = compute_alpha(b, next_t.astype(dtype="int64"))
+            xt = xs[-1].to("gpu")
+        dx = dx_func(xt)
+        with paddle.no_grad():
+            et = (w + 1) * model(xt, t, dx) - w * model(xt, t)
+            x0_t = (xt - et * (1 - at).sqrt()) / at.sqrt()
+            x0_preds.append(x0_t.to("cpu"))
+            c2 = (1 - at_next).sqrt()
+        with paddle.no_grad():
+            xt_next = at_next.sqrt() * x0_t + c2 * et - dx
+            if clamp_func is not None:
+                xt_next = clamp_func(xt_next)
+            xs.append(xt_next.to("cpu"))
+        if logger is not None:
+            logger.update(x=xs[-1])
+        if not cache:
+            xs = xs[-1:]
+            x0_preds = x0_preds[-1:]
+    return xs, x0_preds
diff --git a/examples/ddpm/functions.py b/examples/ddpm/functions.py
new file mode 100644
index 0000000000..539b73b91d
--- /dev/null
+++ b/examples/ddpm/functions.py
@@ -0,0 +1,557 @@
+import logging
+import os
+import pickle
+import sys
+
+import matplotlib.pyplot as plt
+import numpy as np
+import paddle
+from denoise_step import ddim_steps
+from denoise_step import guided_ddim_steps
+from mpl_toolkits.axes_grid1 import ImageGrid
+from PIL import Image
+
+sys.path.append("home/aistudio/work/PaddleScience/ppsci/arch")
+sys.path.append("/home/aistudio/external-libraries")
+
+
+def normalize_array(x):
+    x_min = np.amin(x)
+    x_max = np.amax(x)
+    y = (x - x_min) / (x_max - x_min)
+    return y, x_min, x_max
+
+
+def unnormalize_array(y, x_min, x_max):
+    return y * (x_max - x_min) + x_min
+
+
+def data_blurring(fno_data_sample):
+    ds_size = 16
+    resample_method = Image.NEAREST
+    x_array, x_min, x_max = normalize_array(fno_data_sample.numpy())
+    im = Image.fromarray((x_array * 255).astype(np.uint8))
+    im_ds = im.resize((ds_size, ds_size))
+    im_us = im_ds.resize((im.width, im.height), resample=resample_method)
+    x_array_blur = np.asarray(im_us)
+    x_array_blur = x_array_blur.astype(np.float32) / 255.0
+    x_array_blur = unnormalize_array(x_array_blur, x_min, x_max)
+    return paddle.to_tensor(data=x_array_blur)
+
+
+class MetricLogger(object):
+    def __init__(self, metric_fn_dict):
+        self.metric_fn_dict = metric_fn_dict
+        self.metric_dict = {}
+        self.reset()
+
+    def reset(self):
+        for key in self.metric_fn_dict.keys():
+            self.metric_dict[key] = []
+
+    @paddle.no_grad()
+    def update(self, **kwargs):
+        for key in self.metric_fn_dict.keys():
+            self.metric_dict[key].append(self.metric_fn_dict[key](**kwargs))
+
+    def get(self):
+        return self.metric_dict.copy()
+
+    def log(self, outdir, postfix=""):
+        # 准备一个新字典，用于存储可以被pickle序列化的对象
+        serializable_metric_dict = {}
+
+        def to_serializable(value):
+            """将值转换为可序列化的格式"""
+            if isinstance(value, paddle.Tensor):
+                # 将Tensor转换为NumPy数组
+                return value.numpy()
+            elif isinstance(value, dict):
+                # 递归处理字典中的值
+                return {k: to_serializable(v) for k, v in value.items()}
+            elif isinstance(value, list):
+                # 递归处理列表中的值
+                return [to_serializable(v) for v in value]
+            else:
+                # 其他类型的值直接返回
+                return value
+
+        # 遍历metric_dict并转换所有值
+        for key, value in self.metric_dict.items():
+            serializable_metric_dict[key] = to_serializable(value)
+
+        # 使用修改后的字典进行序列化
+        with open(os.path.join(outdir, f"metric_log_{postfix}.pkl"), "wb") as f:
+            pickle.dump(serializable_metric_dict, f)
+
+
+def get_beta_schedule(*, beta_start, beta_end, num_diffusion_timesteps):
+    betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
+    assert betas.shape == (num_diffusion_timesteps,)
+    return betas
+
+
+def load_flow_data(path, stat_path=None):
+    data = np.load(path)
+    print("Original data shape:", data.shape)
+    data_mean, data_scale = np.mean(data[:-4]), np.std(data[:-4])
+    print(f"Data range: mean: {data_mean} scale: {data_scale}")
+    data = data[-4:, (...)].copy().astype(np.float32)
+    data = paddle.to_tensor(data=data, dtype="float32")
+    flattened_data = []
+    for i in range(data.shape[0]):
+        for j in range(data.shape[1] - 2):
+            flattened_data.append(data[(i), j : j + 3, (...)])
+    flattened_data = paddle.stack(x=flattened_data, axis=0)
+    print(f"data shape: {flattened_data.shape}")
+    return flattened_data, data_mean.item(), data_scale.item()
+
+
+def create_gaussian_kernel(kernel_size, sigma):
+    # 创建一个二维高斯核
+    ax = paddle.arange(-kernel_size // 2 + 1.0, kernel_size // 2 + 1.0)
+    xx, yy = paddle.meshgrid(ax, ax)
+    kernel = paddle.exp(-(xx**2 + yy**2) / (2.0 * sigma**2))
+    kernel = kernel / paddle.sum(kernel)
+    return kernel
+
+
+def gaussian_blur(image, kernel_size, sigma):
+    # 创建高斯核
+    gaussian_kernel = create_gaussian_kernel(kernel_size, sigma)
+
+    # 扩展维度以匹配卷积函数的权重格式 [Co, Ci, H, W]
+    gaussian_kernel = gaussian_kernel.reshape([1, 1, kernel_size, kernel_size])
+    gaussian_kernel = paddle.tile(
+        gaussian_kernel, repeat_times=[image.shape[1], 1, 1, 1]
+    )
+
+    # 使用高斯核进行卷积
+    # PaddlePaddle的conv2d需要输入的形状是[N, C, H, W]，其中N是批量大小，C是通道数
+    # 因此，可能需要调整输入tensor的形状以符合这个要求
+    padding = kernel_size // 2
+    blurred_image = paddle.nn.functional.conv2d(
+        image, weight=gaussian_kernel, stride=1, padding=padding, groups=image.shape[1]
+    )
+    return blurred_image
+
+
+def load_recons_data(ref_path, sample_path, data_kw, smoothing, smoothing_scale):
+    with np.load(sample_path, allow_pickle=True) as f:
+        sampled_data = f[data_kw][-4:, (...)].copy().astype(np.float32)
+    sampled_data = paddle.to_tensor(data=sampled_data, dtype="float32")
+    ref_data = np.load(ref_path).astype(np.float32)
+    data_mean, data_scale = np.mean(ref_data[:-4]), np.std(ref_data[:-4])
+    ref_data = ref_data[-4:, (...)].copy().astype(np.float32)
+    ref_data = paddle.to_tensor(data=ref_data, dtype="float32")
+    flattened_sampled_data = []
+    flattened_ref_data = []
+    for i in range(ref_data.shape[0]):
+        for j in range(ref_data.shape[1] - 2):
+            flattened_ref_data.append(ref_data[(i), j : j + 3, (...)])
+            flattened_sampled_data.append(sampled_data[(i), j : j + 3, (...)])
+    flattened_ref_data = paddle.stack(x=flattened_ref_data, axis=0)
+    flattened_sampled_data = paddle.stack(x=flattened_sampled_data, axis=0)
+    if smoothing:
+        arr = flattened_sampled_data
+        ker_size = smoothing_scale
+        arr = paddle.nn.functional.pad(
+            arr,
+            pad=(
+                (ker_size - 1) // 2,
+                (ker_size - 1) // 2,
+                (ker_size - 1) // 2,
+                (ker_size - 1) // 2,
+            ),
+            mode="circular",
+        )
+        arr = gaussian_blur(arr, ker_size, ker_size)
+        flattened_sampled_data = arr[
+            (...),
+            (ker_size - 1) // 2 : -(ker_size - 1) // 2,
+            (ker_size - 1) // 2 : -(ker_size - 1) // 2,
+        ]
+    print(f"data shape: {flattened_ref_data.shape}")
+    return (
+        flattened_ref_data,
+        flattened_sampled_data,
+        data_mean.item(),
+        data_scale.item(),
+    )
+
+
+class MinMaxScaler(object):
+    def __init__(self, min, max):
+        self.min = min
+        self.max = max
+
+    def __call__(self, x):
+        return x - self.min
+
+    def inverse(self, x):
+        return x * (self.max - self.min) + self.min
+
+    def scale(self):
+        return self.max - self.min
+
+
+class StdScaler(object):
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, x):
+        return (x - self.mean) / self.std
+
+    def inverse(self, x):
+        return x * self.std + self.mean
+
+    def scale(self):
+        return self.std
+
+
+def nearest_blur_image(data, scale):
+    blur_data = data[:, :, ::scale, ::scale]
+    return blur_data
+
+
+def gaussian_blur_image(data, scale):
+    blur_data = paddle.visualization.transforms.GaussianBlur(
+        kernel_size=scale, sigma=2 * scale + 1
+    )(data)
+    return blur_data
+
+
+def random_square_hole_mask(data, hole_size):
+    h, w = data.shape[2:]
+    mask = paddle.zeros(shape=data.shape, dtype="int64").to(data.place)
+    hole_x = np.random.randint(0, w - hole_size)
+    hole_y = np.random.randint(0, h - hole_size)
+    mask[(...), hole_y : hole_y + hole_size, hole_x : hole_x + hole_size] = 1
+    return mask
+
+
+def make_image_grid(images, out_path, ncols=10):
+    t, h, w = images.shape
+    images = images.detach().cpu().numpy()
+    b = t // ncols
+    fig = plt.figure(figsize=(8.0, 8.0))
+    grid = ImageGrid(fig, 111, nrows_ncols=(b, ncols))
+    for ax, im_no in zip(grid, np.arange(b * ncols)):
+        ax.imshow(images[(im_no), :, :], cmap="twilight", vmin=-23, vmax=23)
+        ax.axis("off")
+    plt.savefig(out_path, bbox_inches="tight")
+    plt.close()
+
+
+def ensure_dir(path):
+    if not os.path.exists(path):
+        os.makedirs(path)
+
+
+def slice2sequence(data):
+    # 首先，选取第二个维度的特定切片
+    selected_slice = data[:, 1:2, :, :]  # 选取F维度的第二个切片，保持形状为 [T, 1, H, W]
+
+    # 然后，如果你想要合并T和F维度，可以使用reshape
+    # 但在这个例子中，由于F维度只有一个元素，实际上你不需要合并，只需要去掉F这一维
+    data = paddle.squeeze(selected_slice, axis=1)  # 结果形状为 [T, H, W]
+    return data
+
+
+def l1_loss(x, y):
+    return paddle.mean(x=paddle.abs(x=x - y))
+
+
+def l2_loss(x, y):
+    return ((x - y) ** 2).mean(axis=(-1, -2)).sqrt().mean()
+
+
+def float_to_complex(x):
+    x = paddle.cast(x, dtype="float32")
+    img_x = paddle.zeros_like(x)
+    x = paddle.complex(x, img_x)
+    return x
+
+
+def voriticity_residual(w, re=1000.0, dt=1 / 32, calc_grad=True):
+    w = w.clone()
+    w.stop_gradient = not True
+    nx = w.shape[2]
+    w_h = paddle.fft.fft2(x=w[:, 1:-1], axes=[2, 3])
+    k_max = nx // 2
+    N = nx
+    k_x = (
+        paddle.concat(
+            x=(
+                paddle.arange(start=0, end=k_max, step=1),
+                paddle.arange(start=-k_max, end=0, step=1),
+            ),
+            axis=0,
+        )
+        .reshape([N, 1])
+        .tile(repeat_times=[1, N])
+        .reshape([1, 1, N, N])
+    )
+    k_y = (
+        paddle.concat(
+            x=(
+                paddle.arange(start=0, end=k_max, step=1),
+                paddle.arange(start=-k_max, end=0, step=1),
+            ),
+            axis=0,
+        )
+        .reshape([1, N])
+        .tile(repeat_times=[N, 1])
+        .reshape([1, 1, N, N])
+    )
+
+    lap = k_x**2 + k_y**2
+
+    k_x = float_to_complex(k_x)
+    k_y = float_to_complex(k_y)
+
+    lap[..., 0, 0] = 1.0
+
+    lap = float_to_complex(lap)
+
+    psi_h = w_h / lap
+    u_h = 1.0j * k_y * psi_h
+    v_h = -1.0j * k_x * psi_h
+    wx_h = 1.0j * k_x * w_h
+    wy_h = 1.0j * k_y * w_h
+    wlap_h = -lap * w_h
+    u = paddle.fft.irfft2(x=u_h[(...), :, : k_max + 1], axes=[2, 3])
+    v = paddle.fft.irfft2(x=v_h[(...), :, : k_max + 1], axes=[2, 3])
+    wx = paddle.fft.irfft2(x=wx_h[(...), :, : k_max + 1], axes=[2, 3])
+    wy = paddle.fft.irfft2(x=wy_h[(...), :, : k_max + 1], axes=[2, 3])
+    wlap = paddle.fft.irfft2(x=wlap_h[(...), :, : k_max + 1], axes=[2, 3])
+    advection = u * wx + v * wy
+    wt = (w[:, 2:, :, :] - w[:, :-2, :, :]) / (2 * dt)
+    x = paddle.linspace(start=0, stop=2 * np.pi, num=nx + 1)
+    x = x[0:-1]
+    X, Y = paddle.meshgrid(x, x)
+    f = -4 * paddle.cos(x=4 * Y)
+    residual = wt + (advection - 1.0 / re * wlap + 0.1 * w[:, 1:-1]) - f
+    residual_loss = (residual**2).mean()
+    if calc_grad:
+        dw = paddle.grad(outputs=residual_loss, inputs=w)[0]
+        return dw, residual_loss
+    else:
+        return residual_loss
+
+
+def l2_loss_fn(x, scaler, gt):
+    return l2_loss(scaler.inverse(x).to("gpu"), gt)
+
+
+def equation_loss_fn(x, scaler):
+    return voriticity_residual(scaler.inverse(x), calc_grad=False)
+
+
+def physical_gradient_func1(x, scaler):
+    return voriticity_residual(scaler.inverse(x))[0] / scaler.scale()
+
+
+def physical_gradient_func2(x, scaler, config):
+    return (
+        voriticity_residual(scaler.inverse(x))[0]
+        / scaler.scale()
+        * config.sampling.lambda_
+    )
+
+
+def eval_input(model, config):
+    args = config.eval
+    os.makedirs(config.log_dir, exist_ok=True)
+    if config.model.type == "conditional":
+        dir_name = "recons_{}_t{}_r{}_w{}".format(
+            config.data.data_kw,
+            args.t,
+            args.reverse_steps,
+            config.sampling.guidance_weight,
+        )
+    else:
+        dir_name = "recons_{}_t{}_r{}_lam{}".format(
+            config.data.data_kw, args.t, args.reverse_steps, config.sampling.lambda_
+        )
+    if config.model.type == "conditional":
+        print("Use residual gradient guidance during sampling")
+        dir_name = "guided_" + dir_name
+    elif config.sampling.lambda_ > 0:
+        print("Use residual gradient penalty during sampling")
+        dir_name = "pi_" + dir_name
+    else:
+        print("Not use physical gradient during sampling")
+    log_dir = os.path.join(config.log_dir, dir_name)
+    os.makedirs(log_dir, exist_ok=True)
+    logger = logging.getLogger("LOG")
+    logger.setLevel(logging.INFO)
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    file_handler = logging.FileHandler("%s/%s.txt" % (log_dir, "logging_info"))
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+    image_sample_dir = log_dir
+    device = "gpu" if paddle.device.cuda.device_count() >= 1 else "cpu"
+    config.place = str(paddle.CUDAPlace(0))
+    betas = get_beta_schedule(
+        beta_start=config.diffusion.beta_start,
+        beta_end=config.diffusion.beta_end,
+        num_diffusion_timesteps=config.diffusion.num_diffusion_timesteps,
+    )
+    betas = paddle.to_tensor(data=betas).astype(dtype="float32").to(device)
+    # alphas = 1.0 - betas
+
+    # restuct
+    logger.info("Doing sparse reconstruction task")
+    logger.info("Loading model")
+    model.to("gpu")
+    logger.info("Model loaded")
+    model.eval()
+    logger.info("Preparing data")
+    ref_data, blur_data, data_mean, data_std = load_recons_data(
+        config.data.data_dir,
+        config.data.sample_data_dir,
+        config.data.data_kw,
+        smoothing=config.data.smoothing,
+        smoothing_scale=config.data.smoothing_scale,
+    )
+    scaler = StdScaler(data_mean, data_std)
+    logger.info("Start sampling")
+    testset = paddle.io.TensorDataset([blur_data, ref_data])
+    test_loader = paddle.io.DataLoader(
+        testset,
+        batch_size=config.sampling.batch_size,
+        shuffle=False,
+        num_workers=config.data.num_workers,
+        use_shared_memory=False,
+    )
+    l2_loss_all = np.zeros((ref_data.shape[0], args.repeat_run, args.sample_step))
+    residual_loss_all = np.zeros((ref_data.shape[0], args.repeat_run, args.sample_step))
+    for batch_index, (blur_data, data) in enumerate(test_loader):
+        logger.info("Batch: {} / Total batch {}".format(batch_index, len(test_loader)))
+        x0 = blur_data.to(device)
+        gt = data.to(device)
+        logger.info("Preparing reference image")
+        logger.info("Dumping visualization...")
+        sample_folder = "sample_batch{}".format(batch_index)
+        ensure_dir(os.path.join(image_sample_dir, sample_folder))
+        sample_img_filename = "input_image.png"
+        path_to_dump = os.path.join(
+            image_sample_dir, sample_folder, sample_img_filename
+        )
+        x0_masked = x0.clone()
+        make_image_grid(slice2sequence(x0_masked), path_to_dump)
+        sample_img_filename = "reference_image.png"
+        path_to_dump = os.path.join(
+            image_sample_dir, sample_folder, sample_img_filename
+        )
+        make_image_grid(slice2sequence(gt), path_to_dump)
+        if config.sampling.dump_arr:
+            np.save(
+                os.path.join(image_sample_dir, sample_folder, "input_arr.npy"),
+                slice2sequence(x0).cpu().numpy(),
+            )
+            np.save(
+                os.path.join(image_sample_dir, sample_folder, "reference_arr.npy"),
+                slice2sequence(data).cpu().numpy(),
+            )
+        l2_loss_init = l2_loss(x0, gt)
+        logger.info("L2 loss init: {}".format(l2_loss_init))
+        gt_residual = voriticity_residual(gt)[1].detach()
+        init_residual = voriticity_residual(x0)[1].detach()
+        logger.info("Residual init: {}".format(init_residual))
+        logger.info("Residual reference: {}".format(gt_residual))
+        x0 = scaler(x0)
+        xinit = x0.clone()
+        if config.sampling.log_loss:
+            logger1 = MetricLogger(
+                {"l2 loss": l2_loss_fn, "residual loss": equation_loss_fn}
+            )
+        for repeat in range(args.repeat_run):
+            logger.info(f"Run No.{repeat}:")
+            x0 = xinit.clone()
+            for it in range(args.sample_step):
+                e = paddle.randn(shape=x0.shape, dtype=x0.dtype)
+                total_noise_levels = int(args.t * 0.7**it)
+                a = (1 - betas).cumprod(dim=0)
+                x = (
+                    x0 * a[total_noise_levels - 1].sqrt()
+                    + e * (1.0 - a[total_noise_levels - 1]).sqrt()
+                )
+                if config.model.type == "conditional":
+                    physical_gradient_func = physical_gradient_func1(x, scaler)
+                elif config.sampling.lambda_ > 0:
+                    physical_gradient_func = physical_gradient_func2(x, scaler, config)
+                num_of_reverse_steps = int(args.reverse_steps * 0.7**it)
+                betas = betas.to(device)
+                skip = total_noise_levels // num_of_reverse_steps
+                seq = range(0, total_noise_levels, skip)
+                if config.model.type == "conditional":
+                    xs, _ = guided_ddim_steps(
+                        x,
+                        seq,
+                        model,
+                        betas,
+                        w=config.sampling.guidance_weight,
+                        dx_func=physical_gradient_func,
+                        cache=False,
+                        logger=logger1,
+                    )
+                elif config.sampling.lambda_ > 0:
+                    xs, _ = ddim_steps(
+                        x,
+                        seq,
+                        model,
+                        betas,
+                        dx_func=physical_gradient_func,
+                        cache=False,
+                        logger=logger1,
+                    )
+                else:
+                    xs, _ = ddim_steps(
+                        x, seq, model, betas, cache=False, logger=logger1
+                    )
+                x = xs[-1]
+                x0 = xs[-1]
+                l2_loss_f = l2_loss(scaler.inverse(x.clone()).to("gpu"), gt)
+                logger.info("L2 loss it{}: {}".format(it, l2_loss_f))
+                residual_loss_f = voriticity_residual(
+                    scaler.inverse(x.clone()), calc_grad=False
+                ).detach()
+                logger.info("Residual it{}: {}".format(it, residual_loss_f))
+                l2_loss_all[
+                    batch_index * x.shape[0] : (batch_index + 1) * x.shape[0],
+                    (repeat),
+                    (it),
+                ] = l2_loss_f.item()
+                residual_loss_all[
+                    batch_index * x.shape[0] : (batch_index + 1) * x.shape[0],
+                    (repeat),
+                    (it),
+                ] = residual_loss_f.item()
+                if config.sampling.dump_arr:
+                    np.save(
+                        os.path.join(
+                            image_sample_dir,
+                            sample_folder,
+                            f"sample_arr_run_{repeat}_it{it}.npy",
+                        ),
+                        slice2sequence(scaler.inverse(x)).cpu().numpy(),
+                    )
+                if config.sampling.log_loss:
+                    logger1.log(
+                        os.path.join(image_sample_dir, sample_folder),
+                        f"run_{repeat}_it{it}",
+                    )
+                    logger1.reset()
+        logger.info("Finished batch {}".format(batch_index))
+        logger.info("========================================================")
+    logger.info("Finished sampling")
+    logger.info(f"mean l2 loss: {l2_loss_all[..., -1].mean()}")
+    logger.info(f"std l2 loss: {l2_loss_all[..., -1].std(axis=1).mean()}")
+    logger.info(f"mean residual loss: {residual_loss_all[..., -1].mean()}")
+    logger.info(f"std residual loss: {residual_loss_all[..., -1].std(axis=1).mean()}")
diff --git a/examples/ddpm/main.py b/examples/ddpm/main.py
new file mode 100644
index 0000000000..5aac165d99
--- /dev/null
+++ b/examples/ddpm/main.py
@@ -0,0 +1,117 @@
+from os import path as osp
+
+import functions
+import hydra
+import numpy as np
+import paddle
+import utils
+from omegaconf import DictConfig
+
+import ppsci
+from ppsci.utils import logger
+
+
+def train(cfg):
+    # 设置PaddlePaddle的随机种子
+    paddle.seed(1234)
+
+    # 设置NumPy的随机种子
+    np.random.seed(1234)
+
+    # 生成随机数
+    print(paddle.randn([1]))  # [0.63745004]
+    print(np.random.rand(1))  # [0.19151945]
+    args = cfg.train
+    config = cfg
+    data = np.load(cfg.data.data_dir)
+    coords = data
+    # set random seed for reproducibility
+    ppsci.utils.misc.set_random_seed(args.seed)
+    # initialize logger
+    logger.init_logger("ppsci", osp.join(args.output_dir, f"{cfg.mode}.log"), "info")
+    model = ppsci.arch.DDPM(config=config)
+    ppsci.utils.load_pretrain(model, config.model.ckpt_path)
+
+    def _transform_out(_out):
+        return utils.transform_out(_out)
+
+    model.register_input_transform(utils.transform_in)
+    model.register_output_transform(_transform_out)
+    # x = paddle.ones(shape=[1, 3, 256, 256])
+    # n = 1
+    # # 创建一个值为15的张量
+    # t = paddle.full(shape=[1], fill_value=15)
+
+    # # 拼接t和1000-t-1，并取前n个元素
+    # t = paddle.concat([t, 1000 - t - 1], axis=0)[:n]
+    # dx = paddle.ones(shape=[n, 3, 256, 256])
+    # output = model.forward(x, t, dx)
+
+    # print('Model output:')
+    # print(output)
+    # raise NotImplementedError("not supported")
+    sup_constraint_pde = ppsci.constraint.SupervisedConstraint(
+        {
+            "dataset": {
+                "name": "NamedArrayDataset",
+                "input": {
+                    "x0": coords,
+                },
+            },
+            "batch_size": cfg.training.batch_size,
+            "num_workers": cfg.data.num_workers,
+        },
+        ppsci.loss.FunctionalLoss(utils.train_loss_func),
+        {
+            "loss": lambda out: out["loss"],
+        },
+        name="sup_train",
+    )
+    constraint_pde = {sup_constraint_pde.name: sup_constraint_pde}
+
+    # initialize solver
+    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=config.optim.grad_clip)
+    optimizer = ppsci.optimizer.Adam(config.optim.lr, grad_clip=clip)(model)
+    solver = ppsci.solver.Solver(
+        model,
+        constraint_pde,
+        output_dir=args.output_dir,
+        optimizer=optimizer,
+        save_freq=2,
+        epochs=cfg.training.n_epochs,
+        iters_per_epoch=11448,
+        pretrained_model_path="/home/aistudio/data/data264003/latest.pdparams",
+        validator=None,
+    )
+
+    solver.train()
+
+
+def evaluate(cfg):
+    config = cfg
+    if config.model.type == "conditional":
+        print("Using conditional model")
+        model = ppsci.arch.DDPM(config=config)
+    else:
+        # 原始代码中没有unconditional的model，所以这里直接raise NotImplemente
+        raise NotImplementedError("not supported")
+    ppsci.utils.load_pretrain(model, config.model.ckpt_path)
+    functions.eval_input(model, config)
+
+
+@hydra.main(
+    version_base=None,
+    config_path="conf/",
+    config_name="kmflow_re1000_rs256_sparse_recons_conditional.yaml",
+)
+def main(cfg: DictConfig):
+    if cfg.mode == "train":
+        train(cfg)
+    elif cfg.mode == "eval":
+        evaluate(cfg)
+    else:
+        raise ValueError(f"cfg.mode should in ['train', 'eval'], but got '{cfg.mode}'")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/ddpm/utils.py b/examples/ddpm/utils.py
new file mode 100644
index 0000000000..7ad70ea2f7
--- /dev/null
+++ b/examples/ddpm/utils.py
@@ -0,0 +1,223 @@
+import numpy as np
+import paddle
+
+
+def transform_in(x, config, p=0.1):
+    """
+    对输入数据进行变换，并返回变换后的数据、时间步长、残差和噪声。
+
+    Args:
+        x (dict): 包含输入数据的字典，键为 "x0"，值为 Paddle Tensor。
+        config (dict): 配置参数字典，包含扩散参数、数据路径等。
+        p (float, optional): 概率值，用于决定是否计算残差。默认为 0.1。
+
+    Returns:
+        tuple: 包含四个元素的元组，分别为：
+            - x (Paddle Tensor): 变换后的数据。
+            - t (Paddle Tensor): 时间步长。
+            - dx (Paddle Tensor or None): 残差，如果 flag < p 则为 None。
+            - e (Paddle Tensor): 噪声。
+
+    """
+    x = x["x0"]
+    # print(x)
+    betas = get_beta_schedule_train(
+        beta_schedule=config.diffusion.beta_schedule,
+        beta_start=config.diffusion.beta_start,
+        beta_end=config.diffusion.beta_end,
+        num_diffusion_timesteps=config.diffusion.num_diffusion_timesteps,
+    )
+    betas = paddle.to_tensor(data=betas).astype(dtype="float32")
+    num_timesteps = betas.shape[0]
+    b = betas
+    n = x.shape[0]
+    e = paddle.randn(shape=x.shape, dtype=x.dtype)
+    t = paddle.randint(low=0, high=num_timesteps, shape=(n // 2 + 1,))
+    t = paddle.concat(x=[t, num_timesteps - t - 1], axis=0)[:n]
+    data = np.load(config.data.stat_path)
+    x_offset = data["mean"]
+    x_scale = data["scale"]
+    beta_sub = 1 - b
+    cumprod_beta = paddle.cumprod(beta_sub, dim=0)
+    if not isinstance(t, paddle.Tensor):
+        t = paddle.to_tensor(t, dtype="int64")
+    selected = paddle.gather(cumprod_beta, index=t, axis=0)
+    a = paddle.reshape(selected, shape=[-1, 1, 1, 1])
+    x = x * a.sqrt() + e * (1.0 - a).sqrt()
+    flag = np.random.uniform(0, 1)
+    if flag < p:
+        dx = None
+    else:
+        dx = voriticity_residual_train(x * x_scale + x_offset) / x_scale
+    return x, t, dx, e
+
+
+def transform_out(outputs):
+    loss = (outputs[1] - outputs[0]).square().sum(axis=(1, 2, 3)).mean(axis=0)
+
+    return {"loss": loss}
+
+
+def train_loss_func(result_dict, *args) -> paddle.Tensor:
+    """For model calculation of loss.
+
+    Args:
+        result_dict (Dict[str, paddle.Tensor]): The result dict.
+
+    Returns:
+        paddle.Tensor: Loss value.
+    """
+    return result_dict["loss"]
+
+
+def float_to_complex(x):
+    x = paddle.cast(x, dtype="float32")
+    img_x = paddle.zeros_like(x)
+    x = paddle.complex(x, img_x)
+    return x
+
+
+def voriticity_residual_train(w, re=1000.0, dt=1 / 32):
+    w = w.clone()
+    out_5 = w
+    out_5.stop_gradient = not True
+    out_5
+    nx = w.shape[2]
+    w_h = paddle.fft.fft2(x=w[:, 1:-1], axes=[2, 3])
+    k_max = nx // 2
+    N = nx
+    k_x = (
+        paddle.concat(
+            x=(
+                paddle.arange(start=0, end=k_max, step=1),
+                paddle.arange(start=-k_max, end=0, step=1),
+            ),
+            axis=0,
+        )
+        .reshape([N, 1])
+        .tile(repeat_times=[1, N])
+        .reshape([1, 1, N, N])
+    )
+    k_y = (
+        paddle.concat(
+            x=(
+                paddle.arange(start=0, end=k_max, step=1),
+                paddle.arange(start=-k_max, end=0, step=1),
+            ),
+            axis=0,
+        )
+        .reshape([1, N])
+        .tile(repeat_times=[N, 1])
+        .reshape([1, 1, N, N])
+    )
+
+    lap = k_x**2 + k_y**2
+    lap = float_to_complex(lap)
+    k_x = float_to_complex(k_x)
+    k_y = float_to_complex(k_y)
+    lap[..., 0, 0] = 1.0
+    psi_h = w_h / lap
+    u_h = 1.0j * k_y * psi_h
+    v_h = -1.0j * k_x * psi_h
+    wx_h = 1.0j * k_x * w_h
+    wy_h = 1.0j * k_y * w_h
+    wlap_h = -lap * w_h
+    u = paddle.fft.irfft2(x=u_h[(...), :, : k_max + 1], axes=[2, 3])
+    v = paddle.fft.irfft2(x=v_h[(...), :, : k_max + 1], axes=[2, 3])
+    wx = paddle.fft.irfft2(x=wx_h[(...), :, : k_max + 1], axes=[2, 3])
+    wy = paddle.fft.irfft2(x=wy_h[(...), :, : k_max + 1], axes=[2, 3])
+    wlap = paddle.fft.irfft2(x=wlap_h[(...), :, : k_max + 1], axes=[2, 3])
+    advection = u * wx + v * wy
+    wt = (w[:, 2:, :, :] - w[:, :-2, :, :]) / (2 * dt)
+    x = paddle.linspace(start=0, stop=2 * np.pi, num=nx + 1)
+    x = x[0:-1]
+    X, Y = paddle.meshgrid(x, x)
+    f = -4 * paddle.cos(x=4 * Y)
+    residual = wt + (advection - 1.0 / re * wlap + 0.1 * w[:, 1:-1]) - f
+    residual_loss = (residual**2).mean()
+    dw = paddle.grad(outputs=residual_loss, inputs=w)[0]
+    return dw
+
+
+def get_beta_schedule_train(
+    beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps
+):
+    def sigmoid(x):
+        return 1 / (np.exp(-x) + 1)
+
+    if beta_schedule == "quad":
+        betas = (
+            np.linspace(
+                beta_start**0.5,
+                beta_end**0.5,
+                num_diffusion_timesteps,
+                dtype=np.float64,
+            )
+            ** 2
+        )
+    elif beta_schedule == "linear":
+        betas = np.linspace(
+            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
+        )
+    elif beta_schedule == "const":
+        betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
+    elif beta_schedule == "jsd":
+        betas = 1.0 / np.linspace(
+            num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
+        )
+    elif beta_schedule == "sigmoid":
+        betas = np.linspace(-6, 6, num_diffusion_timesteps)
+        betas = sigmoid(betas) * (beta_end - beta_start) + beta_start
+    else:
+        raise NotImplementedError(beta_schedule)
+    assert betas.shape == (num_diffusion_timesteps,)
+    return betas
+
+
+class EMAHelper(object):
+    def __init__(self, mu=0.999):
+        self.mu = mu
+        self.shadow = {}
+
+    def register(self, module):
+        if isinstance(module, paddle.DataParallel):
+            module = module.module
+        for name, param in module.named_parameters():
+            if not param.stop_gradient:
+                self.shadow[name] = param.data.clone()
+
+    def update(self, module):
+        if isinstance(module, paddle.DataParallel):
+            module = module.module
+        for name, param in module.named_parameters():
+            if not param.stop_gradient:
+                self.shadow[name].data = (
+                    1.0 - self.mu
+                ) * param.data + self.mu * self.shadow[name].data
+
+    def ema(self, module):
+        if isinstance(module, paddle.DataParallel):
+            module = module.module
+        for name, param in module.named_parameters():
+            if not param.stop_gradient:
+                param.data.copy_(self.shadow[name].data)
+
+    def ema_copy(self, module):
+        if isinstance(module, paddle.DataParallel):
+            inner_module = module.module
+            module_copy = type(inner_module)(inner_module.config).to(
+                inner_module.config.device
+            )
+            module_copy.set_state_dict(state_dict=inner_module.state_dict())
+            module_copy = paddle.DataParallel(module_copy)
+        else:
+            module_copy = type(module)(module.config).to(module.config.device)
+            module_copy.set_state_dict(state_dict=module.state_dict())
+        self.ema(module_copy)
+        return module_copy
+
+    def state_dict(self):
+        return self.shadow
+
+    def load_state_dict(self, state_dict):
+        self.shadow = state_dict
diff --git a/ppsci/arch/__init__.py b/ppsci/arch/__init__.py
index bcb0bfcc97..8a6219df49 100644
--- a/ppsci/arch/__init__.py
+++ b/ppsci/arch/__init__.py
@@ -46,10 +46,12 @@
 from ppsci.arch.sfnonet import SFNONet  # isort:skip
 from ppsci.arch.tfnonet import TFNO1dNet, TFNO2dNet, TFNO3dNet  # isort:skip
 from ppsci.arch.unonet import UNONet  # isort:skip
+from ppsci.arch.ddpm import DDPM # isort:skip
 from ppsci.arch.cuboid_transformer import CuboidTransformer  # isort:skip
 from ppsci.utils import logger  # isort:skip
 
 
+
 __all__ = [
     "Arch",
     "AMGNet",
@@ -84,6 +86,7 @@
     "UNONet",
     "build_model",
     "CFDGCN",
+    "DDPM",
 ]
 
 
diff --git a/ppsci/arch/ddpm.py b/ppsci/arch/ddpm.py
new file mode 100644
index 0000000000..c985be1f47
--- /dev/null
+++ b/ppsci/arch/ddpm.py
@@ -0,0 +1,411 @@
+import math
+
+import paddle
+
+from ppsci.arch import base
+
+
+def get_timestep_embedding(timesteps, embedding_dim):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models:
+    From Fairseq.
+    Build sinusoidal embeddings.
+    This matches the implementation in tensor2tensor, but differs slightly
+    from the description in Section 3.5 of "Attention Is All You Need".
+    """
+    assert len(timesteps.shape) == 1
+    half_dim = embedding_dim // 2
+    emb = math.log(10000) / (half_dim - 1)
+    emb = paddle.exp(x=paddle.arange(dtype="float32", end=half_dim) * -emb)
+    emb = emb.to(device="gpu")
+    emb = timesteps.astype(dtype="float32")[:, (None)] * emb[(None), :]
+    emb = paddle.concat(x=[paddle.sin(x=emb), paddle.cos(x=emb)], axis=1)
+    if embedding_dim % 2 == 1:
+        emb = paddle.nn.functional.pad(emb, pad=(0, 1, 0, 0), mode="constant", value=0)
+    return emb
+
+
+def nonlinearity(x):
+    return x * paddle.nn.functional.sigmoid(x=x)
+
+
+def Normalize(in_channels):
+    return paddle.nn.GroupNorm(
+        num_groups=8,
+        num_channels=in_channels,
+        epsilon=1e-06,
+        weight_attr=True,
+        bias_attr=True,
+    )
+
+
+class Upsample(paddle.nn.Layer):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = paddle.nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                padding_mode="circular",
+            )
+
+    def forward(self, x):
+        x = paddle.nn.functional.interpolate(x=x, scale_factor=2.0, mode="nearest")
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+
+
+class Downsample(paddle.nn.Layer):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            self.conv = paddle.nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                kernel_size=3,
+                stride=2,
+                padding=0,
+            )
+
+    def forward(self, x):
+        if self.with_conv:
+            pad = (0, 1, 0, 1)
+            x = paddle.nn.functional.pad(x, pad=pad, mode="circular")
+            x = self.conv(x)
+        else:
+            x = paddle.nn.functional.avg_pool2d(
+                kernel_size=2, stride=2, x=x, exclusive=False
+            )
+        return x
+
+
+class ResnetBlock(paddle.nn.Layer):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout,
+        temb_channels=512
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = paddle.nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            padding_mode="circular",
+        )
+        self.temb_proj = paddle.nn.Linear(
+            in_features=temb_channels, out_features=out_channels
+        )
+        self.norm2 = Normalize(out_channels)
+        self.dropout = paddle.nn.Dropout(p=dropout)
+        self.conv2 = paddle.nn.Conv2D(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            padding_mode="circular",
+        )
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = paddle.nn.Conv2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1,
+                    padding_mode="circular",
+                )
+            else:
+                self.nin_shortcut = paddle.nn.Conv2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=1,
+                    stride=1,
+                    padding=0,
+                )
+
+    def forward(self, x, temb):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = h + self.temb_proj(nonlinearity(temb))[:, :, (None), (None)]
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+
+
+class AttnBlock(paddle.nn.Layer):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        self.q = paddle.nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.k = paddle.nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.v = paddle.nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        self.proj_out = paddle.nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape
+        q = q.reshape([b, c, h * w])
+        q = q.transpose(perm=[0, 2, 1])
+        k = k.reshape([b, c, h * w])
+        w_ = paddle.bmm(x=q, y=k)
+        w_ = w_ * int(c) ** -0.5
+        w_ = paddle.nn.functional.softmax(x=w_, axis=2)
+        v = v.reshape([b, c, h * w])
+        w_ = w_.transpose(perm=[0, 2, 1])
+        h_ = paddle.bmm(x=v, y=w_)
+        h_ = h_.reshape([b, c, h, w])
+        h_ = self.proj_out(h_)
+        return x + h_
+
+
+class DDPM(base.Arch):
+    def __init__(self, config):
+        super(DDPM, self).__init__()
+        self.config = config
+        ch, out_ch, ch_mult = (
+            config.model.ch,
+            config.model.out_ch,
+            tuple(config.model.ch_mult),
+        )
+        num_res_blocks = config.model.num_res_blocks
+        attn_resolutions = config.model.attn_resolutions
+        dropout = config.model.dropout
+        in_channels = config.model.in_channels
+        resolution = config.data.image_size
+        resamp_with_conv = config.model.resamp_with_conv
+        num_timesteps = config.diffusion.num_diffusion_timesteps
+        if config.model.type == "bayesian":
+            out_3 = paddle.create_parameter(
+                shape=paddle.zeros(shape=num_timesteps).shape,
+                dtype=paddle.zeros(shape=num_timesteps).numpy().dtype,
+                default_initializer=paddle.nn.initializer.Assign(
+                    paddle.zeros(shape=num_timesteps)
+                ),
+            )
+            out_3.stop_gradient = not True
+            self.logvar = out_3
+        self.ch = ch
+        self.temb_ch = self.ch * 4
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.temb = paddle.nn.Layer()
+        self.temb.dense = paddle.nn.LayerList(
+            sublayers=[
+                paddle.nn.Linear(in_features=self.ch, out_features=self.temb_ch),
+                paddle.nn.Linear(in_features=self.temb_ch, out_features=self.temb_ch),
+            ]
+        )
+        self.emb_conv = paddle.nn.Sequential(
+            paddle.nn.Conv2D(
+                in_channels=in_channels,
+                out_channels=self.ch,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            paddle.nn.GELU(),
+            paddle.nn.Conv2D(
+                in_channels=self.ch,
+                out_channels=self.ch,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                padding_mode="circular",
+            ),
+        )
+        self.conv_in = paddle.nn.Conv2D(
+            in_channels=in_channels,
+            out_channels=self.ch,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            padding_mode="circular",
+        )
+        self.combine_conv = paddle.nn.Conv2D(
+            in_channels=self.ch * 2,
+            out_channels=self.ch,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        )
+        curr_res = resolution
+        in_ch_mult = (1,) + ch_mult
+        self.down = paddle.nn.LayerList()
+        block_in = None
+        for i_level in range(self.num_resolutions):
+            block = paddle.nn.LayerList()
+            attn = paddle.nn.LayerList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = paddle.nn.Layer()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        self.mid = paddle.nn.Layer()
+        self.mid.block_1 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(
+            in_channels=block_in,
+            out_channels=block_in,
+            temb_channels=self.temb_ch,
+            dropout=dropout,
+        )
+        self.up = paddle.nn.LayerList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = paddle.nn.LayerList()
+            attn = paddle.nn.LayerList()
+            block_out = ch * ch_mult[i_level]
+            skip_in = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                if i_block == self.num_res_blocks:
+                    skip_in = ch * in_ch_mult[i_level]
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in + skip_in,
+                        out_channels=block_out,
+                        temb_channels=self.temb_ch,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = paddle.nn.Layer()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) if len(self.up) != 0 else self.up.append(up)
+        self.norm_out = Normalize(block_in)
+        self.conv_out = paddle.nn.Conv2D(
+            in_channels=block_in,
+            out_channels=out_ch,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            padding_mode="circular",
+        )
+
+    def forward(self, x, t=None, dx=None):
+        if self._input_transform is not None:
+            x, t, dx, e = self._input_transform(x, self.config)
+        assert x.shape[2] == x.shape[3] == self.resolution
+        temb = get_timestep_embedding(t, self.ch)
+        temb = self.temb.dense[0](temb)
+        temb = nonlinearity(temb)
+        temb = self.temb.dense[1](temb)
+        x = self.conv_in(x)
+        if dx is not None:
+            cond_emb = self.emb_conv(dx)
+        else:
+            cond_emb = paddle.zeros_like(x=x)
+        x = paddle.concat(x=(x, cond_emb), axis=1)
+        hs = [self.combine_conv(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](
+                    paddle.concat(x=[h, hs.pop()], axis=1), temb
+                )
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        if self._output_transform is not None:
+            outputs = (h, e)
+            result_dict = self._output_transform(outputs)
+            return result_dict
+        return h
\ No newline at end of file