From b6a7783bf869eafbdc402aa9e24f6177d44533d6 Mon Sep 17 00:00:00 2001
From: RbRe145 <czheng12399@outlook.com>
Date: Tue, 23 Sep 2025 13:29:27 +0000
Subject: [PATCH 1/4] fix model's hash and json

---
 .../google/t5-efficient-base-kv128/graph_hash.txt                | 1 +
 .../google/t5-efficient-base-kv16/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-kv256/graph_hash.txt                | 1 +
 .../google/t5-efficient-base-kv32/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nh16/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nh24/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nh32/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nh8/graph_hash.txt                  | 1 +
 .../google/t5-efficient-base-nl24/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nl32/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nl36/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nl40/graph_hash.txt                 | 1 +
 .../google/t5-efficient-base-nl48/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-dl12/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-el2/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-el4/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-el6/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-el8/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-kv128/graph_hash.txt               | 1 +
 .../google/t5-efficient-large-kv16/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-kv256/graph_hash.txt               | 1 +
 .../google/t5-efficient-large-kv32/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-nh2/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-nh24/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-nh32/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-nh4/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-nh8-nl32/graph_hash.txt            | 1 +
 .../google/t5-efficient-large-nh8/graph_hash.txt                 | 1 +
 .../google/t5-efficient-large-nl10/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-nl16/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-nl20/graph_hash.txt                | 1 +
 .../google/t5-efficient-large-nl32/graph_hash.txt                | 1 +
 32 files changed, 32 insertions(+)
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-kv128/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-kv16/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-kv256/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-kv32/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nh16/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nh24/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nh32/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nh8/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nl24/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nl32/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nl36/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nl40/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-base-nl48/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-dl12/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-el2/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-el4/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-el6/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-el8/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-kv128/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-kv16/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-kv256/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-kv32/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nh2/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nh24/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nh32/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nh4/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nh8-nl32/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nh8/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nl10/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nl16/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nl20/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/google/t5-efficient-large-nl32/graph_hash.txt

diff --git a/samples/transformers-auto-model/google/t5-efficient-base-kv128/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-kv128/graph_hash.txt
new file mode 100644
index 000000000..cfcd27b7a
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-kv128/graph_hash.txt
@@ -0,0 +1 @@
+51e4b7c542183b28cf684e21105bd420f3c15c88c3565ed50d246c3c8f5e5278
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-kv16/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-kv16/graph_hash.txt
new file mode 100644
index 000000000..015bbc7aa
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-kv16/graph_hash.txt
@@ -0,0 +1 @@
+2c690e763e6a306e0826564467c85af74792139726c4e388a916ca301c84b54f
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-kv256/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-kv256/graph_hash.txt
new file mode 100644
index 000000000..08bbd4129
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-kv256/graph_hash.txt
@@ -0,0 +1 @@
+ac84bafb06e423018771c4f001432d61aba15f52336c9bb50e40038569759206
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-kv32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-kv32/graph_hash.txt
new file mode 100644
index 000000000..3ca9de559
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-kv32/graph_hash.txt
@@ -0,0 +1 @@
+f221f7b0376e5209452b2b70b9f298e33e6cb90433ff3ceaf6283dbc66ab5b23
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nh16/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nh16/graph_hash.txt
new file mode 100644
index 000000000..f9db31088
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nh16/graph_hash.txt
@@ -0,0 +1 @@
+6d926a6f23b9a9be70782e5b40821fad7062454d17b7a91fb0aa14175357ecf6
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nh24/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nh24/graph_hash.txt
new file mode 100644
index 000000000..407374099
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nh24/graph_hash.txt
@@ -0,0 +1 @@
+91714bc0491793a7ba200d13327fb318ffa77f210eff3d5f79906fab21ac5ebe
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nh32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nh32/graph_hash.txt
new file mode 100644
index 000000000..2d9bbd0c7
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nh32/graph_hash.txt
@@ -0,0 +1 @@
+44cd4571d13a2179afc9079932dafa5b75ceab1dc251a1529bb1b9e95be578b1
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nh8/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nh8/graph_hash.txt
new file mode 100644
index 000000000..ab3440384
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nh8/graph_hash.txt
@@ -0,0 +1 @@
+0e6cc2de7066ca6742d73e7aa2fd65c18492696bef6d992f4284839c07e3a2e7
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nl24/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nl24/graph_hash.txt
new file mode 100644
index 000000000..8a6b5d4d6
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nl24/graph_hash.txt
@@ -0,0 +1 @@
+9850b460106189e9acf25fc43ce61d9245a7d6d384edd99faeea1f2d913741ed
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nl32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nl32/graph_hash.txt
new file mode 100644
index 000000000..31e7f9cd8
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nl32/graph_hash.txt
@@ -0,0 +1 @@
+c795de753c5c6fcaaa4df739ece68037f8c577f089759c068bbe8e7610bf7e98
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nl36/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nl36/graph_hash.txt
new file mode 100644
index 000000000..a91b50894
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nl36/graph_hash.txt
@@ -0,0 +1 @@
+e71ae02a715037931ed3ae897878167e8484bbcbb63d56445ddb0e1cc525e4bb
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nl40/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nl40/graph_hash.txt
new file mode 100644
index 000000000..04d34bb04
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nl40/graph_hash.txt
@@ -0,0 +1 @@
+4aeba0ecc3f62c67dd85542b3bca4ab44bc7f78ab18bab99953394c03d881b03
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-base-nl48/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-base-nl48/graph_hash.txt
new file mode 100644
index 000000000..7ce33b90d
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-base-nl48/graph_hash.txt
@@ -0,0 +1 @@
+93514c3328a569c3227db5b50d925909e5d934efb8ccdf54324f1a0b0838ba17
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-dl12/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-dl12/graph_hash.txt
new file mode 100644
index 000000000..fab3ac478
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-dl12/graph_hash.txt
@@ -0,0 +1 @@
+059ab0fa0a3dbbfdbe6a0b800ec63351e36b56c5fc8412709f746954a87ebbf9
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-el2/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-el2/graph_hash.txt
new file mode 100644
index 000000000..d24f2f69c
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-el2/graph_hash.txt
@@ -0,0 +1 @@
+8cb732beed93c3a710b0eb430de39d9f4ad357846cfe461dc27f178d0e9d232a
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-el4/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-el4/graph_hash.txt
new file mode 100644
index 000000000..3733fd158
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-el4/graph_hash.txt
@@ -0,0 +1 @@
+1f06bbbd38a25a53a8f038d8b5f1eacc3515ad6212b8b900c22cd2da03d9a698
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-el6/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-el6/graph_hash.txt
new file mode 100644
index 000000000..6e9a6915e
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-el6/graph_hash.txt
@@ -0,0 +1 @@
+47a7e4cb097c127b69d5a17be2ae2bff7b56b37a8a56dcf60f7c553e9e078d38
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-el8/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-el8/graph_hash.txt
new file mode 100644
index 000000000..9207e79e9
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-el8/graph_hash.txt
@@ -0,0 +1 @@
+9f35c32ef0a7ee3d3ae5f84a2093ee5e14fc764d062d8943c28de2a5c8a8e4ce
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-kv128/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-kv128/graph_hash.txt
new file mode 100644
index 000000000..08668239f
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-kv128/graph_hash.txt
@@ -0,0 +1 @@
+2781951f366c05172fe709789e26d1c62272f51b20164df6ea85c3fc7427a82d
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-kv16/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-kv16/graph_hash.txt
new file mode 100644
index 000000000..3fecf8159
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-kv16/graph_hash.txt
@@ -0,0 +1 @@
+199bf77e61c0101558b5b7780bb46c4c54baab4f69eb15b6b41502fbff14afa3
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-kv256/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-kv256/graph_hash.txt
new file mode 100644
index 000000000..164a105a1
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-kv256/graph_hash.txt
@@ -0,0 +1 @@
+7d74c207c0968b2e19a31649462a08fafba36650a8ae68da8d2b0bcfba16c6b0
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-kv32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-kv32/graph_hash.txt
new file mode 100644
index 000000000..1bd7e6a6a
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-kv32/graph_hash.txt
@@ -0,0 +1 @@
+eff65c086eaa1c4084e7eefb1ae75801e3550e8ede05a95ecb99282e249846ac
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nh2/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nh2/graph_hash.txt
new file mode 100644
index 000000000..a6e70010b
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nh2/graph_hash.txt
@@ -0,0 +1 @@
+6184647728f37854e2fade80d3698a7bdf63438ef9c14fc5ea3c78324ada14af
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nh24/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nh24/graph_hash.txt
new file mode 100644
index 000000000..0a85121f1
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nh24/graph_hash.txt
@@ -0,0 +1 @@
+889e789a15be4caf0606d0fb41c544e9fdf3418dff7bf2a4f7c15f70bbc65ad0
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nh32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nh32/graph_hash.txt
new file mode 100644
index 000000000..122a7a64f
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nh32/graph_hash.txt
@@ -0,0 +1 @@
+0b955d2404a5a80f0ecd8a8ffb027d7536d5740ec9286f2a0870493643c1d6ac
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nh4/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nh4/graph_hash.txt
new file mode 100644
index 000000000..0fdecb31e
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nh4/graph_hash.txt
@@ -0,0 +1 @@
+75428df2efbff5477e120014af028467121ce02e9c51672c6881d1e3b83579fb
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nh8-nl32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nh8-nl32/graph_hash.txt
new file mode 100644
index 000000000..f6c83fa65
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nh8-nl32/graph_hash.txt
@@ -0,0 +1 @@
+89e7d8480a29d73c7e3045e43aa4dcb8f4ab41b6748ac1fd27de7547d0ac9d0c
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nh8/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nh8/graph_hash.txt
new file mode 100644
index 000000000..3e3d7fb69
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nh8/graph_hash.txt
@@ -0,0 +1 @@
+03ef8a94dcdb3d506ac7dc06077de6f83df52d9714684027f48a2f9eee00a461
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nl10/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nl10/graph_hash.txt
new file mode 100644
index 000000000..57fe2ca4f
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nl10/graph_hash.txt
@@ -0,0 +1 @@
+f154eaa8e7631a60a84cf3db2b3fb06aad6091641b8bf0b85452ed4829baed5f
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nl16/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nl16/graph_hash.txt
new file mode 100644
index 000000000..e7570e147
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nl16/graph_hash.txt
@@ -0,0 +1 @@
+2157295077252c1c0494c30d060983924261304d83e9bef7ae28e9d6caa7c7a1
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nl20/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nl20/graph_hash.txt
new file mode 100644
index 000000000..310aa50f3
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nl20/graph_hash.txt
@@ -0,0 +1 @@
+84aa982af7c3457829feb08e64577cf39221686103e65189de45c2471df26821
\ No newline at end of file
diff --git a/samples/transformers-auto-model/google/t5-efficient-large-nl32/graph_hash.txt b/samples/transformers-auto-model/google/t5-efficient-large-nl32/graph_hash.txt
new file mode 100644
index 000000000..23fec69f2
--- /dev/null
+++ b/samples/transformers-auto-model/google/t5-efficient-large-nl32/graph_hash.txt
@@ -0,0 +1 @@
+c57bbc9f8323de46604b8a6f9db7b00e9637d8b118b5a438a290cacdf0faeb34
\ No newline at end of file

From 873fa298f5ea1ec059a7c17f42d3550e93e7beea Mon Sep 17 00:00:00 2001
From: RbRe145 <czheng12399@outlook.com>
Date: Thu, 25 Sep 2025 07:41:26 +0000
Subject: [PATCH 2/4] add new bart and xlnet models

---
 graph_net/test/nlp_model_getter.py            |   47 +
 .../PaddleNLP/bart-base/graph_net.json        |    6 +
 .../PaddleNLP/bart-base/input_meta.py         |   27 +
 paddle_samples/PaddleNLP/bart-base/model.py   | 3182 +++++++
 .../PaddleNLP/bart-base/weight_meta.py        | 2847 ++++++
 .../chinese-xlnet-base/graph_net.json         |    6 +
 .../chinese-xlnet-base/input_meta.py          |   19 +
 .../PaddleNLP/chinese-xlnet-base/model.py     | 4369 +++++++++
 .../chinese-xlnet-base/weight_meta.py         | 2048 ++++
 .../chinese-xlnet-large/graph_net.json        |    6 +
 .../chinese-xlnet-large/input_meta.py         |   19 +
 .../PaddleNLP/chinese-xlnet-large/model.py    | 8389 +++++++++++++++++
 .../chinese-xlnet-large/weight_meta.py        | 4076 ++++++++
 .../chinese-xlnet-mid/graph_net.json          |    6 +
 .../PaddleNLP/chinese-xlnet-mid/input_meta.py |   19 +
 .../PaddleNLP/chinese-xlnet-mid/model.py      | 8389 +++++++++++++++++
 .../chinese-xlnet-mid/weight_meta.py          | 4076 ++++++++
 .../PaddleNLP/xlnet-base-cased/graph_net.json |    6 +
 .../PaddleNLP/xlnet-base-cased/input_meta.py  |   42 +
 .../PaddleNLP/xlnet-base-cased/model.py       | 4369 +++++++++
 .../PaddleNLP/xlnet-base-cased/weight_meta.py | 2048 ++++
 .../xlnet-large-cased/graph_net.json          |    6 +
 .../PaddleNLP/xlnet-large-cased/input_meta.py |   42 +
 .../PaddleNLP/xlnet-large-cased/model.py      | 8389 +++++++++++++++++
 .../xlnet-large-cased/weight_meta.py          | 4076 ++++++++
 25 files changed, 56509 insertions(+)
 create mode 100644 paddle_samples/PaddleNLP/bart-base/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/bart-base/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/bart-base/model.py
 create mode 100644 paddle_samples/PaddleNLP/bart-base/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-base/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-base/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-base/model.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-base/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-large/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-large/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-large/model.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-large/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-mid/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-mid/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-mid/model.py
 create mode 100644 paddle_samples/PaddleNLP/chinese-xlnet-mid/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/xlnet-base-cased/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/xlnet-base-cased/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/xlnet-base-cased/model.py
 create mode 100644 paddle_samples/PaddleNLP/xlnet-base-cased/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/xlnet-large-cased/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/xlnet-large-cased/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/xlnet-large-cased/model.py
 create mode 100644 paddle_samples/PaddleNLP/xlnet-large-cased/weight_meta.py

diff --git a/graph_net/test/nlp_model_getter.py b/graph_net/test/nlp_model_getter.py
index abebfee4d..5ce710b24 100644
--- a/graph_net/test/nlp_model_getter.py
+++ b/graph_net/test/nlp_model_getter.py
@@ -107,3 +107,50 @@ def get_skep_model_and_inputs(model_name, text, dtype):
     tokenizer = TokenizerClass.from_pretrained(model_name)
     inputs = tokenizer(text, return_tensors="pd")
     return model, inputs
+
+
+def get_bart_model_and_inputs(model_name, text, dtype):
+    from paddlenlp.transformers import BartModel, BartTokenizer
+    # Build an eval-mode pretrained BartModel plus tokenized inputs for `text`.
+    model = BartModel.from_pretrained(model_name)
+    model.eval()
+    # NOTE(review): `dtype` is never used in this function — confirm intended.
+    tokenizer = BartTokenizer.from_pretrained(model_name)
+    # Tokenize to "pd" tensors with padding and truncation capped at 512 tokens.
+    inputs = tokenizer(
+        text,
+        return_tensors="pd",
+        padding=True,
+        truncation=True,
+        max_length=512,
+    )
+    inputs.pop("token_type_ids", None)  # presumably not a BartModel input — confirm
+
+    return model, inputs
+
+
+def get_xlnet_model_and_inputs(model_name, text, dtype):
+    import paddle
+    from paddlenlp.transformers import XLNetModel, XLNetTokenizer, XLNetConfig
+    # Build an eval-mode XLNetModel plus tokenized inputs for `text`.
+    config = XLNetConfig.from_pretrained(model_name)
+    model = XLNetModel(config)  # built from config only, not from_pretrained — confirm
+    if dtype == "float16":  # any other dtype value leaves the model as built
+        model = model.astype(paddle.float16)
+    model.eval()
+
+    tokenizer = XLNetTokenizer.from_pretrained(model_name)
+
+    enc = tokenizer(
+        text,
+        return_tensors="pd",
+        padding=True,
+        truncation=True,
+        # NOTE(review): max_length is not passed (the BART getter uses 512) — confirm.
+    )
+    if "attention_mask" not in enc:  # fall back to a mask of the non-pad positions
+        input_ids = enc["input_ids"]
+        pad_id = tokenizer.pad_token_id
+        enc["attention_mask"] = (input_ids != pad_id).astype("int64")
+
+    return model, enc
diff --git a/paddle_samples/PaddleNLP/bart-base/graph_net.json b/paddle_samples/PaddleNLP/bart-base/graph_net.json
new file mode 100644
index 000000000..25c5098dc
--- /dev/null
+++ b/paddle_samples/PaddleNLP/bart-base/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "bart-base",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/bart-base/input_meta.py b/paddle_samples/PaddleNLP/bart-base/input_meta.py
new file mode 100644
index 000000000..91995ce2c
--- /dev/null
+++ b/paddle_samples/PaddleNLP/bart-base/input_meta.py
@@ -0,0 +1,27 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"  # same name as the last forward() argument in model.py
+    shape = [1, 21]  # `data` below lists 21 values
+    dtype = "int64"
+    data = [
+        0,
+        31414,
+        6,
+        127,
+        766,
+        16,
+        3045,
+        4,
+        38,
+        524,
+        2239,
+        59,
+        739,
+        2777,
+        3092,
+        8,
+        49,
+        41885,
+        4,
+        1437,
+        2,
+    ]
diff --git a/paddle_samples/PaddleNLP/bart-base/model.py b/paddle_samples/PaddleNLP/bart-base/model.py
new file mode 100644
index 000000000..50c980186
--- /dev/null
+++ b/paddle_samples/PaddleNLP/bart-base/model.py
@@ -0,0 +1,3182 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        parameter_131,
+        parameter_132,
+        parameter_133,
+        parameter_134,
+        parameter_135,
+        parameter_136,
+        parameter_137,
+        parameter_138,
+        parameter_139,
+        parameter_140,
+        parameter_141,
+        parameter_142,
+        parameter_143,
+        parameter_144,
+        parameter_145,
+        parameter_146,
+        parameter_147,
+        parameter_148,
+        parameter_149,
+        parameter_150,
+        parameter_151,
+        parameter_152,
+        parameter_153,
+        parameter_154,
+        parameter_155,
+        parameter_156,
+        parameter_157,
+        parameter_158,
+        parameter_159,
+        parameter_160,
+        parameter_161,
+        parameter_162,
+        parameter_163,
+        parameter_164,
+        parameter_165,
+        parameter_166,
+        parameter_167,
+        parameter_168,
+        parameter_169,
+        parameter_170,
+        parameter_171,
+        parameter_172,
+        parameter_173,
+        parameter_174,
+        parameter_175,
+        parameter_176,
+        parameter_177,
+        parameter_178,
+        parameter_179,
+        parameter_180,
+        parameter_181,
+        parameter_182,
+        parameter_183,
+        parameter_184,
+        parameter_185,
+        parameter_186,
+        parameter_187,
+        parameter_188,
+        parameter_189,
+        parameter_190,
+        parameter_191,
+        parameter_192,
+        parameter_193,
+        parameter_194,
+        parameter_195,
+        parameter_196,
+        parameter_197,
+        parameter_198,
+        parameter_199,
+        parameter_200,
+        parameter_201,
+        parameter_202,
+        parameter_203,
+        parameter_204,
+        parameter_205,
+        parameter_206,
+        parameter_207,
+        parameter_208,
+        parameter_209,
+        parameter_210,
+        parameter_211,
+        parameter_212,
+        parameter_213,
+        parameter_214,
+        parameter_215,
+        parameter_216,
+        parameter_217,
+        parameter_218,
+        parameter_219,
+        parameter_220,
+        parameter_221,
+        parameter_222,
+        parameter_223,
+        parameter_224,
+        parameter_225,
+        parameter_226,
+        parameter_227,
+        parameter_228,
+        parameter_229,
+        parameter_230,
+        parameter_231,
+        parameter_232,
+        parameter_233,
+        parameter_234,
+        parameter_235,
+        parameter_236,
+        parameter_237,
+        parameter_238,
+        parameter_239,
+        parameter_240,
+        parameter_241,
+        parameter_242,
+        parameter_243,
+        parameter_244,
+        parameter_245,
+        parameter_246,
+        parameter_247,
+        parameter_248,
+        parameter_249,
+        parameter_250,
+        parameter_251,
+        parameter_252,
+        parameter_253,
+        parameter_254,
+        parameter_255,
+        parameter_256,
+        parameter_257,
+        parameter_258,
+        data_0,
+    ):
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("0"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full_like: (1x21xi64) <- (1x21xi64, 1xf32)
+        full_like_0 = paddle._C_ops.full_like(
+            data_0, full_0, paddle.int64, paddle.framework._current_expected_place()
+        )
+        del full_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [0]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.slice: (1x20xi64) <- (1x21xi64, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            data_0, [1], full_int_array_0, full_int_array_1, [1], []
+        )
+
+        # pd_op.assign: (1x20xi64) <- (1x20xi64)
+        assign_0 = slice_0
+        del slice_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_2 = [1]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [2147483647]
+
+        # pd_op.set_value_with_tensor_: (1x21xi64) <- (1x21xi64, 1x20xi64, 1xi64, 1xi64, 1xi64)
+        set_value_with_tensor__0 = paddle._C_ops.set_value_with_tensor_(
+            full_like_0,
+            assign_0,
+            full_int_array_2,
+            full_int_array_3,
+            full_int_array_2,
+            [1],
+            [],
+            [],
+        )
+        del assign_0, full_like_0
+
+        # pd_op.set_value_: (1x21xi64) <- (1x21xi64, 1xi64, 1xi64, 1xi64)
+        set_value__0 = paddle._C_ops.set_value_(
+            set_value_with_tensor__0,
+            full_int_array_0,
+            full_int_array_2,
+            full_int_array_2,
+            [1],
+            [1],
+            [],
+            [1],
+            [float("2")],
+        )
+        del full_int_array_0, full_int_array_2, set_value_with_tensor__0
+
+        # pd_op.full: (xi64) <- ()
+        full_1 = paddle._C_ops.full(
+            [], float("1"), paddle.int64, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.equal: (1x21xb) <- (1x21xi64, xi64)
+        equal_0 = paddle._C_ops.equal(data_0, full_1)
+        del full_1
+
+        # pd_op.cast: (1x21xf32) <- (1x21xb)
+        cast_0 = paddle._C_ops.cast(equal_0, paddle.float32)
+        del equal_0
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_4 = [1, 2]
+
+        # pd_op.unsqueeze: (1x1x1x21xf32) <- (1x21xf32, 2xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(cast_0, full_int_array_4)
+        del cast_0, full_int_array_4
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x21xf32) <- (1x1x1x21xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(unsqueeze_0, full_2, float("0"), True)
+        del full_2, unsqueeze_0
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_5 = [-1, 21]
+
+        # pd_op.reshape: (1x21xi64) <- (1x21xi64, 2xi64)
+        reshape_0 = paddle._C_ops.reshape(data_0, full_int_array_5)
+        del data_0
+
+        # pd_op.embedding: (1x21x768xf32) <- (1x21xi64, 50265x768xf32)
+        embedding_0 = paddle._C_ops.embedding(reshape_0, parameter_258, -1, False)
+        del reshape_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x21x768xf32) <- (1x21x768xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(embedding_0, full_3, float("0"), True)
+        del embedding_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("21"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (21xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_4, full_5, full_6, dtype="int64")
+        del full_4, full_5, full_6
+
+        # pd_op.scale: (21xi64) <- (21xi64, 1xf32)
+        scale_2 = paddle._C_ops.scale(arange_0, full_3, float("2"), True)
+        del arange_0
+
+        # pd_op.embedding: (21x768xf32) <- (21xi64, 1026x768xf32)
+        embedding_1 = paddle._C_ops.embedding(scale_2, parameter_257, -1, False)
+        del parameter_257
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 21x768xf32)
+        add_0 = paddle._C_ops.add(scale_1, embedding_1)
+        del embedding_1, scale_1
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_1, layer_norm_2, layer_norm_3 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_0, parameter_256, parameter_255, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_0, parameter_255, parameter_256
+
+        # pd_op.full: (1xf32) <- ()
+        full_7 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_1, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del layer_norm_1
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_254, False, False)
+        del parameter_254
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_1 = paddle._C_ops.add(matmul_0, parameter_253)
+        del matmul_0, parameter_253
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_6 = [0, 0, 12, 64]
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(add_1, full_int_array_6)
+        del add_1
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(dropout_0, parameter_252, False, False)
+        del parameter_252
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_2 = paddle._C_ops.add(matmul_1, parameter_251)
+        del matmul_1, parameter_251
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(dropout_0, parameter_250, False, False)
+        del parameter_250
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_3 = paddle._C_ops.add(matmul_2, parameter_249)
+        del matmul_2, parameter_249
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(add_2, full_int_array_6)
+        del add_2
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_3 = paddle._C_ops.reshape(add_3, full_int_array_6)
+        del add_3
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_3, [0, 2, 1, 3])
+        del reshape_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(transpose_0, full_8, float("0"), True)
+        del transpose_0
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_3 = paddle._C_ops.matmul(scale_3, transpose_1, False, True)
+        del scale_3, transpose_1
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_4 = paddle._C_ops.add(matmul_3, scale_0)
+        del matmul_3
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_0 = paddle._C_ops.softmax(add_4, -1)
+        del add_4
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_4 = paddle._C_ops.matmul(dropout_2, transpose_2, False, False)
+        del dropout_2, transpose_2
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_3 = paddle._C_ops.transpose(matmul_4, [0, 2, 1, 3])
+        del matmul_4
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_7 = [0, 0, 768]
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_4 = paddle._C_ops.reshape(transpose_3, full_int_array_7)
+        del transpose_3
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_5 = paddle._C_ops.matmul(reshape_4, parameter_248, False, False)
+        del parameter_248, reshape_4
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_5 = paddle._C_ops.add(matmul_5, parameter_247)
+        del matmul_5, parameter_247
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_5, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_5
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_6 = paddle._C_ops.add(dropout_0, dropout_4)
+        del dropout_0, dropout_4
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_4, layer_norm_5, layer_norm_6 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_6, parameter_242, parameter_241, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_6, parameter_241, parameter_242
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_6 = paddle._C_ops.matmul(layer_norm_4, parameter_246, False, False)
+        del parameter_246
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_7 = paddle._C_ops.add(matmul_6, parameter_245)
+        del matmul_6, parameter_245
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_0 = paddle._C_ops.gelu(add_7, False)
+        del add_7
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_0, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_0
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_7 = paddle._C_ops.matmul(dropout_6, parameter_244, False, False)
+        del dropout_6, parameter_244
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_8 = paddle._C_ops.add(matmul_7, parameter_243)
+        del matmul_7, parameter_243
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_8, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_8
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_9 = paddle._C_ops.add(layer_norm_4, dropout_8)
+        del dropout_8, layer_norm_4
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_7, layer_norm_8, layer_norm_9 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_9, parameter_240, parameter_239, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_9, parameter_239, parameter_240
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_8 = paddle._C_ops.matmul(layer_norm_7, parameter_238, False, False)
+        del parameter_238
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_10 = paddle._C_ops.add(matmul_8, parameter_237)
+        del matmul_8, parameter_237
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(add_10, full_int_array_6)
+        del add_10
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_4 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(layer_norm_7, parameter_236, False, False)
+        del parameter_236
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_11 = paddle._C_ops.add(matmul_9, parameter_235)
+        del matmul_9, parameter_235
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_7, parameter_234, False, False)
+        del parameter_234
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_12 = paddle._C_ops.add(matmul_10, parameter_233)
+        del matmul_10, parameter_233
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(add_11, full_int_array_6)
+        del add_11
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_7 = paddle._C_ops.reshape(add_12, full_int_array_6)
+        del add_12
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_7, [0, 2, 1, 3])
+        del reshape_7
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(transpose_4, full_8, float("0"), True)
+        del transpose_4
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_11 = paddle._C_ops.matmul(scale_4, transpose_5, False, True)
+        del scale_4, transpose_5
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_13 = paddle._C_ops.add(matmul_11, scale_0)
+        del matmul_11
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_1 = paddle._C_ops.softmax(add_13, -1)
+        del add_13
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_12 = paddle._C_ops.matmul(dropout_10, transpose_6, False, False)
+        del dropout_10, transpose_6
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_7 = paddle._C_ops.transpose(matmul_12, [0, 2, 1, 3])
+        del matmul_12
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_8 = paddle._C_ops.reshape(transpose_7, full_int_array_7)
+        del transpose_7
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_13 = paddle._C_ops.matmul(reshape_8, parameter_232, False, False)
+        del parameter_232, reshape_8
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_14 = paddle._C_ops.add(matmul_13, parameter_231)
+        del matmul_13, parameter_231
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_14, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_14
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_15 = paddle._C_ops.add(layer_norm_7, dropout_12)
+        del dropout_12, layer_norm_7
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_10, layer_norm_11, layer_norm_12 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_15, parameter_226, parameter_225, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_15, parameter_225, parameter_226
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_14 = paddle._C_ops.matmul(layer_norm_10, parameter_230, False, False)
+        del parameter_230
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_16 = paddle._C_ops.add(matmul_14, parameter_229)
+        del matmul_14, parameter_229
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_1 = paddle._C_ops.gelu(add_16, False)
+        del add_16
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_1, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_1
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_15 = paddle._C_ops.matmul(dropout_14, parameter_228, False, False)
+        del dropout_14, parameter_228
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_17 = paddle._C_ops.add(matmul_15, parameter_227)
+        del matmul_15, parameter_227
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_17, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_17
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_18 = paddle._C_ops.add(layer_norm_10, dropout_16)
+        del dropout_16, layer_norm_10
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_13, layer_norm_14, layer_norm_15 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_224, parameter_223, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18, parameter_223, parameter_224
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_16 = paddle._C_ops.matmul(layer_norm_13, parameter_222, False, False)
+        del parameter_222
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_19 = paddle._C_ops.add(matmul_16, parameter_221)
+        del matmul_16, parameter_221
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(add_19, full_int_array_6)
+        del add_19
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_8 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_17 = paddle._C_ops.matmul(layer_norm_13, parameter_220, False, False)
+        del parameter_220
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_20 = paddle._C_ops.add(matmul_17, parameter_219)
+        del matmul_17, parameter_219
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_13, parameter_218, False, False)
+        del parameter_218
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_21 = paddle._C_ops.add(matmul_18, parameter_217)
+        del matmul_18, parameter_217
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(add_20, full_int_array_6)
+        del add_20
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_11 = paddle._C_ops.reshape(add_21, full_int_array_6)
+        del add_21
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_11, [0, 2, 1, 3])
+        del reshape_11
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(transpose_8, full_8, float("0"), True)
+        del transpose_8
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_19 = paddle._C_ops.matmul(scale_5, transpose_9, False, True)
+        del scale_5, transpose_9
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_22 = paddle._C_ops.add(matmul_19, scale_0)
+        del matmul_19
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_2 = paddle._C_ops.softmax(add_22, -1)
+        del add_22
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_20 = paddle._C_ops.matmul(dropout_18, transpose_10, False, False)
+        del dropout_18, transpose_10
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_11 = paddle._C_ops.transpose(matmul_20, [0, 2, 1, 3])
+        del matmul_20
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_12 = paddle._C_ops.reshape(transpose_11, full_int_array_7)
+        del transpose_11
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_21 = paddle._C_ops.matmul(reshape_12, parameter_216, False, False)
+        del parameter_216, reshape_12
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_23 = paddle._C_ops.add(matmul_21, parameter_215)
+        del matmul_21, parameter_215
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_23, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_23
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_24 = paddle._C_ops.add(layer_norm_13, dropout_20)
+        del dropout_20, layer_norm_13
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_16, layer_norm_17, layer_norm_18 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_24, parameter_210, parameter_209, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_24, parameter_209, parameter_210
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_22 = paddle._C_ops.matmul(layer_norm_16, parameter_214, False, False)
+        del parameter_214
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_25 = paddle._C_ops.add(matmul_22, parameter_213)
+        del matmul_22, parameter_213
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_2 = paddle._C_ops.gelu(add_25, False)
+        del add_25
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_2, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_2
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_23 = paddle._C_ops.matmul(dropout_22, parameter_212, False, False)
+        del dropout_22, parameter_212
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_26 = paddle._C_ops.add(matmul_23, parameter_211)
+        del matmul_23, parameter_211
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_26, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_26
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_27 = paddle._C_ops.add(layer_norm_16, dropout_24)
+        del dropout_24, layer_norm_16
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_19, layer_norm_20, layer_norm_21 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_27, parameter_208, parameter_207, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_27, parameter_207, parameter_208
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_24 = paddle._C_ops.matmul(layer_norm_19, parameter_206, False, False)
+        del parameter_206
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_28 = paddle._C_ops.add(matmul_24, parameter_205)
+        del matmul_24, parameter_205
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(add_28, full_int_array_6)
+        del add_28
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_12 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_19, parameter_204, False, False)
+        del parameter_204
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_29 = paddle._C_ops.add(matmul_25, parameter_203)
+        del matmul_25, parameter_203
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_19, parameter_202, False, False)
+        del parameter_202
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_30 = paddle._C_ops.add(matmul_26, parameter_201)
+        del matmul_26, parameter_201
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(add_29, full_int_array_6)
+        del add_29
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_15 = paddle._C_ops.reshape(add_30, full_int_array_6)
+        del add_30
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_15, [0, 2, 1, 3])
+        del reshape_15
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(transpose_12, full_8, float("0"), True)
+        del transpose_12
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_27 = paddle._C_ops.matmul(scale_6, transpose_13, False, True)
+        del scale_6, transpose_13
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_31 = paddle._C_ops.add(matmul_27, scale_0)
+        del matmul_27
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_3 = paddle._C_ops.softmax(add_31, -1)
+        del add_31
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_28 = paddle._C_ops.matmul(dropout_26, transpose_14, False, False)
+        del dropout_26, transpose_14
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_15 = paddle._C_ops.transpose(matmul_28, [0, 2, 1, 3])
+        del matmul_28
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_16 = paddle._C_ops.reshape(transpose_15, full_int_array_7)
+        del transpose_15
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_29 = paddle._C_ops.matmul(reshape_16, parameter_200, False, False)
+        del parameter_200, reshape_16
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_32 = paddle._C_ops.add(matmul_29, parameter_199)
+        del matmul_29, parameter_199
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_32, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_32
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_33 = paddle._C_ops.add(layer_norm_19, dropout_28)
+        del dropout_28, layer_norm_19
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_22, layer_norm_23, layer_norm_24 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_33, parameter_194, parameter_193, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_33, parameter_193, parameter_194
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_30 = paddle._C_ops.matmul(layer_norm_22, parameter_198, False, False)
+        del parameter_198
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_34 = paddle._C_ops.add(matmul_30, parameter_197)
+        del matmul_30, parameter_197
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_3 = paddle._C_ops.gelu(add_34, False)
+        del add_34
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_3, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_3
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_31 = paddle._C_ops.matmul(dropout_30, parameter_196, False, False)
+        del dropout_30, parameter_196
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_35 = paddle._C_ops.add(matmul_31, parameter_195)
+        del matmul_31, parameter_195
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_35, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_35
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_36 = paddle._C_ops.add(layer_norm_22, dropout_32)
+        del dropout_32, layer_norm_22
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_25, layer_norm_26, layer_norm_27 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_36, parameter_192, parameter_191, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_36, parameter_191, parameter_192
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_32 = paddle._C_ops.matmul(layer_norm_25, parameter_190, False, False)
+        del parameter_190
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_37 = paddle._C_ops.add(matmul_32, parameter_189)
+        del matmul_32, parameter_189
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(add_37, full_int_array_6)
+        del add_37
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_16 = paddle._C_ops.transpose(reshape_17, [0, 2, 1, 3])
+        del reshape_17
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(layer_norm_25, parameter_188, False, False)
+        del parameter_188
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_38 = paddle._C_ops.add(matmul_33, parameter_187)
+        del matmul_33, parameter_187
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_25, parameter_186, False, False)
+        del parameter_186
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_39 = paddle._C_ops.add(matmul_34, parameter_185)
+        del matmul_34, parameter_185
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(add_38, full_int_array_6)
+        del add_38
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_17 = paddle._C_ops.transpose(reshape_18, [0, 2, 1, 3])
+        del reshape_18
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_19 = paddle._C_ops.reshape(add_39, full_int_array_6)
+        del add_39
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_18 = paddle._C_ops.transpose(reshape_19, [0, 2, 1, 3])
+        del reshape_19
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(transpose_16, full_8, float("0"), True)
+        del transpose_16
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_35 = paddle._C_ops.matmul(scale_7, transpose_17, False, True)
+        del scale_7, transpose_17
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_40 = paddle._C_ops.add(matmul_35, scale_0)
+        del matmul_35
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_4 = paddle._C_ops.softmax(add_40, -1)
+        del add_40
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_36 = paddle._C_ops.matmul(dropout_34, transpose_18, False, False)
+        del dropout_34, transpose_18
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_19 = paddle._C_ops.transpose(matmul_36, [0, 2, 1, 3])
+        del matmul_36
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_20 = paddle._C_ops.reshape(transpose_19, full_int_array_7)
+        del transpose_19
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_37 = paddle._C_ops.matmul(reshape_20, parameter_184, False, False)
+        del parameter_184, reshape_20
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_41 = paddle._C_ops.add(matmul_37, parameter_183)
+        del matmul_37, parameter_183
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_41, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_41
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_42 = paddle._C_ops.add(layer_norm_25, dropout_36)
+        del dropout_36, layer_norm_25
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_28, layer_norm_29, layer_norm_30 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_178, parameter_177, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42, parameter_177, parameter_178
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_38 = paddle._C_ops.matmul(layer_norm_28, parameter_182, False, False)
+        del parameter_182
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_43 = paddle._C_ops.add(matmul_38, parameter_181)
+        del matmul_38, parameter_181
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_4 = paddle._C_ops.gelu(add_43, False)
+        del add_43
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_4, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_4
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_39 = paddle._C_ops.matmul(dropout_38, parameter_180, False, False)
+        del dropout_38, parameter_180
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_44 = paddle._C_ops.add(matmul_39, parameter_179)
+        del matmul_39, parameter_179
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_44, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_44
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_45 = paddle._C_ops.add(layer_norm_28, dropout_40)
+        del dropout_40, layer_norm_28
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_31, layer_norm_32, layer_norm_33 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_45, parameter_176, parameter_175, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_45, parameter_175, parameter_176
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_40 = paddle._C_ops.matmul(layer_norm_31, parameter_174, False, False)
+        del parameter_174
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_46 = paddle._C_ops.add(matmul_40, parameter_173)
+        del matmul_40, parameter_173
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(add_46, full_int_array_6)
+        del add_46
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_20 = paddle._C_ops.transpose(reshape_21, [0, 2, 1, 3])
+        del reshape_21
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_41 = paddle._C_ops.matmul(layer_norm_31, parameter_172, False, False)
+        del parameter_172
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_47 = paddle._C_ops.add(matmul_41, parameter_171)
+        del matmul_41, parameter_171
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_31, parameter_170, False, False)
+        del parameter_170
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_48 = paddle._C_ops.add(matmul_42, parameter_169)
+        del matmul_42, parameter_169
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(add_47, full_int_array_6)
+        del add_47
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_21 = paddle._C_ops.transpose(reshape_22, [0, 2, 1, 3])
+        del reshape_22
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_23 = paddle._C_ops.reshape(add_48, full_int_array_6)
+        del add_48
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_22 = paddle._C_ops.transpose(reshape_23, [0, 2, 1, 3])
+        del reshape_23
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(transpose_20, full_8, float("0"), True)
+        del transpose_20
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_43 = paddle._C_ops.matmul(scale_8, transpose_21, False, True)
+        del scale_8, transpose_21
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_49 = paddle._C_ops.add(matmul_43, scale_0)
+        del matmul_43
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_5 = paddle._C_ops.softmax(add_49, -1)
+        del add_49
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_44 = paddle._C_ops.matmul(dropout_42, transpose_22, False, False)
+        del dropout_42, transpose_22
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_23 = paddle._C_ops.transpose(matmul_44, [0, 2, 1, 3])
+        del matmul_44
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_24 = paddle._C_ops.reshape(transpose_23, full_int_array_7)
+        del transpose_23
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_45 = paddle._C_ops.matmul(reshape_24, parameter_168, False, False)
+        del parameter_168, reshape_24
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_50 = paddle._C_ops.add(matmul_45, parameter_167)
+        del matmul_45, parameter_167
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_50, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_50
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_51 = paddle._C_ops.add(layer_norm_31, dropout_44)
+        del dropout_44, layer_norm_31
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_34, layer_norm_35, layer_norm_36 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_51, parameter_162, parameter_161, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_51, parameter_161, parameter_162
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_46 = paddle._C_ops.matmul(layer_norm_34, parameter_166, False, False)
+        del parameter_166
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_52 = paddle._C_ops.add(matmul_46, parameter_165)
+        del matmul_46, parameter_165
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_5 = paddle._C_ops.gelu(add_52, False)
+        del add_52
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_5, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_5
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_47 = paddle._C_ops.matmul(dropout_46, parameter_164, False, False)
+        del dropout_46, parameter_164
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_53 = paddle._C_ops.add(matmul_47, parameter_163)
+        del matmul_47, parameter_163
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_53, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_53
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_54 = paddle._C_ops.add(layer_norm_34, dropout_48)
+        del dropout_48, layer_norm_34
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_37, layer_norm_38, layer_norm_39 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_54, parameter_160, parameter_159, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_54, parameter_159, parameter_160
+
+        # pd_op.slice: (1x1x1x21xf32) <- (1x1x1x21xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            scale_0, [2], full_int_array_1, full_int_array_3, [1], []
+        )
+        del full_int_array_1, full_int_array_3, scale_0
+
+        # pd_op.reshape: (1x21xi64) <- (1x21xi64, 2xi64)
+        reshape_25 = paddle._C_ops.reshape(set_value__0, full_int_array_5)
+        del full_int_array_5, set_value__0
+
+        # pd_op.full: (21x21xf32) <- ()
+        full_9 = paddle._C_ops.full(
+            [21, 21],
+            float("-inf"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.triu: (21x21xf32) <- (21x21xf32)
+        triu_0 = paddle._C_ops.triu(full_9, 1)
+        del full_9
+
+        # pd_op.embedding: (1x21x768xf32) <- (1x21xi64, 50265x768xf32)
+        embedding_2 = paddle._C_ops.embedding(reshape_25, parameter_258, -1, False)
+        del parameter_258, reshape_25
+
+        # pd_op.scale: (1x21x768xf32) <- (1x21x768xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(embedding_2, full_3, float("0"), True)
+        del embedding_2, full_3
+
+        # pd_op.embedding: (21x768xf32) <- (21xi64, 1026x768xf32)
+        embedding_3 = paddle._C_ops.embedding(scale_2, parameter_158, -1, False)
+        del parameter_158, scale_2
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 21x768xf32)
+        add_55 = paddle._C_ops.add(scale_9, embedding_3)
+        del embedding_3, scale_9
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_40, layer_norm_41, layer_norm_42 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_55, parameter_157, parameter_156, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_55, parameter_156, parameter_157
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_40, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del layer_norm_40
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_48 = paddle._C_ops.matmul(dropout_50, parameter_155, False, False)
+        del parameter_155
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_56 = paddle._C_ops.add(matmul_48, parameter_154)
+        del matmul_48, parameter_154
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(add_56, full_int_array_6)
+        del add_56
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_24 = paddle._C_ops.transpose(reshape_26, [0, 2, 1, 3])
+        del reshape_26
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(dropout_50, parameter_153, False, False)
+        del parameter_153
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_57 = paddle._C_ops.add(matmul_49, parameter_152)
+        del matmul_49, parameter_152
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(dropout_50, parameter_151, False, False)
+        del parameter_151
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_58 = paddle._C_ops.add(matmul_50, parameter_150)
+        del matmul_50, parameter_150
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_27 = paddle._C_ops.reshape(add_57, full_int_array_6)
+        del add_57
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_25 = paddle._C_ops.transpose(reshape_27, [0, 2, 1, 3])
+        del reshape_27
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(add_58, full_int_array_6)
+        del add_58
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_26 = paddle._C_ops.transpose(reshape_28, [0, 2, 1, 3])
+        del reshape_28
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(transpose_24, full_8, float("0"), True)
+        del transpose_24
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_51 = paddle._C_ops.matmul(scale_10, transpose_25, False, True)
+        del scale_10, transpose_25
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 21x21xf32)
+        add_59 = paddle._C_ops.add(matmul_51, triu_0)
+        del matmul_51
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_6 = paddle._C_ops.softmax(add_59, -1)
+        del add_59
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_52 = paddle._C_ops.matmul(dropout_52, transpose_26, False, False)
+        del dropout_52, transpose_26
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_27 = paddle._C_ops.transpose(matmul_52, [0, 2, 1, 3])
+        del matmul_52
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_29 = paddle._C_ops.reshape(transpose_27, full_int_array_7)
+        del transpose_27
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_53 = paddle._C_ops.matmul(reshape_29, parameter_149, False, False)
+        del parameter_149, reshape_29
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_60 = paddle._C_ops.add(matmul_53, parameter_148)
+        del matmul_53, parameter_148
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_60, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_60
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_61 = paddle._C_ops.add(dropout_50, dropout_54)
+        del dropout_50, dropout_54
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_43, layer_norm_44, layer_norm_45 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_61, parameter_135, parameter_134, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_61, parameter_134, parameter_135
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(layer_norm_43, parameter_147, False, False)
+        del parameter_147
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_62 = paddle._C_ops.add(matmul_54, parameter_146)
+        del matmul_54, parameter_146
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(add_62, full_int_array_6)
+        del add_62
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_28 = paddle._C_ops.transpose(reshape_30, [0, 2, 1, 3])
+        del reshape_30
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_37, parameter_145, False, False)
+        del parameter_145
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_63 = paddle._C_ops.add(matmul_55, parameter_144)
+        del matmul_55, parameter_144
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_56 = paddle._C_ops.matmul(layer_norm_37, parameter_143, False, False)
+        del parameter_143
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_64 = paddle._C_ops.add(matmul_56, parameter_142)
+        del matmul_56, parameter_142
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_31 = paddle._C_ops.reshape(add_63, full_int_array_6)
+        del add_63
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_29 = paddle._C_ops.transpose(reshape_31, [0, 2, 1, 3])
+        del reshape_31
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(add_64, full_int_array_6)
+        del add_64
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_30 = paddle._C_ops.transpose(reshape_32, [0, 2, 1, 3])
+        del reshape_32
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(transpose_28, full_8, float("0"), True)
+        del transpose_28
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_57 = paddle._C_ops.matmul(scale_11, transpose_29, False, True)
+        del scale_11, transpose_29
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_65 = paddle._C_ops.add(matmul_57, slice_1)
+        del matmul_57
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_7 = paddle._C_ops.softmax(add_65, -1)
+        del add_65
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_58 = paddle._C_ops.matmul(dropout_56, transpose_30, False, False)
+        del dropout_56, transpose_30
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_31 = paddle._C_ops.transpose(matmul_58, [0, 2, 1, 3])
+        del matmul_58
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_33 = paddle._C_ops.reshape(transpose_31, full_int_array_7)
+        del transpose_31
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_59 = paddle._C_ops.matmul(reshape_33, parameter_141, False, False)
+        del parameter_141, reshape_33
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_66 = paddle._C_ops.add(matmul_59, parameter_140)
+        del matmul_59, parameter_140
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_66, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_66
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_67 = paddle._C_ops.add(layer_norm_43, dropout_58)
+        del dropout_58, layer_norm_43
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_46, layer_norm_47, layer_norm_48 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_67, parameter_133, parameter_132, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_67, parameter_132, parameter_133
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_60 = paddle._C_ops.matmul(layer_norm_46, parameter_139, False, False)
+        del parameter_139
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_68 = paddle._C_ops.add(matmul_60, parameter_138)
+        del matmul_60, parameter_138
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_6 = paddle._C_ops.gelu(add_68, False)
+        del add_68
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_6, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_6
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_61 = paddle._C_ops.matmul(dropout_60, parameter_137, False, False)
+        del dropout_60, parameter_137
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_69 = paddle._C_ops.add(matmul_61, parameter_136)
+        del matmul_61, parameter_136
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_69, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_69
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_70 = paddle._C_ops.add(layer_norm_46, dropout_62)
+        del dropout_62, layer_norm_46
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_49, layer_norm_50, layer_norm_51 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_70, parameter_131, parameter_130, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_70, parameter_130, parameter_131
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(layer_norm_49, parameter_129, False, False)
+        del parameter_129
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_71 = paddle._C_ops.add(matmul_62, parameter_128)
+        del matmul_62, parameter_128
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_34 = paddle._C_ops.reshape(add_71, full_int_array_6)
+        del add_71
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_32 = paddle._C_ops.transpose(reshape_34, [0, 2, 1, 3])
+        del reshape_34
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_63 = paddle._C_ops.matmul(layer_norm_49, parameter_127, False, False)
+        del parameter_127
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_72 = paddle._C_ops.add(matmul_63, parameter_126)
+        del matmul_63, parameter_126
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_64 = paddle._C_ops.matmul(layer_norm_49, parameter_125, False, False)
+        del parameter_125
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_73 = paddle._C_ops.add(matmul_64, parameter_124)
+        del matmul_64, parameter_124
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_35 = paddle._C_ops.reshape(add_72, full_int_array_6)
+        del add_72
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_33 = paddle._C_ops.transpose(reshape_35, [0, 2, 1, 3])
+        del reshape_35
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(add_73, full_int_array_6)
+        del add_73
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_34 = paddle._C_ops.transpose(reshape_36, [0, 2, 1, 3])
+        del reshape_36
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(transpose_32, full_8, float("0"), True)
+        del transpose_32
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_65 = paddle._C_ops.matmul(scale_12, transpose_33, False, True)
+        del scale_12, transpose_33
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 21x21xf32)
+        add_74 = paddle._C_ops.add(matmul_65, triu_0)
+        del matmul_65
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_8 = paddle._C_ops.softmax(add_74, -1)
+        del add_74
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_66 = paddle._C_ops.matmul(dropout_64, transpose_34, False, False)
+        del dropout_64, transpose_34
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_35 = paddle._C_ops.transpose(matmul_66, [0, 2, 1, 3])
+        del matmul_66
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_37 = paddle._C_ops.reshape(transpose_35, full_int_array_7)
+        del transpose_35
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(reshape_37, parameter_123, False, False)
+        del parameter_123, reshape_37
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_75 = paddle._C_ops.add(matmul_67, parameter_122)
+        del matmul_67, parameter_122
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_75, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_75
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_76 = paddle._C_ops.add(layer_norm_49, dropout_66)
+        del dropout_66, layer_norm_49
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_52, layer_norm_53, layer_norm_54 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_76, parameter_109, parameter_108, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_76, parameter_108, parameter_109
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_68 = paddle._C_ops.matmul(layer_norm_52, parameter_121, False, False)
+        del parameter_121
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_77 = paddle._C_ops.add(matmul_68, parameter_120)
+        del matmul_68, parameter_120
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(add_77, full_int_array_6)
+        del add_77
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_36 = paddle._C_ops.transpose(reshape_38, [0, 2, 1, 3])
+        del reshape_38
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_69 = paddle._C_ops.matmul(layer_norm_37, parameter_119, False, False)
+        del parameter_119
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_78 = paddle._C_ops.add(matmul_69, parameter_118)
+        del matmul_69, parameter_118
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_70 = paddle._C_ops.matmul(layer_norm_37, parameter_117, False, False)
+        del parameter_117
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_79 = paddle._C_ops.add(matmul_70, parameter_116)
+        del matmul_70, parameter_116
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_39 = paddle._C_ops.reshape(add_78, full_int_array_6)
+        del add_78
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_37 = paddle._C_ops.transpose(reshape_39, [0, 2, 1, 3])
+        del reshape_39
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(add_79, full_int_array_6)
+        del add_79
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_38 = paddle._C_ops.transpose(reshape_40, [0, 2, 1, 3])
+        del reshape_40
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(transpose_36, full_8, float("0"), True)
+        del transpose_36
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_71 = paddle._C_ops.matmul(scale_13, transpose_37, False, True)
+        del scale_13, transpose_37
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_80 = paddle._C_ops.add(matmul_71, slice_1)
+        del matmul_71
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_9 = paddle._C_ops.softmax(add_80, -1)
+        del add_80
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_72 = paddle._C_ops.matmul(dropout_68, transpose_38, False, False)
+        del dropout_68, transpose_38
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_39 = paddle._C_ops.transpose(matmul_72, [0, 2, 1, 3])
+        del matmul_72
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_41 = paddle._C_ops.reshape(transpose_39, full_int_array_7)
+        del transpose_39
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_73 = paddle._C_ops.matmul(reshape_41, parameter_115, False, False)
+        del parameter_115, reshape_41
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_81 = paddle._C_ops.add(matmul_73, parameter_114)
+        del matmul_73, parameter_114
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_81, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_81
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_82 = paddle._C_ops.add(layer_norm_52, dropout_70)
+        del dropout_70, layer_norm_52
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_55, layer_norm_56, layer_norm_57 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_82, parameter_107, parameter_106, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_82, parameter_106, parameter_107
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_55, parameter_113, False, False)
+        del parameter_113
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_83 = paddle._C_ops.add(matmul_74, parameter_112)
+        del matmul_74, parameter_112
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_7 = paddle._C_ops.gelu(add_83, False)
+        del add_83
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_7, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_7
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_75 = paddle._C_ops.matmul(dropout_72, parameter_111, False, False)
+        del dropout_72, parameter_111
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_84 = paddle._C_ops.add(matmul_75, parameter_110)
+        del matmul_75, parameter_110
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_84, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_84
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_85 = paddle._C_ops.add(layer_norm_55, dropout_74)
+        del dropout_74, layer_norm_55
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_58, layer_norm_59, layer_norm_60 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_85, parameter_105, parameter_104, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_85, parameter_104, parameter_105
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_76 = paddle._C_ops.matmul(layer_norm_58, parameter_103, False, False)
+        del parameter_103
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_86 = paddle._C_ops.add(matmul_76, parameter_102)
+        del matmul_76, parameter_102
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(add_86, full_int_array_6)
+        del add_86
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_40 = paddle._C_ops.transpose(reshape_42, [0, 2, 1, 3])
+        del reshape_42
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_77 = paddle._C_ops.matmul(layer_norm_58, parameter_101, False, False)
+        del parameter_101
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_87 = paddle._C_ops.add(matmul_77, parameter_100)
+        del matmul_77, parameter_100
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_78 = paddle._C_ops.matmul(layer_norm_58, parameter_99, False, False)
+        del parameter_99
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_88 = paddle._C_ops.add(matmul_78, parameter_98)
+        del matmul_78, parameter_98
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_43 = paddle._C_ops.reshape(add_87, full_int_array_6)
+        del add_87
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_41 = paddle._C_ops.transpose(reshape_43, [0, 2, 1, 3])
+        del reshape_43
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(add_88, full_int_array_6)
+        del add_88
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_42 = paddle._C_ops.transpose(reshape_44, [0, 2, 1, 3])
+        del reshape_44
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(transpose_40, full_8, float("0"), True)
+        del transpose_40
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_79 = paddle._C_ops.matmul(scale_14, transpose_41, False, True)
+        del scale_14, transpose_41
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 21x21xf32)
+        add_89 = paddle._C_ops.add(matmul_79, triu_0)
+        del matmul_79
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_10 = paddle._C_ops.softmax(add_89, -1)
+        del add_89
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_80 = paddle._C_ops.matmul(dropout_76, transpose_42, False, False)
+        del dropout_76, transpose_42
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_43 = paddle._C_ops.transpose(matmul_80, [0, 2, 1, 3])
+        del matmul_80
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_45 = paddle._C_ops.reshape(transpose_43, full_int_array_7)
+        del transpose_43
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_81 = paddle._C_ops.matmul(reshape_45, parameter_97, False, False)
+        del parameter_97, reshape_45
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_90 = paddle._C_ops.add(matmul_81, parameter_96)
+        del matmul_81, parameter_96
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_90, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_90
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_91 = paddle._C_ops.add(layer_norm_58, dropout_78)
+        del dropout_78, layer_norm_58
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_61, layer_norm_62, layer_norm_63 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_91, parameter_83, parameter_82, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_91, parameter_82, parameter_83
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_61, parameter_95, False, False)
+        del parameter_95
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_92 = paddle._C_ops.add(matmul_82, parameter_94)
+        del matmul_82, parameter_94
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(add_92, full_int_array_6)
+        del add_92
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_44 = paddle._C_ops.transpose(reshape_46, [0, 2, 1, 3])
+        del reshape_46
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_83 = paddle._C_ops.matmul(layer_norm_37, parameter_93, False, False)
+        del parameter_93
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_93 = paddle._C_ops.add(matmul_83, parameter_92)
+        del matmul_83, parameter_92
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_84 = paddle._C_ops.matmul(layer_norm_37, parameter_91, False, False)
+        del parameter_91
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_94 = paddle._C_ops.add(matmul_84, parameter_90)
+        del matmul_84, parameter_90
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_47 = paddle._C_ops.reshape(add_93, full_int_array_6)
+        del add_93
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_45 = paddle._C_ops.transpose(reshape_47, [0, 2, 1, 3])
+        del reshape_47
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_48 = paddle._C_ops.reshape(add_94, full_int_array_6)
+        del add_94
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_46 = paddle._C_ops.transpose(reshape_48, [0, 2, 1, 3])
+        del reshape_48
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(transpose_44, full_8, float("0"), True)
+        del transpose_44
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_85 = paddle._C_ops.matmul(scale_15, transpose_45, False, True)
+        del scale_15, transpose_45
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_95 = paddle._C_ops.add(matmul_85, slice_1)
+        del matmul_85
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_11 = paddle._C_ops.softmax(add_95, -1)
+        del add_95
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_86 = paddle._C_ops.matmul(dropout_80, transpose_46, False, False)
+        del dropout_80, transpose_46
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_47 = paddle._C_ops.transpose(matmul_86, [0, 2, 1, 3])
+        del matmul_86
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_49 = paddle._C_ops.reshape(transpose_47, full_int_array_7)
+        del transpose_47
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_87 = paddle._C_ops.matmul(reshape_49, parameter_89, False, False)
+        del parameter_89, reshape_49
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_96 = paddle._C_ops.add(matmul_87, parameter_88)
+        del matmul_87, parameter_88
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_96, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_96
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_97 = paddle._C_ops.add(layer_norm_61, dropout_82)
+        del dropout_82, layer_norm_61
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_64, layer_norm_65, layer_norm_66 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_97, parameter_81, parameter_80, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_97, parameter_80, parameter_81
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_88 = paddle._C_ops.matmul(layer_norm_64, parameter_87, False, False)
+        del parameter_87
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_98 = paddle._C_ops.add(matmul_88, parameter_86)
+        del matmul_88, parameter_86
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_8 = paddle._C_ops.gelu(add_98, False)
+        del add_98
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_8, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_8
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_89 = paddle._C_ops.matmul(dropout_84, parameter_85, False, False)
+        del dropout_84, parameter_85
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_99 = paddle._C_ops.add(matmul_89, parameter_84)
+        del matmul_89, parameter_84
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_99, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_99
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_100 = paddle._C_ops.add(layer_norm_64, dropout_86)
+        del dropout_86, layer_norm_64
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_67, layer_norm_68, layer_norm_69 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_100, parameter_79, parameter_78, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_100, parameter_78, parameter_79
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_67, parameter_77, False, False)
+        del parameter_77
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_101 = paddle._C_ops.add(matmul_90, parameter_76)
+        del matmul_90, parameter_76
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(add_101, full_int_array_6)
+        del add_101
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_48 = paddle._C_ops.transpose(reshape_50, [0, 2, 1, 3])
+        del reshape_50
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_67, parameter_75, False, False)
+        del parameter_75
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_102 = paddle._C_ops.add(matmul_91, parameter_74)
+        del matmul_91, parameter_74
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_92 = paddle._C_ops.matmul(layer_norm_67, parameter_73, False, False)
+        del parameter_73
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_103 = paddle._C_ops.add(matmul_92, parameter_72)
+        del matmul_92, parameter_72
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_51 = paddle._C_ops.reshape(add_102, full_int_array_6)
+        del add_102
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_49 = paddle._C_ops.transpose(reshape_51, [0, 2, 1, 3])
+        del reshape_51
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(add_103, full_int_array_6)
+        del add_103
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_50 = paddle._C_ops.transpose(reshape_52, [0, 2, 1, 3])
+        del reshape_52
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_16 = paddle._C_ops.scale(transpose_48, full_8, float("0"), True)
+        del transpose_48
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_93 = paddle._C_ops.matmul(scale_16, transpose_49, False, True)
+        del scale_16, transpose_49
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 21x21xf32)
+        add_104 = paddle._C_ops.add(matmul_93, triu_0)
+        del matmul_93
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_12 = paddle._C_ops.softmax(add_104, -1)
+        del add_104
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_12, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_12
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_94 = paddle._C_ops.matmul(dropout_88, transpose_50, False, False)
+        del dropout_88, transpose_50
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_51 = paddle._C_ops.transpose(matmul_94, [0, 2, 1, 3])
+        del matmul_94
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_53 = paddle._C_ops.reshape(transpose_51, full_int_array_7)
+        del transpose_51
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_95 = paddle._C_ops.matmul(reshape_53, parameter_71, False, False)
+        del parameter_71, reshape_53
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_105 = paddle._C_ops.add(matmul_95, parameter_70)
+        del matmul_95, parameter_70
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_105, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_105
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_106 = paddle._C_ops.add(layer_norm_67, dropout_90)
+        del dropout_90, layer_norm_67
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_70, layer_norm_71, layer_norm_72 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_106, parameter_57, parameter_56, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_106, parameter_56, parameter_57
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_96 = paddle._C_ops.matmul(layer_norm_70, parameter_69, False, False)
+        del parameter_69
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_107 = paddle._C_ops.add(matmul_96, parameter_68)
+        del matmul_96, parameter_68
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(add_107, full_int_array_6)
+        del add_107
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_52 = paddle._C_ops.transpose(reshape_54, [0, 2, 1, 3])
+        del reshape_54
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_97 = paddle._C_ops.matmul(layer_norm_37, parameter_67, False, False)
+        del parameter_67
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_108 = paddle._C_ops.add(matmul_97, parameter_66)
+        del matmul_97, parameter_66
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_98 = paddle._C_ops.matmul(layer_norm_37, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_109 = paddle._C_ops.add(matmul_98, parameter_64)
+        del matmul_98, parameter_64
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_55 = paddle._C_ops.reshape(add_108, full_int_array_6)
+        del add_108
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_53 = paddle._C_ops.transpose(reshape_55, [0, 2, 1, 3])
+        del reshape_55
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(add_109, full_int_array_6)
+        del add_109
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_54 = paddle._C_ops.transpose(reshape_56, [0, 2, 1, 3])
+        del reshape_56
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_17 = paddle._C_ops.scale(transpose_52, full_8, float("0"), True)
+        del transpose_52
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_99 = paddle._C_ops.matmul(scale_17, transpose_53, False, True)
+        del scale_17, transpose_53
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_110 = paddle._C_ops.add(matmul_99, slice_1)
+        del matmul_99
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_13 = paddle._C_ops.softmax(add_110, -1)
+        del add_110
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_13, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_13
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_100 = paddle._C_ops.matmul(dropout_92, transpose_54, False, False)
+        del dropout_92, transpose_54
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_55 = paddle._C_ops.transpose(matmul_100, [0, 2, 1, 3])
+        del matmul_100
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_57 = paddle._C_ops.reshape(transpose_55, full_int_array_7)
+        del transpose_55
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_101 = paddle._C_ops.matmul(reshape_57, parameter_63, False, False)
+        del parameter_63, reshape_57
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_111 = paddle._C_ops.add(matmul_101, parameter_62)
+        del matmul_101, parameter_62
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_111, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_111
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_112 = paddle._C_ops.add(layer_norm_70, dropout_94)
+        del dropout_94, layer_norm_70
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_73, layer_norm_74, layer_norm_75 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_112, parameter_55, parameter_54, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_112, parameter_54, parameter_55
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_102 = paddle._C_ops.matmul(layer_norm_73, parameter_61, False, False)
+        del parameter_61
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_113 = paddle._C_ops.add(matmul_102, parameter_60)
+        del matmul_102, parameter_60
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_9 = paddle._C_ops.gelu(add_113, False)
+        del add_113
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_9, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_9
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_103 = paddle._C_ops.matmul(dropout_96, parameter_59, False, False)
+        del dropout_96, parameter_59
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_114 = paddle._C_ops.add(matmul_103, parameter_58)
+        del matmul_103, parameter_58
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_114, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_114
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_115 = paddle._C_ops.add(layer_norm_73, dropout_98)
+        del dropout_98, layer_norm_73
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_76, layer_norm_77, layer_norm_78 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_115, parameter_53, parameter_52, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_115, parameter_52, parameter_53
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_104 = paddle._C_ops.matmul(layer_norm_76, parameter_51, False, False)
+        del parameter_51
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_116 = paddle._C_ops.add(matmul_104, parameter_50)
+        del matmul_104, parameter_50
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(add_116, full_int_array_6)
+        del add_116
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_56 = paddle._C_ops.transpose(reshape_58, [0, 2, 1, 3])
+        del reshape_58
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_105 = paddle._C_ops.matmul(layer_norm_76, parameter_49, False, False)
+        del parameter_49
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_117 = paddle._C_ops.add(matmul_105, parameter_48)
+        del matmul_105, parameter_48
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_106 = paddle._C_ops.matmul(layer_norm_76, parameter_47, False, False)
+        del parameter_47
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_118 = paddle._C_ops.add(matmul_106, parameter_46)
+        del matmul_106, parameter_46
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_59 = paddle._C_ops.reshape(add_117, full_int_array_6)
+        del add_117
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_57 = paddle._C_ops.transpose(reshape_59, [0, 2, 1, 3])
+        del reshape_59
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(add_118, full_int_array_6)
+        del add_118
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_58 = paddle._C_ops.transpose(reshape_60, [0, 2, 1, 3])
+        del reshape_60
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_18 = paddle._C_ops.scale(transpose_56, full_8, float("0"), True)
+        del transpose_56
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_107 = paddle._C_ops.matmul(scale_18, transpose_57, False, True)
+        del scale_18, transpose_57
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 21x21xf32)
+        add_119 = paddle._C_ops.add(matmul_107, triu_0)
+        del matmul_107
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_14 = paddle._C_ops.softmax(add_119, -1)
+        del add_119
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_14, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_14
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_108 = paddle._C_ops.matmul(dropout_100, transpose_58, False, False)
+        del dropout_100, transpose_58
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_59 = paddle._C_ops.transpose(matmul_108, [0, 2, 1, 3])
+        del matmul_108
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_61 = paddle._C_ops.reshape(transpose_59, full_int_array_7)
+        del transpose_59
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_109 = paddle._C_ops.matmul(reshape_61, parameter_45, False, False)
+        del parameter_45, reshape_61
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_120 = paddle._C_ops.add(matmul_109, parameter_44)
+        del matmul_109, parameter_44
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_102, dropout_103 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_120, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_120
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_121 = paddle._C_ops.add(layer_norm_76, dropout_102)
+        del dropout_102, layer_norm_76
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_79, layer_norm_80, layer_norm_81 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_121, parameter_31, parameter_30, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_121, parameter_30, parameter_31
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_110 = paddle._C_ops.matmul(layer_norm_79, parameter_43, False, False)
+        del parameter_43
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_122 = paddle._C_ops.add(matmul_110, parameter_42)
+        del matmul_110, parameter_42
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_62 = paddle._C_ops.reshape(add_122, full_int_array_6)
+        del add_122
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_60 = paddle._C_ops.transpose(reshape_62, [0, 2, 1, 3])
+        del reshape_62
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_111 = paddle._C_ops.matmul(layer_norm_37, parameter_41, False, False)
+        del parameter_41
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_123 = paddle._C_ops.add(matmul_111, parameter_40)
+        del matmul_111, parameter_40
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_112 = paddle._C_ops.matmul(layer_norm_37, parameter_39, False, False)
+        del parameter_39
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_124 = paddle._C_ops.add(matmul_112, parameter_38)
+        del matmul_112, parameter_38
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_63 = paddle._C_ops.reshape(add_123, full_int_array_6)
+        del add_123
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_61 = paddle._C_ops.transpose(reshape_63, [0, 2, 1, 3])
+        del reshape_63
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(add_124, full_int_array_6)
+        del add_124
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_62 = paddle._C_ops.transpose(reshape_64, [0, 2, 1, 3])
+        del reshape_64
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_19 = paddle._C_ops.scale(transpose_60, full_8, float("0"), True)
+        del transpose_60
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_113 = paddle._C_ops.matmul(scale_19, transpose_61, False, True)
+        del scale_19, transpose_61
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_125 = paddle._C_ops.add(matmul_113, slice_1)
+        del matmul_113
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_15 = paddle._C_ops.softmax(add_125, -1)
+        del add_125
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_104, dropout_105 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_15, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_15
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_114 = paddle._C_ops.matmul(dropout_104, transpose_62, False, False)
+        del dropout_104, transpose_62
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_63 = paddle._C_ops.transpose(matmul_114, [0, 2, 1, 3])
+        del matmul_114
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_65 = paddle._C_ops.reshape(transpose_63, full_int_array_7)
+        del transpose_63
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_115 = paddle._C_ops.matmul(reshape_65, parameter_37, False, False)
+        del parameter_37, reshape_65
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_126 = paddle._C_ops.add(matmul_115, parameter_36)
+        del matmul_115, parameter_36
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_106, dropout_107 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_126, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_126
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_127 = paddle._C_ops.add(layer_norm_79, dropout_106)
+        del dropout_106, layer_norm_79
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_82, layer_norm_83, layer_norm_84 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_127, parameter_29, parameter_28, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_127, parameter_28, parameter_29
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_116 = paddle._C_ops.matmul(layer_norm_82, parameter_35, False, False)
+        del parameter_35
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_128 = paddle._C_ops.add(matmul_116, parameter_34)
+        del matmul_116, parameter_34
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_10 = paddle._C_ops.gelu(add_128, False)
+        del add_128
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_108, dropout_109 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_10, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_10
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_117 = paddle._C_ops.matmul(dropout_108, parameter_33, False, False)
+        del dropout_108, parameter_33
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_129 = paddle._C_ops.add(matmul_117, parameter_32)
+        del matmul_117, parameter_32
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_110, dropout_111 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_129, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_129
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_130 = paddle._C_ops.add(layer_norm_82, dropout_110)
+        del dropout_110, layer_norm_82
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_85, layer_norm_86, layer_norm_87 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_130, parameter_27, parameter_26, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_130, parameter_26, parameter_27
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_118 = paddle._C_ops.matmul(layer_norm_85, parameter_25, False, False)
+        del parameter_25
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_131 = paddle._C_ops.add(matmul_118, parameter_24)
+        del matmul_118, parameter_24
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(add_131, full_int_array_6)
+        del add_131
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_64 = paddle._C_ops.transpose(reshape_66, [0, 2, 1, 3])
+        del reshape_66
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_119 = paddle._C_ops.matmul(layer_norm_85, parameter_23, False, False)
+        del parameter_23
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_132 = paddle._C_ops.add(matmul_119, parameter_22)
+        del matmul_119, parameter_22
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_120 = paddle._C_ops.matmul(layer_norm_85, parameter_21, False, False)
+        del parameter_21
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_133 = paddle._C_ops.add(matmul_120, parameter_20)
+        del matmul_120, parameter_20
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_67 = paddle._C_ops.reshape(add_132, full_int_array_6)
+        del add_132
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_65 = paddle._C_ops.transpose(reshape_67, [0, 2, 1, 3])
+        del reshape_67
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(add_133, full_int_array_6)
+        del add_133
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_66 = paddle._C_ops.transpose(reshape_68, [0, 2, 1, 3])
+        del reshape_68
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_20 = paddle._C_ops.scale(transpose_64, full_8, float("0"), True)
+        del transpose_64
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_121 = paddle._C_ops.matmul(scale_20, transpose_65, False, True)
+        del scale_20, transpose_65
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 21x21xf32)
+        add_134 = paddle._C_ops.add(matmul_121, triu_0)
+        del matmul_121, triu_0
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_16 = paddle._C_ops.softmax(add_134, -1)
+        del add_134
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_112, dropout_113 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_16, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_16
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_122 = paddle._C_ops.matmul(dropout_112, transpose_66, False, False)
+        del dropout_112, transpose_66
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_67 = paddle._C_ops.transpose(matmul_122, [0, 2, 1, 3])
+        del matmul_122
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_69 = paddle._C_ops.reshape(transpose_67, full_int_array_7)
+        del transpose_67
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_123 = paddle._C_ops.matmul(reshape_69, parameter_19, False, False)
+        del parameter_19, reshape_69
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_135 = paddle._C_ops.add(matmul_123, parameter_18)
+        del matmul_123, parameter_18
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_114, dropout_115 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_135, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_135
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_136 = paddle._C_ops.add(layer_norm_85, dropout_114)
+        del dropout_114, layer_norm_85
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_88, layer_norm_89, layer_norm_90 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_136, parameter_5, parameter_4, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_136, parameter_4, parameter_5
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_124 = paddle._C_ops.matmul(layer_norm_88, parameter_17, False, False)
+        del parameter_17
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_137 = paddle._C_ops.add(matmul_124, parameter_16)
+        del matmul_124, parameter_16
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(add_137, full_int_array_6)
+        del add_137
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_68 = paddle._C_ops.transpose(reshape_70, [0, 2, 1, 3])
+        del reshape_70
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_125 = paddle._C_ops.matmul(layer_norm_37, parameter_15, False, False)
+        del parameter_15
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_138 = paddle._C_ops.add(matmul_125, parameter_14)
+        del matmul_125, parameter_14
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_126 = paddle._C_ops.matmul(layer_norm_37, parameter_13, False, False)
+        del parameter_13
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_139 = paddle._C_ops.add(matmul_126, parameter_12)
+        del matmul_126, parameter_12
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_71 = paddle._C_ops.reshape(add_138, full_int_array_6)
+        del add_138
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_69 = paddle._C_ops.transpose(reshape_71, [0, 2, 1, 3])
+        del reshape_71
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_72 = paddle._C_ops.reshape(add_139, full_int_array_6)
+        del add_139, full_int_array_6
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_70 = paddle._C_ops.transpose(reshape_72, [0, 2, 1, 3])
+        del reshape_72
+
+        # pd_op.scale: (1x12x21x64xf32) <- (1x12x21x64xf32, 1xf32)
+        scale_21 = paddle._C_ops.scale(transpose_68, full_8, float("0"), True)
+        del full_8, transpose_68
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_127 = paddle._C_ops.matmul(scale_21, transpose_69, False, True)
+        del scale_21, transpose_69
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_140 = paddle._C_ops.add(matmul_127, slice_1)
+        del matmul_127, slice_1
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_17 = paddle._C_ops.softmax(add_140, -1)
+        del add_140
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_116, dropout_117 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_17, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_17
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_128 = paddle._C_ops.matmul(dropout_116, transpose_70, False, False)
+        del dropout_116, transpose_70
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_71 = paddle._C_ops.transpose(matmul_128, [0, 2, 1, 3])
+        del matmul_128
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_73 = paddle._C_ops.reshape(transpose_71, full_int_array_7)
+        del full_int_array_7, transpose_71
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_129 = paddle._C_ops.matmul(reshape_73, parameter_11, False, False)
+        del parameter_11, reshape_73
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_141 = paddle._C_ops.add(matmul_129, parameter_10)
+        del matmul_129, parameter_10
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_118, dropout_119 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_141, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_141
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_142 = paddle._C_ops.add(layer_norm_88, dropout_118)
+        del dropout_118, layer_norm_88
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_91, layer_norm_92, layer_norm_93 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_142, parameter_3, parameter_2, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_142, parameter_2, parameter_3
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_130 = paddle._C_ops.matmul(layer_norm_91, parameter_9, False, False)
+        del parameter_9
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_143 = paddle._C_ops.add(matmul_130, parameter_8)
+        del matmul_130, parameter_8
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_11 = paddle._C_ops.gelu(add_143, False)
+        del add_143
+
+        # pd_op.dropout: (1x21x3072xf32, 1x21x3072xui8) <- (1x21x3072xf32, None, 1xf32)
+        dropout_120, dropout_121 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_11, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_11
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_131 = paddle._C_ops.matmul(dropout_120, parameter_7, False, False)
+        del dropout_120, parameter_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_144 = paddle._C_ops.add(matmul_131, parameter_6)
+        del matmul_131, parameter_6
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_122, dropout_123 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_144, None, full_7, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_144, full_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_145 = paddle._C_ops.add(layer_norm_91, dropout_122)
+        del dropout_122, layer_norm_91
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_0, layer_norm_94, layer_norm_95 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_145, parameter_1, parameter_0, float("1e-05"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_145, layer_norm_37, parameter_0, parameter_1
+
+        return layer_norm_0
diff --git a/paddle_samples/PaddleNLP/bart-base/weight_meta.py b/paddle_samples/PaddleNLP/bart-base/weight_meta.py
new file mode 100644
index 000000000..da97709f4
--- /dev/null
+++ b/paddle_samples/PaddleNLP/bart-base/weight_meta.py
@@ -0,0 +1,2847 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.04492")
+    max_val = float("0.306152")
+    mean = float("0.055763")
+    std = float("0.089925")
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.173218")
+    max_val = float("3.4375")
+    mean = float("1.94378")
+    std = float("0.223736")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-3.43164")
+    max_val = float("0.798828")
+    mean = float("-0.0740031")
+    std = float("0.183041")
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0898438")
+    max_val = float("1.05859")
+    mean = float("0.448075")
+    std = float("0.0791667")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.96484")
+    max_val = float("0.452637")
+    mean = float("-0.0887964")
+    std = float("0.179041")
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.100952")
+    max_val = float("1.38281")
+    mean = float("0.700254")
+    std = float("0.0784551")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.169556")
+    max_val = float("0.132935")
+    mean = float("0.000291553")
+    std = float("0.0443588")
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-3.50391")
+    max_val = float("2.57422")
+    mean = float("-6.89082e-06")
+    std = float("0.0345879")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.286377")
+    max_val = float("0.204346")
+    mean = float("-0.0645512")
+    std = float("0.0490256")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.619629")
+    max_val = float("1.25098")
+    mean = float("0.00965715")
+    std = float("0.0439153")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.250488")
+    max_val = float("1.37598")
+    mean = float("-0.00115733")
+    std = float("0.095266")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.835938")
+    max_val = float("0.911621")
+    mean = float("-3.88315e-05")
+    std = float("0.0716457")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.251221")
+    max_val = float("0.283203")
+    mean = float("0.000441133")
+    std = float("0.0337921")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.452393")
+    max_val = float("0.563477")
+    mean = float("3.47693e-05")
+    std = float("0.0701192")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-3.61328")
+    max_val = float("4.75781")
+    mean = float("0.00439294")
+    std = float("0.663468")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.865723")
+    max_val = float("0.789551")
+    mean = float("8.22635e-05")
+    std = float("0.0927952")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.488037")
+    max_val = float("0.389404")
+    mean = float("0.000932506")
+    std = float("0.163855")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.7334")
+    max_val = float("1.85742")
+    mean = float("-0.000255722")
+    std = float("0.0943624")
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.235352")
+    max_val = float("0.364746")
+    mean = float("0.00010426")
+    std = float("0.0253047")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.807617")
+    max_val = float("0.620605")
+    mean = float("1.63106e-05")
+    std = float("0.041461")
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0592346")
+    max_val = float("0.0612488")
+    mean = float("-0.000747097")
+    std = float("0.0133801")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.300781")
+    max_val = float("0.322266")
+    mean = float("0.000227124")
+    std = float("0.0476506")
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.29199")
+    max_val = float("1.19531")
+    mean = float("-0.0081555")
+    std = float("0.132014")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.55371")
+    max_val = float("1.74512")
+    mean = float("0.000290464")
+    std = float("0.0809273")
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.81543")
+    max_val = float("0.980469")
+    mean = float("0.0296052")
+    std = float("0.33146")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.41016")
+    max_val = float("1.4834")
+    mean = float("-0.00106662")
+    std = float("0.0814963")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.525391")
+    max_val = float("1.01855")
+    mean = float("-0.0194161")
+    std = float("0.0492517")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.162476")
+    max_val = float("1.24023")
+    mean = float("0.450819")
+    std = float("0.039689")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.584961")
+    max_val = float("0.831543")
+    mean = float("-0.0801088")
+    std = float("0.0702904")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.300537")
+    max_val = float("4.84766")
+    mean = float("0.43358")
+    std = float("0.1703")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.84668")
+    max_val = float("0.795898")
+    mean = float("-0.0622566")
+    std = float("0.11909")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.430176")
+    max_val = float("1.08105")
+    mean = float("0.856139")
+    std = float("0.0427192")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.389648")
+    max_val = float("0.293213")
+    mean = float("0.00130731")
+    std = float("0.0739026")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-7.99219")
+    max_val = float("1.37988")
+    mean = float("1.10711e-05")
+    std = float("0.0398258")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.512695")
+    max_val = float("0.184204")
+    mean = float("-0.101517")
+    std = float("0.0660059")
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.644043")
+    max_val = float("0.643555")
+    mean = float("0.0102842")
+    std = float("0.0506794")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.354248")
+    max_val = float("0.405029")
+    mean = float("-0.000719839")
+    std = float("0.107218")
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.670898")
+    max_val = float("0.637207")
+    mean = float("1.43374e-05")
+    std = float("0.0605907")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.30127")
+    max_val = float("0.42627")
+    mean = float("0.000952943")
+    std = float("0.0434266")
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.387207")
+    max_val = float("0.429688")
+    mean = float("0.000127731")
+    std = float("0.061711")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.253174")
+    max_val = float("0.419189")
+    mean = float("-0.00177001")
+    std = float("0.0497663")
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.776855")
+    max_val = float("0.799316")
+    mean = float("7.70503e-05")
+    std = float("0.0864494")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.891113")
+    max_val = float("0.683594")
+    mean = float("-0.00213796")
+    std = float("0.261913")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.750977")
+    max_val = float("0.873047")
+    mean = float("6.84442e-05")
+    std = float("0.0878441")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.168823")
+    max_val = float("0.632324")
+    mean = float("0.00101271")
+    std = float("0.0418257")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.821289")
+    max_val = float("0.790527")
+    mean = float("3.19495e-05")
+    std = float("0.0396491")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.182373")
+    max_val = float("0.14624")
+    mean = float("-0.000428564")
+    std = float("0.0211172")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.29126")
+    max_val = float("0.411133")
+    mean = float("8.66595e-05")
+    std = float("0.044702")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.190063")
+    max_val = float("0.234985")
+    mean = float("0.000707203")
+    std = float("0.0223429")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.29395")
+    max_val = float("1.32422")
+    mean = float("0.00019809")
+    std = float("0.0789618")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.854492")
+    max_val = float("0.936523")
+    mean = float("0.0127257")
+    std = float("0.330065")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.15039")
+    max_val = float("1.05664")
+    mean = float("-0.000430712")
+    std = float("0.0801342")
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.574219")
+    max_val = float("0.175659")
+    mean = float("-0.0174997")
+    std = float("0.0296027")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.113892")
+    max_val = float("0.970215")
+    mean = float("0.45621")
+    std = float("0.0352021")
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.939453")
+    max_val = float("1.45996")
+    mean = float("-0.0318848")
+    std = float("0.102574")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.342529")
+    max_val = float("2.05273")
+    mean = float("0.4438")
+    std = float("0.0814034")
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.831055")
+    max_val = float("1.01172")
+    mean = float("0.0763767")
+    std = float("0.0859585")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.376465")
+    max_val = float("1.48242")
+    mean = float("0.867288")
+    std = float("0.0510089")
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.449463")
+    max_val = float("0.487061")
+    mean = float("0.000971075")
+    std = float("0.0768242")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-8.75")
+    max_val = float("1.29297")
+    mean = float("1.43174e-05")
+    std = float("0.0403077")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.50293")
+    max_val = float("0.518066")
+    mean = float("-0.117669")
+    std = float("0.089743")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.59082")
+    max_val = float("0.566895")
+    mean = float("0.00265853")
+    std = float("0.0538356")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.241455")
+    max_val = float("0.182861")
+    mean = float("-0.00041845")
+    std = float("0.0623274")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.678223")
+    max_val = float("0.672852")
+    mean = float("8.92212e-06")
+    std = float("0.059449")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.218506")
+    max_val = float("0.34082")
+    mean = float("0.00411712")
+    std = float("0.0441978")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.334961")
+    max_val = float("0.41748")
+    mean = float("0.000128222")
+    std = float("0.061607")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.226685")
+    max_val = float("0.141357")
+    mean = float("-8.03176e-05")
+    std = float("0.0257067")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.789551")
+    max_val = float("0.868164")
+    mean = float("0.000165547")
+    std = float("0.0860855")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.04688")
+    max_val = float("1.00195")
+    mean = float("0.00284015")
+    std = float("0.212825")
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.649902")
+    max_val = float("0.692871")
+    mean = float("0.000140688")
+    std = float("0.0874408")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.123291")
+    max_val = float("0.224243")
+    mean = float("0.00126714")
+    std = float("0.0280615")
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.560059")
+    max_val = float("0.460693")
+    mean = float("2.28893e-06")
+    std = float("0.0395396")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0745239")
+    max_val = float("0.12915")
+    mean = float("0.00015724")
+    std = float("0.0152254")
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.318604")
+    max_val = float("0.417236")
+    mean = float("-6.24724e-05")
+    std = float("0.0436322")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0912476")
+    max_val = float("0.0578613")
+    mean = float("2.14512e-05")
+    std = float("0.011152")
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.28223")
+    max_val = float("1.32227")
+    mean = float("3.04413e-05")
+    std = float("0.0783237")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.82666")
+    max_val = float("0.908203")
+    mean = float("-0.0128437")
+    std = float("0.341964")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.18848")
+    max_val = float("1.08301")
+    mean = float("0.00056576")
+    std = float("0.0795572")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.217529")
+    max_val = float("0.217651")
+    mean = float("-0.0218131")
+    std = float("0.0205795")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0614319")
+    max_val = float("0.955078")
+    mean = float("0.449719")
+    std = float("0.0347093")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-2.22266")
+    max_val = float("0.947754")
+    mean = float("0.0312634")
+    std = float("0.114534")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.351562")
+    max_val = float("4.48438")
+    mean = float("0.447064")
+    std = float("0.158699")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.71289")
+    max_val = float("0.557617")
+    mean = float("-0.0879682")
+    std = float("0.0934652")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.454346")
+    max_val = float("1.08105")
+    mean = float("0.860268")
+    std = float("0.0483674")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.310059")
+    max_val = float("0.560059")
+    mean = float("0.000284155")
+    std = float("0.0833974")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-8.57031")
+    max_val = float("1.39453")
+    mean = float("-1.11405e-05")
+    std = float("0.0429414")
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.414551")
+    max_val = float("0.718262")
+    mean = float("-0.108521")
+    std = float("0.0752428")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.921387")
+    max_val = float("0.639648")
+    mean = float("-0.00312582")
+    std = float("0.0542425")
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.215576")
+    max_val = float("0.171021")
+    mean = float("-0.000124251")
+    std = float("0.0563701")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.643066")
+    max_val = float("0.593262")
+    mean = float("-1.83982e-05")
+    std = float("0.066252")
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.279297")
+    max_val = float("0.187866")
+    mean = float("-0.000428466")
+    std = float("0.0345215")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.373535")
+    max_val = float("0.445312")
+    mean = float("-6.60571e-05")
+    std = float("0.064403")
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0603638")
+    max_val = float("0.0473633")
+    mean = float("-2.00152e-05")
+    std = float("0.00939217")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.24902")
+    max_val = float("1.01172")
+    mean = float("-6.80772e-05")
+    std = float("0.085726")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.70166")
+    max_val = float("0.717773")
+    mean = float("0.00868722")
+    std = float("0.185542")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.970703")
+    max_val = float("1.11133")
+    mean = float("-0.000605337")
+    std = float("0.0885201")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.169678")
+    max_val = float("0.265625")
+    mean = float("0.00446949")
+    std = float("0.0439148")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.785645")
+    max_val = float("0.54834")
+    mean = float("-6.01225e-06")
+    std = float("0.0431069")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.130371")
+    max_val = float("0.0531311")
+    mean = float("-0.000277936")
+    std = float("0.0113482")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.432861")
+    max_val = float("0.347412")
+    mean = float("1.49241e-05")
+    std = float("0.0471165")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0888672")
+    max_val = float("0.0707397")
+    mean = float("-0.000208571")
+    std = float("0.00922685")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.27832")
+    max_val = float("1.38672")
+    mean = float("-0.000194419")
+    std = float("0.0777348")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.838379")
+    max_val = float("0.794922")
+    mean = float("-0.012945")
+    std = float("0.297579")
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.19336")
+    max_val = float("1.12012")
+    mean = float("0.000514542")
+    std = float("0.0786299")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.445801")
+    max_val = float("0.427002")
+    mean = float("-0.0174128")
+    std = float("0.0391896")
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0701904")
+    max_val = float("0.846191")
+    mean = float("0.453216")
+    std = float("0.0378213")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.66406")
+    max_val = float("1.25586")
+    mean = float("0.00152094")
+    std = float("0.152635")
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.35498")
+    max_val = float("2.83594")
+    mean = float("0.422616")
+    std = float("0.113766")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.19141")
+    max_val = float("0.778809")
+    mean = float("-0.0969552")
+    std = float("0.102802")
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.325195")
+    max_val = float("1.28223")
+    mean = float("0.789448")
+    std = float("0.0633351")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.505371")
+    max_val = float("0.302734")
+    mean = float("-2.42236e-05")
+    std = float("0.079225")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-16.0")
+    max_val = float("1.5293")
+    mean = float("3.81097e-06")
+    std = float("0.047523")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.657715")
+    max_val = float("0.853027")
+    mean = float("-0.113052")
+    std = float("0.0792442")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-4.28516")
+    max_val = float("3.55273")
+    mean = float("-0.000331231")
+    std = float("0.0573905")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.30957")
+    max_val = float("0.254395")
+    mean = float("0.00151452")
+    std = float("0.079716")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.581055")
+    max_val = float("0.697266")
+    mean = float("-8.59857e-05")
+    std = float("0.0701496")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.207397")
+    max_val = float("0.177246")
+    mean = float("-0.000163083")
+    std = float("0.0344208")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.362793")
+    max_val = float("0.40918")
+    mean = float("1.47647e-05")
+    std = float("0.0686297")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0366516")
+    max_val = float("0.0196533")
+    mean = float("-0.000163542")
+    std = float("0.00484102")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.05859")
+    max_val = float("0.830566")
+    mean = float("-1.70015e-05")
+    std = float("0.0835551")
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.691895")
+    max_val = float("0.602539")
+    mean = float("-0.000167921")
+    std = float("0.187074")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.73291")
+    max_val = float("0.792969")
+    mean = float("-0.000171674")
+    std = float("0.0862973")
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.322266")
+    max_val = float("0.326172")
+    mean = float("0.00665764")
+    std = float("0.0539895")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.719727")
+    max_val = float("0.59668")
+    mean = float("8.39694e-06")
+    std = float("0.0400239")
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.232178")
+    max_val = float("0.144531")
+    mean = float("-0.000777754")
+    std = float("0.0177002")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.311035")
+    max_val = float("0.535645")
+    mean = float("3.55577e-05")
+    std = float("0.0412915")
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.04953")
+    max_val = float("0.0541992")
+    mean = float("7.13267e-05")
+    std = float("0.00657817")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.42676")
+    max_val = float("1.26953")
+    mean = float("-5.46152e-07")
+    std = float("0.076587")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.16016")
+    max_val = float("1.12598")
+    mean = float("-0.0047377")
+    std = float("0.312352")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.17578")
+    max_val = float("1.56641")
+    mean = float("0.000141877")
+    std = float("0.0771282")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.08496")
+    max_val = float("0.491943")
+    mean = float("-0.0137443")
+    std = float("0.0612447")
+    data = None
+
+
+class Program_weight_tensor_parameter_131:
+    name = "parameter_131"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0552979")
+    max_val = float("0.859375")
+    mean = float("0.448837")
+    std = float("0.0475385")
+    data = None
+
+
+class Program_weight_tensor_parameter_132:
+    name = "parameter_132"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.79102")
+    max_val = float("2.39453")
+    mean = float("0.0118306")
+    std = float("0.15963")
+    data = None
+
+
+class Program_weight_tensor_parameter_133:
+    name = "parameter_133"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.221802")
+    max_val = float("1.71582")
+    mean = float("0.386176")
+    std = float("0.0745859")
+    data = None
+
+
+class Program_weight_tensor_parameter_134:
+    name = "parameter_134"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.683594")
+    max_val = float("1.88184")
+    mean = float("-0.118065")
+    std = float("0.116923")
+    data = None
+
+
+class Program_weight_tensor_parameter_135:
+    name = "parameter_135"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0745239")
+    max_val = float("1.16699")
+    mean = float("0.82075")
+    std = float("0.0724713")
+    data = None
+
+
+class Program_weight_tensor_parameter_136:
+    name = "parameter_136"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.502441")
+    max_val = float("0.364258")
+    mean = float("0.00420779")
+    std = float("0.0872173")
+    data = None
+
+
+class Program_weight_tensor_parameter_137:
+    name = "parameter_137"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-8.89844")
+    max_val = float("1.70898")
+    mean = float("-5.21274e-05")
+    std = float("0.0544477")
+    data = None
+
+
+class Program_weight_tensor_parameter_138:
+    name = "parameter_138"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.597168")
+    max_val = float("0.663574")
+    mean = float("-0.099702")
+    std = float("0.0814544")
+    data = None
+
+
+class Program_weight_tensor_parameter_139:
+    name = "parameter_139"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-2.30469")
+    max_val = float("1.85645")
+    mean = float("-0.000716272")
+    std = float("0.0636168")
+    data = None
+
+
+class Program_weight_tensor_parameter_140:
+    name = "parameter_140"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.357178")
+    max_val = float("0.34375")
+    mean = float("0.00170967")
+    std = float("0.118222")
+    data = None
+
+
+class Program_weight_tensor_parameter_141:
+    name = "parameter_141"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.945312")
+    max_val = float("0.866699")
+    mean = float("-1.87612e-05")
+    std = float("0.0678677")
+    data = None
+
+
+class Program_weight_tensor_parameter_142:
+    name = "parameter_142"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.371826")
+    max_val = float("0.228516")
+    mean = float("-0.00193949")
+    std = float("0.0475839")
+    data = None
+
+
+class Program_weight_tensor_parameter_143:
+    name = "parameter_143"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.336426")
+    max_val = float("0.433105")
+    mean = float("-0.000198563")
+    std = float("0.066908")
+    data = None
+
+
+class Program_weight_tensor_parameter_144:
+    name = "parameter_144"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0270386")
+    max_val = float("0.0551453")
+    mean = float("0.000303509")
+    std = float("0.00573231")
+    data = None
+
+
+class Program_weight_tensor_parameter_145:
+    name = "parameter_145"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.637207")
+    max_val = float("0.703125")
+    mean = float("-8.79024e-05")
+    std = float("0.085253")
+    data = None
+
+
+class Program_weight_tensor_parameter_146:
+    name = "parameter_146"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.702637")
+    max_val = float("0.77002")
+    mean = float("-0.00542469")
+    std = float("0.166002")
+    data = None
+
+
+class Program_weight_tensor_parameter_147:
+    name = "parameter_147"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.941406")
+    max_val = float("1.03711")
+    mean = float("0.000273284")
+    std = float("0.0895451")
+    data = None
+
+
+class Program_weight_tensor_parameter_148:
+    name = "parameter_148"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.610352")
+    max_val = float("0.316406")
+    mean = float("0.000763194")
+    std = float("0.08586")
+    data = None
+
+
+class Program_weight_tensor_parameter_149:
+    name = "parameter_149"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.824707")
+    max_val = float("0.943848")
+    mean = float("7.09477e-06")
+    std = float("0.037178")
+    data = None
+
+
+class Program_weight_tensor_parameter_150:
+    name = "parameter_150"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.5625")
+    max_val = float("0.742188")
+    mean = float("-0.00283873")
+    std = float("0.0791875")
+    data = None
+
+
+class Program_weight_tensor_parameter_151:
+    name = "parameter_151"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.390137")
+    max_val = float("0.364502")
+    mean = float("-1.03331e-05")
+    std = float("0.0367269")
+    data = None
+
+
+class Program_weight_tensor_parameter_152:
+    name = "parameter_152"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0423279")
+    max_val = float("0.0409546")
+    mean = float("-0.000207591")
+    std = float("0.00617958")
+    data = None
+
+
+class Program_weight_tensor_parameter_153:
+    name = "parameter_153"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.24707")
+    max_val = float("1.1123")
+    mean = float("-2.14991e-05")
+    std = float("0.0831736")
+    data = None
+
+
+class Program_weight_tensor_parameter_154:
+    name = "parameter_154"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.91016")
+    max_val = float("1.57617")
+    mean = float("-0.0154683")
+    std = float("0.578655")
+    data = None
+
+
+class Program_weight_tensor_parameter_155:
+    name = "parameter_155"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.47754")
+    max_val = float("1.46191")
+    mean = float("-5.53774e-06")
+    std = float("0.0802971")
+    data = None
+
+
+class Program_weight_tensor_parameter_156:
+    name = "parameter_156"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.535645")
+    max_val = float("0.442627")
+    mean = float("0.00182839")
+    std = float("0.0813837")
+    data = None
+
+
+class Program_weight_tensor_parameter_157:
+    name = "parameter_157"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0585022")
+    max_val = float("0.912109")
+    mean = float("0.544659")
+    std = float("0.084057")
+    data = None
+
+
+class Program_weight_tensor_parameter_158:
+    name = "parameter_158"
+    shape = [1026, 768]
+    dtype = "float32"
+    min_val = float("-3.52539")
+    max_val = float("2.60938")
+    mean = float("1.24352e-05")
+    std = float("0.0444859")
+    data = None
+
+
+class Program_weight_tensor_parameter_159:
+    name = "parameter_159"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.36328")
+    max_val = float("0.548828")
+    mean = float("0.00836064")
+    std = float("0.0607747")
+    data = None
+
+
+class Program_weight_tensor_parameter_160:
+    name = "parameter_160"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.081665")
+    max_val = float("0.330566")
+    mean = float("0.285268")
+    std = float("0.0176816")
+    data = None
+
+
+class Program_weight_tensor_parameter_161:
+    name = "parameter_161"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.663086")
+    max_val = float("1.60449")
+    mean = float("0.0474303")
+    std = float("0.0966976")
+    data = None
+
+
+class Program_weight_tensor_parameter_162:
+    name = "parameter_162"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.245239")
+    max_val = float("3.61523")
+    mean = float("0.525189")
+    std = float("0.151526")
+    data = None
+
+
+class Program_weight_tensor_parameter_163:
+    name = "parameter_163"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.190796")
+    max_val = float("0.19165")
+    mean = float("-0.000218492")
+    std = float("0.0559028")
+    data = None
+
+
+class Program_weight_tensor_parameter_164:
+    name = "parameter_164"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-2.40625")
+    max_val = float("2.63672")
+    mean = float("-9.17343e-06")
+    std = float("0.0367557")
+    data = None
+
+
+class Program_weight_tensor_parameter_165:
+    name = "parameter_165"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.673828")
+    max_val = float("0.380859")
+    mean = float("-0.0986715")
+    std = float("0.0815347")
+    data = None
+
+
+class Program_weight_tensor_parameter_166:
+    name = "parameter_166"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.740234")
+    max_val = float("0.485596")
+    mean = float("-0.00427577")
+    std = float("0.0506977")
+    data = None
+
+
+class Program_weight_tensor_parameter_167:
+    name = "parameter_167"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.127808")
+    max_val = float("0.125244")
+    mean = float("0.000134931")
+    std = float("0.0280722")
+    data = None
+
+
+class Program_weight_tensor_parameter_168:
+    name = "parameter_168"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.450928")
+    max_val = float("0.445801")
+    mean = float("-6.7444e-06")
+    std = float("0.0402753")
+    data = None
+
+
+class Program_weight_tensor_parameter_169:
+    name = "parameter_169"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.219238")
+    max_val = float("0.112427")
+    mean = float("-0.000542198")
+    std = float("0.0201758")
+    data = None
+
+
+class Program_weight_tensor_parameter_170:
+    name = "parameter_170"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.287354")
+    max_val = float("0.304932")
+    mean = float("-9.6642e-05")
+    std = float("0.0408038")
+    data = None
+
+
+class Program_weight_tensor_parameter_171:
+    name = "parameter_171"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0557556")
+    max_val = float("0.0474243")
+    mean = float("-0.000290657")
+    std = float("0.00681903")
+    data = None
+
+
+class Program_weight_tensor_parameter_172:
+    name = "parameter_172"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.864746")
+    max_val = float("0.875")
+    mean = float("2.86067e-05")
+    std = float("0.0779768")
+    data = None
+
+
+class Program_weight_tensor_parameter_173:
+    name = "parameter_173"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.892578")
+    max_val = float("0.794922")
+    mean = float("-0.00337976")
+    std = float("0.29479")
+    data = None
+
+
+class Program_weight_tensor_parameter_174:
+    name = "parameter_174"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.821289")
+    max_val = float("0.724609")
+    mean = float("0.000132867")
+    std = float("0.0780431")
+    data = None
+
+
+class Program_weight_tensor_parameter_175:
+    name = "parameter_175"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.53125")
+    max_val = float("0.651367")
+    mean = float("0.0206089")
+    std = float("0.0447142")
+    data = None
+
+
+class Program_weight_tensor_parameter_176:
+    name = "parameter_176"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.216431")
+    max_val = float("0.509277")
+    mean = float("0.438104")
+    std = float("0.0412302")
+    data = None
+
+
+class Program_weight_tensor_parameter_177:
+    name = "parameter_177"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.02734")
+    max_val = float("1.24902")
+    mean = float("0.0191815")
+    std = float("0.0949325")
+    data = None
+
+
+class Program_weight_tensor_parameter_178:
+    name = "parameter_178"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.249268")
+    max_val = float("2.80664")
+    mean = float("0.506467")
+    std = float("0.117859")
+    data = None
+
+
+class Program_weight_tensor_parameter_179:
+    name = "parameter_179"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.20105")
+    max_val = float("0.202759")
+    mean = float("0.00112513")
+    std = float("0.0543952")
+    data = None
+
+
+class Program_weight_tensor_parameter_180:
+    name = "parameter_180"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-1.33887")
+    max_val = float("3.57812")
+    mean = float("-4.80607e-06")
+    std = float("0.0419992")
+    data = None
+
+
+class Program_weight_tensor_parameter_181:
+    name = "parameter_181"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.450439")
+    max_val = float("0.274658")
+    mean = float("-0.100086")
+    std = float("0.0798469")
+    data = None
+
+
+class Program_weight_tensor_parameter_182:
+    name = "parameter_182"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.700195")
+    max_val = float("0.496094")
+    mean = float("-0.00125378")
+    std = float("0.0545837")
+    data = None
+
+
+class Program_weight_tensor_parameter_183:
+    name = "parameter_183"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.126099")
+    max_val = float("0.134155")
+    mean = float("-0.000686947")
+    std = float("0.0293917")
+    data = None
+
+
+class Program_weight_tensor_parameter_184:
+    name = "parameter_184"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.363525")
+    max_val = float("0.42627")
+    mean = float("-4.60709e-05")
+    std = float("0.049824")
+    data = None
+
+
+class Program_weight_tensor_parameter_185:
+    name = "parameter_185"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.140259")
+    max_val = float("0.101807")
+    mean = float("0.00126621")
+    std = float("0.0208813")
+    data = None
+
+
+class Program_weight_tensor_parameter_186:
+    name = "parameter_186"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.348633")
+    max_val = float("0.354736")
+    mean = float("0.000206098")
+    std = float("0.0512433")
+    data = None
+
+
+class Program_weight_tensor_parameter_187:
+    name = "parameter_187"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0294342")
+    max_val = float("0.0189362")
+    mean = float("-5.09798e-05")
+    std = float("0.00220086")
+    data = None
+
+
+class Program_weight_tensor_parameter_188:
+    name = "parameter_188"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.785645")
+    max_val = float("0.836914")
+    mean = float("-0.000125414")
+    std = float("0.075942")
+    data = None
+
+
+class Program_weight_tensor_parameter_189:
+    name = "parameter_189"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.08594")
+    max_val = float("0.89209")
+    mean = float("0.00592435")
+    std = float("0.315497")
+    data = None
+
+
+class Program_weight_tensor_parameter_190:
+    name = "parameter_190"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.728516")
+    max_val = float("0.709961")
+    mean = float("0.000360004")
+    std = float("0.076351")
+    data = None
+
+
+class Program_weight_tensor_parameter_191:
+    name = "parameter_191"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.340088")
+    max_val = float("0.50293")
+    mean = float("0.0183708")
+    std = float("0.0399188")
+    data = None
+
+
+class Program_weight_tensor_parameter_192:
+    name = "parameter_192"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.171143")
+    max_val = float("0.491455")
+    mean = float("0.41213")
+    std = float("0.051015")
+    data = None
+
+
+class Program_weight_tensor_parameter_193:
+    name = "parameter_193"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.31836")
+    max_val = float("1.36426")
+    mean = float("-0.0202332")
+    std = float("0.122209")
+    data = None
+
+
+class Program_weight_tensor_parameter_194:
+    name = "parameter_194"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.266602")
+    max_val = float("2.16602")
+    mean = float("0.474157")
+    std = float("0.113208")
+    data = None
+
+
+class Program_weight_tensor_parameter_195:
+    name = "parameter_195"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.227051")
+    max_val = float("0.191406")
+    mean = float("0.000647112")
+    std = float("0.0631629")
+    data = None
+
+
+class Program_weight_tensor_parameter_196:
+    name = "parameter_196"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-1.42871")
+    max_val = float("6.24219")
+    mean = float("9.09174e-06")
+    std = float("0.0448247")
+    data = None
+
+
+class Program_weight_tensor_parameter_197:
+    name = "parameter_197"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.449219")
+    max_val = float("0.574707")
+    mean = float("-0.104708")
+    std = float("0.0974598")
+    data = None
+
+
+class Program_weight_tensor_parameter_198:
+    name = "parameter_198"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.727051")
+    max_val = float("0.63623")
+    mean = float("0.0019482")
+    std = float("0.0582201")
+    data = None
+
+
+class Program_weight_tensor_parameter_199:
+    name = "parameter_199"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.109436")
+    max_val = float("0.155518")
+    mean = float("7.43844e-06")
+    std = float("0.0291774")
+    data = None
+
+
+class Program_weight_tensor_parameter_200:
+    name = "parameter_200"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.586426")
+    max_val = float("0.4375")
+    mean = float("1.10467e-05")
+    std = float("0.0527658")
+    data = None
+
+
+class Program_weight_tensor_parameter_201:
+    name = "parameter_201"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0904541")
+    max_val = float("0.178589")
+    mean = float("-0.000592086")
+    std = float("0.0249144")
+    data = None
+
+
+class Program_weight_tensor_parameter_202:
+    name = "parameter_202"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.379639")
+    max_val = float("0.483398")
+    mean = float("-0.000203025")
+    std = float("0.0524548")
+    data = None
+
+
+class Program_weight_tensor_parameter_203:
+    name = "parameter_203"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0160217")
+    max_val = float("0.00654984")
+    mean = float("-5.22973e-05")
+    std = float("0.00123598")
+    data = None
+
+
+class Program_weight_tensor_parameter_204:
+    name = "parameter_204"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.908691")
+    max_val = float("0.821289")
+    mean = float("7.01113e-05")
+    std = float("0.0774551")
+    data = None
+
+
+class Program_weight_tensor_parameter_205:
+    name = "parameter_205"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.941895")
+    max_val = float("1.03516")
+    mean = float("-0.00343673")
+    std = float("0.340425")
+    data = None
+
+
+class Program_weight_tensor_parameter_206:
+    name = "parameter_206"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.572266")
+    max_val = float("0.587891")
+    mean = float("1.32304e-06")
+    std = float("0.0773302")
+    data = None
+
+
+class Program_weight_tensor_parameter_207:
+    name = "parameter_207"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.281006")
+    max_val = float("0.467529")
+    mean = float("0.0220191")
+    std = float("0.0447399")
+    data = None
+
+
+class Program_weight_tensor_parameter_208:
+    name = "parameter_208"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.19812")
+    max_val = float("0.473389")
+    mean = float("0.401014")
+    std = float("0.0414225")
+    data = None
+
+
+class Program_weight_tensor_parameter_209:
+    name = "parameter_209"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-2.01367")
+    max_val = float("1.22168")
+    mean = float("0.00702611")
+    std = float("0.141371")
+    data = None
+
+
+class Program_weight_tensor_parameter_210:
+    name = "parameter_210"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.210938")
+    max_val = float("1.79785")
+    mean = float("0.471035")
+    std = float("0.11155")
+    data = None
+
+
+class Program_weight_tensor_parameter_211:
+    name = "parameter_211"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.256592")
+    max_val = float("0.265381")
+    mean = float("0.00051839")
+    std = float("0.0699863")
+    data = None
+
+
+class Program_weight_tensor_parameter_212:
+    name = "parameter_212"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.864258")
+    max_val = float("6.08984")
+    mean = float("1.55439e-05")
+    std = float("0.0479069")
+    data = None
+
+
+class Program_weight_tensor_parameter_213:
+    name = "parameter_213"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.397705")
+    max_val = float("0.609863")
+    mean = float("-0.102438")
+    std = float("0.092124")
+    data = None
+
+
+class Program_weight_tensor_parameter_214:
+    name = "parameter_214"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.618652")
+    max_val = float("0.547363")
+    mean = float("-0.000260851")
+    std = float("0.0614206")
+    data = None
+
+
+class Program_weight_tensor_parameter_215:
+    name = "parameter_215"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.147949")
+    max_val = float("0.167358")
+    mean = float("0.000753449")
+    std = float("0.0484929")
+    data = None
+
+
+class Program_weight_tensor_parameter_216:
+    name = "parameter_216"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-1.68555")
+    max_val = float("0.493408")
+    mean = float("-9.99574e-06")
+    std = float("0.0503106")
+    data = None
+
+
+class Program_weight_tensor_parameter_217:
+    name = "parameter_217"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.106934")
+    max_val = float("0.193115")
+    mean = float("0.000729642")
+    std = float("0.0258956")
+    data = None
+
+
+class Program_weight_tensor_parameter_218:
+    name = "parameter_218"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.492188")
+    max_val = float("0.588867")
+    mean = float("0.000162088")
+    std = float("0.0484471")
+    data = None
+
+
+class Program_weight_tensor_parameter_219:
+    name = "parameter_219"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.00254059")
+    max_val = float("0.00301743")
+    mean = float("-8.58749e-06")
+    std = float("0.000581241")
+    data = None
+
+
+class Program_weight_tensor_parameter_220:
+    name = "parameter_220"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.936035")
+    max_val = float("0.812988")
+    mean = float("0.00015915")
+    std = float("0.0781999")
+    data = None
+
+
+class Program_weight_tensor_parameter_221:
+    name = "parameter_221"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.995605")
+    max_val = float("1.25781")
+    mean = float("-0.0191986")
+    std = float("0.398188")
+    data = None
+
+
+class Program_weight_tensor_parameter_222:
+    name = "parameter_222"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.508789")
+    max_val = float("0.615723")
+    mean = float("-0.000705008")
+    std = float("0.0774705")
+    data = None
+
+
+class Program_weight_tensor_parameter_223:
+    name = "parameter_223"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.376953")
+    max_val = float("0.784668")
+    mean = float("0.0181663")
+    std = float("0.0606264")
+    data = None
+
+
+class Program_weight_tensor_parameter_224:
+    name = "parameter_224"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.13501")
+    max_val = float("0.467285")
+    mean = float("0.408541")
+    std = float("0.0393503")
+    data = None
+
+
+class Program_weight_tensor_parameter_225:
+    name = "parameter_225"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-2.4043")
+    max_val = float("1.15332")
+    mean = float("0.0139082")
+    std = float("0.156624")
+    data = None
+
+
+class Program_weight_tensor_parameter_226:
+    name = "parameter_226"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.226196")
+    max_val = float("2.40625")
+    mean = float("0.489929")
+    std = float("0.13166")
+    data = None
+
+
+class Program_weight_tensor_parameter_227:
+    name = "parameter_227"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.392822")
+    max_val = float("0.665527")
+    mean = float("-0.000149415")
+    std = float("0.0778859")
+    data = None
+
+
+class Program_weight_tensor_parameter_228:
+    name = "parameter_228"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-1.51758")
+    max_val = float("6.30078")
+    mean = float("2.22621e-06")
+    std = float("0.0454506")
+    data = None
+
+
+class Program_weight_tensor_parameter_229:
+    name = "parameter_229"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.58252")
+    max_val = float("0.402832")
+    mean = float("-0.103758")
+    std = float("0.0886523")
+    data = None
+
+
+class Program_weight_tensor_parameter_230:
+    name = "parameter_230"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.598633")
+    max_val = float("0.502441")
+    mean = float("-0.000837495")
+    std = float("0.0597324")
+    data = None
+
+
+class Program_weight_tensor_parameter_231:
+    name = "parameter_231"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.202637")
+    max_val = float("0.200806")
+    mean = float("-0.000828136")
+    std = float("0.0597395")
+    data = None
+
+
+class Program_weight_tensor_parameter_232:
+    name = "parameter_232"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-2.21875")
+    max_val = float("0.460938")
+    mean = float("-3.24762e-05")
+    std = float("0.0442046")
+    data = None
+
+
+class Program_weight_tensor_parameter_233:
+    name = "parameter_233"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.320801")
+    max_val = float("0.339355")
+    mean = float("-0.0021041")
+    std = float("0.0456797")
+    data = None
+
+
+class Program_weight_tensor_parameter_234:
+    name = "parameter_234"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.42749")
+    max_val = float("0.499268")
+    mean = float("-0.000116")
+    std = float("0.0410461")
+    data = None
+
+
+class Program_weight_tensor_parameter_235:
+    name = "parameter_235"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.00375557")
+    max_val = float("0.00421143")
+    mean = float("-2.58039e-06")
+    std = float("0.000723744")
+    data = None
+
+
+class Program_weight_tensor_parameter_236:
+    name = "parameter_236"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.815918")
+    max_val = float("0.759766")
+    mean = float("-3.60192e-05")
+    std = float("0.0794539")
+    data = None
+
+
+class Program_weight_tensor_parameter_237:
+    name = "parameter_237"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.27734")
+    max_val = float("1.3623")
+    mean = float("0.00837527")
+    std = float("0.442717")
+    data = None
+
+
+class Program_weight_tensor_parameter_238:
+    name = "parameter_238"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.661621")
+    max_val = float("0.859375")
+    mean = float("0.000224048")
+    std = float("0.0775781")
+    data = None
+
+
+class Program_weight_tensor_parameter_239:
+    name = "parameter_239"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.49585")
+    max_val = float("0.665039")
+    mean = float("0.00910543")
+    std = float("0.0552938")
+    data = None
+
+
+class Program_weight_tensor_parameter_240:
+    name = "parameter_240"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0596313")
+    max_val = float("0.497314")
+    mean = float("0.40279")
+    std = float("0.0420985")
+    data = None
+
+
+class Program_weight_tensor_parameter_241:
+    name = "parameter_241"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.48145")
+    max_val = float("2.00977")
+    mean = float("0.0247173")
+    std = float("0.164284")
+    data = None
+
+
+class Program_weight_tensor_parameter_242:
+    name = "parameter_242"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.274414")
+    max_val = float("1.89258")
+    mean = float("0.48181")
+    std = float("0.108055")
+    data = None
+
+
+class Program_weight_tensor_parameter_243:
+    name = "parameter_243"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.424072")
+    max_val = float("0.342285")
+    mean = float("-0.000543796")
+    std = float("0.0890763")
+    data = None
+
+
+class Program_weight_tensor_parameter_244:
+    name = "parameter_244"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-10.75")
+    max_val = float("1.6543")
+    mean = float("3.3375e-06")
+    std = float("0.0448999")
+    data = None
+
+
+class Program_weight_tensor_parameter_245:
+    name = "parameter_245"
+    shape = [3072]
+    dtype = "float32"
+    min_val = float("-0.648438")
+    max_val = float("0.394531")
+    mean = float("-0.115854")
+    std = float("0.110271")
+    data = None
+
+
+class Program_weight_tensor_parameter_246:
+    name = "parameter_246"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-1.1084")
+    max_val = float("1.2793")
+    mean = float("-0.00122496")
+    std = float("0.0567686")
+    data = None
+
+
+class Program_weight_tensor_parameter_247:
+    name = "parameter_247"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.433838")
+    max_val = float("0.326904")
+    mean = float("0.000325684")
+    std = float("0.0948319")
+    data = None
+
+
+class Program_weight_tensor_parameter_248:
+    name = "parameter_248"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.937012")
+    max_val = float("0.84668")
+    mean = float("5.27927e-06")
+    std = float("0.0390663")
+    data = None
+
+
+class Program_weight_tensor_parameter_249:
+    name = "parameter_249"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.563965")
+    max_val = float("0.51123")
+    mean = float("-0.00231634")
+    std = float("0.0678634")
+    data = None
+
+
+class Program_weight_tensor_parameter_250:
+    name = "parameter_250"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.445801")
+    max_val = float("0.385498")
+    mean = float("-1.48992e-05")
+    std = float("0.036985")
+    data = None
+
+
+class Program_weight_tensor_parameter_251:
+    name = "parameter_251"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.0192108")
+    max_val = float("0.0244904")
+    mean = float("-0.000172688")
+    std = float("0.004512")
+    data = None
+
+
+class Program_weight_tensor_parameter_252:
+    name = "parameter_252"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.726074")
+    max_val = float("0.708008")
+    mean = float("9.19318e-05")
+    std = float("0.0796358")
+    data = None
+
+
+class Program_weight_tensor_parameter_253:
+    name = "parameter_253"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-1.06348")
+    max_val = float("1.08789")
+    mean = float("0.00030978")
+    std = float("0.337028")
+    data = None
+
+
+class Program_weight_tensor_parameter_254:
+    name = "parameter_254"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.79248")
+    max_val = float("1.35254")
+    mean = float("2.30758e-05")
+    std = float("0.0800477")
+    data = None
+
+
+class Program_weight_tensor_parameter_255:
+    name = "parameter_255"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("-0.530762")
+    max_val = float("0.297852")
+    mean = float("0.00128834")
+    std = float("0.0702966")
+    data = None
+
+
+class Program_weight_tensor_parameter_256:
+    name = "parameter_256"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("0.0639038")
+    max_val = float("0.825195")
+    mean = float("0.41862")
+    std = float("0.060744")
+    data = None
+
+
+class Program_weight_tensor_parameter_257:
+    name = "parameter_257"
+    shape = [1026, 768]
+    dtype = "float32"
+    min_val = float("-3.90039")
+    max_val = float("1.04004")
+    mean = float("-8.20941e-05")
+    std = float("0.0304671")
+    data = None
+
+
+class Program_weight_tensor_parameter_258:
+    name = "parameter_258"
+    shape = [50265, 768]
+    dtype = "float32"
+    min_val = float("-0.50293")
+    max_val = float("1.12109")
+    mean = float("-0.014526")
+    std = float("0.06534")
+    data = None
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-base/graph_net.json b/paddle_samples/PaddleNLP/chinese-xlnet-base/graph_net.json
new file mode 100644
index 000000000..d1fce9d18
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-base/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "chinese-xlnet-base",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-base/input_meta.py b/paddle_samples/PaddleNLP/chinese-xlnet-base/input_meta.py
new file mode 100644
index 000000000..9ea1655e0
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-base/input_meta.py
@@ -0,0 +1,19 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [19, 11684, 121, 15954, 2090, 21957, 1039, 4, 3]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 2]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1]
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-base/model.py b/paddle_samples/PaddleNLP/chinese-xlnet-base/model.py
new file mode 100644
index 000000000..f7be5aed5
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-base/model.py
@@ -0,0 +1,4369 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        parameter_131,
+        parameter_132,
+        parameter_133,
+        parameter_134,
+        parameter_135,
+        parameter_136,
+        parameter_137,
+        parameter_138,
+        parameter_139,
+        parameter_140,
+        parameter_141,
+        parameter_142,
+        parameter_143,
+        parameter_144,
+        parameter_145,
+        parameter_146,
+        parameter_147,
+        parameter_148,
+        parameter_149,
+        parameter_150,
+        parameter_151,
+        parameter_152,
+        parameter_153,
+        parameter_154,
+        parameter_155,
+        parameter_156,
+        parameter_157,
+        parameter_158,
+        parameter_159,
+        parameter_160,
+        parameter_161,
+        parameter_162,
+        parameter_163,
+        parameter_164,
+        parameter_165,
+        parameter_166,
+        parameter_167,
+        parameter_168,
+        parameter_169,
+        parameter_170,
+        parameter_171,
+        parameter_172,
+        parameter_173,
+        parameter_174,
+        parameter_175,
+        parameter_176,
+        parameter_177,
+        parameter_178,
+        parameter_179,
+        parameter_180,
+        parameter_181,
+        parameter_182,
+        parameter_183,
+        parameter_184,
+        parameter_185,
+        parameter_186,
+        parameter_187,
+        parameter_188,
+        parameter_189,
+        parameter_190,
+        parameter_191,
+        parameter_192,
+        parameter_193,
+        parameter_194,
+        parameter_195,
+        parameter_196,
+        parameter_197,
+        parameter_198,
+        parameter_199,
+        parameter_200,
+        parameter_201,
+        parameter_202,
+        parameter_203,
+        parameter_204,
+        parameter_205,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_1 = paddle._C_ops.transpose(data_0, [1, 0])
+        del data_0
+
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_2 = paddle._C_ops.transpose(data_1, [1, 0])
+        del data_1
+
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_3 = paddle._C_ops.transpose(data_2, [1, 0])
+        del data_2
+
+        # pd_op.cast: (9x1xf32) <- (9x1xi64)
+        cast_0 = paddle._C_ops.cast(transpose_3, paddle.float32)
+        del transpose_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (9x1xf32) <- (9x1xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [0]
+
+        # pd_op.unsqueeze: (1x9x1xf32) <- (9x1xf32, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(scale_0, full_int_array_0)
+        del scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.unsqueeze: (1x9x1x1xf32) <- (1x9x1xf32, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.full: (xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [], float("0"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.greater_than: (1x9x1x1xb) <- (1x9x1x1xf32, xf32)
+        greater_than_0 = paddle._C_ops.greater_than(unsqueeze_1, full_1)
+        del unsqueeze_1
+
+        # pd_op.cast: (1x9x1x1xf32) <- (1x9x1x1xb)
+        cast_1 = paddle._C_ops.cast(greater_than_0, paddle.float32)
+        del greater_than_0
+
+        # pd_op.full: (9xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [9], float("1"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.diag: (9x9xf32) <- (9xf32)
+        diag_0 = paddle._C_ops.diag(full_2, 0, float("0"))
+        del full_2
+
+        # pd_op.scale: (9x9xf32) <- (9x9xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(diag_0, full_0, float("0"), True)
+        del diag_0, full_0
+
+        # pd_op.cast: (9x9xf32) <- (9x9xf32)
+        cast_2 = paddle._C_ops.cast(scale_1, paddle.float32)
+        del scale_1
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_2 = [2, 3]
+
+        # pd_op.unsqueeze: (9x9x1x1xf32) <- (9x9xf32, 2xi64)
+        unsqueeze_2 = paddle._C_ops.unsqueeze(cast_2, full_int_array_2)
+        del cast_2, full_int_array_2
+
+        # pd_op.add: (9x9x1x1xf32) <- (1x9x1x1xf32, 9x9x1x1xf32)
+        add_0 = paddle._C_ops.add(cast_1, unsqueeze_2)
+        del cast_1, unsqueeze_2
+
+        # pd_op.greater_than: (9x9x1x1xb) <- (9x9x1x1xf32, xf32)
+        greater_than_1 = paddle._C_ops.greater_than(add_0, full_1)
+        del add_0, full_1
+
+        # pd_op.cast: (9x9x1x1xf32) <- (9x9x1x1xb)
+        cast_3 = paddle._C_ops.cast(greater_than_1, paddle.float32)
+        del greater_than_1
+
+        # pd_op.embedding: (9x1x768xf32) <- (9x1xi64, 32000x768xf32)
+        embedding_0 = paddle._C_ops.embedding(transpose_1, parameter_204, -1, False)
+        del parameter_204, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [1]
+
+        # pd_op.unsqueeze: (9x1x1xi64) <- (9x1xi64, 1xi64)
+        unsqueeze_3 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_3)
+
+        # pd_op.unsqueeze: (1x9x1xi64) <- (9x1xi64, 1xi64)
+        unsqueeze_4 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_0)
+        del full_int_array_0, transpose_2
+
+        # pd_op.not_equal: (9x9x1xb) <- (9x1x1xi64, 1x9x1xi64)
+        not_equal_0 = paddle._C_ops.not_equal(unsqueeze_3, unsqueeze_4)
+        del unsqueeze_3, unsqueeze_4
+
+        # pd_op.cast: (9x9x1xi64) <- (9x9x1xb)
+        cast_4 = paddle._C_ops.cast(not_equal_0, paddle.int64)
+        del not_equal_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("2"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.one_hot: (9x9x1x2xf32) <- (9x9x1xi64, 1xi32)
+        one_hot_0 = paddle._C_ops.one_hot(
+            cast_4 % paddle.cast(full_4, cast_4.dtype), full_4
+        )
+        del cast_4, full_4
+
+        # pd_op.cast: (9x9x1x2xf32) <- (9x9x1x2xf32)
+        cast_5 = paddle._C_ops.cast(one_hot_0, paddle.float32)
+        del one_hot_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("768"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_7 = paddle._C_ops.full(
+            [1], float("2"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (384xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_5, full_6, full_7, dtype="float32")
+        del full_6, full_7
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("0.00130208"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (384xf32) <- (384xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(arange_0, full_8, float("0"), True)
+        del arange_0, full_8
+
+        # pd_op.full: (384xf32) <- ()
+        full_9 = paddle._C_ops.full(
+            [384],
+            float("10000"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.elementwise_pow: (384xf32) <- (384xf32, 384xf32)
+        elementwise_pow_0 = paddle._C_ops.elementwise_pow(full_9, scale_2)
+        del full_9, scale_2
+
+        # pd_op.full: (384xf32) <- ()
+        full_10 = paddle._C_ops.full(
+            [384],
+            float("1"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.divide: (384xf32) <- (384xf32, 384xf32)
+        divide_0 = paddle._C_ops.divide(full_10, elementwise_pow_0)
+        del elementwise_pow_0, full_10
+
+        # pd_op.full: (1xf64) <- ()
+        full_11 = paddle._C_ops.full(
+            [1], float("9"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_12 = paddle._C_ops.full(
+            [1], float("-9"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_13 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (18xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_1 = paddle.arange(full_11, full_12, full_13, dtype="float32")
+        del full_12, full_13
+
+        # builtin.combine: ([18xf32, 384xf32]) <- (18xf32, 384xf32)
+        combine_0 = [arange_1, divide_0]
+        del arange_1, divide_0
+
+        # pd_op.einsum: (18x384xf32, [0xf32, 0xf32], [18xf32, 384xf32]) <- ([18xf32, 384xf32])
+        einsum_0, einsum_1, einsum_2 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_0, "i,d->id"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_0
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_0,
+            split_1,
+        ) = einsum_1
+        del einsum_1
+
+        # builtin.split: (18xf32, 384xf32) <- ([18xf32, 384xf32])
+        (
+            split_2,
+            split_3,
+        ) = einsum_2
+        del einsum_2
+
+        # pd_op.sin: (18x384xf32) <- (18x384xf32)
+        sin_0 = paddle._C_ops.sin(einsum_0)
+
+        # pd_op.cos: (18x384xf32) <- (18x384xf32)
+        cos_0 = paddle._C_ops.cos(einsum_0)
+        del einsum_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_14 = paddle._C_ops.full(
+            [1], float("-1"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # builtin.combine: ([18x384xf32, 18x384xf32]) <- (18x384xf32, 18x384xf32)
+        combine_1 = [sin_0, cos_0]
+        del cos_0, sin_0
+
+        # pd_op.concat: (18x768xf32) <- ([18x384xf32, 18x384xf32], 1xi32)
+        concat_0 = paddle._C_ops.concat(combine_1, full_14)
+        del combine_1, full_14
+
+        # pd_op.unsqueeze: (18x1x768xf32) <- (18x768xf32, 1xi64)
+        unsqueeze_5 = paddle._C_ops.unsqueeze(concat_0, full_int_array_3)
+        del concat_0
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_4 = [-1, 1, -1]
+
+        # pd_op.expand: (18x1x768xf32) <- (18x1x768xf32, 3xi64)
+        expand_0 = paddle._C_ops.expand(unsqueeze_5, full_int_array_4)
+        del full_int_array_4, unsqueeze_5
+
+        # pd_op.dropout: (18x1x768xf32, 18x1x768xui8) <- (18x1x768xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                expand_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del expand_0
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_203, False, False)
+        del parameter_203
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_5 = [9, 1, 12, 64]
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(matmul_0, full_int_array_5)
+        del matmul_0
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(dropout_0, parameter_202, False, False)
+        del parameter_202
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(matmul_1, full_int_array_5)
+        del matmul_1
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(dropout_0, parameter_201, False, False)
+        del parameter_201
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(matmul_2, full_int_array_5)
+        del matmul_2
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_3 = paddle._C_ops.matmul(dropout_2, parameter_199, False, False)
+        del parameter_199
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_6 = [18, -1, 12, 64]
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_3 = paddle._C_ops.reshape(matmul_3, full_int_array_6)
+        del matmul_3
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_1 = paddle._C_ops.add(reshape_0, parameter_196)
+        del parameter_196
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_2 = [add_1, reshape_1]
+        del add_1, reshape_1
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_3, einsum_4, einsum_5 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_2, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_2
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_4,
+            split_5,
+        ) = einsum_4
+        del einsum_4
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_6,
+            split_7,
+        ) = einsum_5
+        del einsum_5
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_2 = paddle._C_ops.add(reshape_0, parameter_198)
+        del parameter_198
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_3 = [add_2, reshape_3]
+        del add_2, reshape_3
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_6, einsum_7, einsum_8 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_3, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_3
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_8,
+            split_9,
+        ) = einsum_7
+        del einsum_7
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_10,
+            split_11,
+        ) = einsum_8
+        del einsum_8
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_7 = [1, 12, 18, 9]
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(einsum_6, full_int_array_7)
+        del einsum_6
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_8 = [2147483647]
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            reshape_4, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_4
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_9 = [1, 12, 9, 17]
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(slice_0, full_int_array_9)
+        del slice_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_15 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (9xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_2 = paddle.arange(full_5, full_11, full_15, dtype="int64")
+        del full_11, full_15, full_5
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_0 = paddle._C_ops.index_select(reshape_5, arange_2, 3)
+        del reshape_5
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_3 = paddle._C_ops.add(reshape_0, parameter_197)
+        del parameter_197, reshape_0
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_4 = [add_3, parameter_195]
+        del add_3, parameter_195
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_9, einsum_10, einsum_11 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_4, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_4
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_12,
+            split_13,
+        ) = einsum_10
+        del einsum_10
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_14,
+            split_15,
+        ) = einsum_11
+        del einsum_11
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_5 = [cast_5, einsum_9]
+        del einsum_9
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_12, einsum_13, einsum_14 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_5, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_5
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_16,
+            split_17,
+        ) = einsum_13
+        del einsum_13
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_18,
+            split_19,
+        ) = einsum_14
+        del einsum_14
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_4 = paddle._C_ops.add(einsum_3, index_select_0)
+        del einsum_3, index_select_0
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_5 = paddle._C_ops.add(add_4, einsum_12)
+        del add_4, einsum_12
+
+        # pd_op.full: (1xf32) <- ()
+        full_16 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(add_5, full_16, float("0"), True)
+        del add_5
+
+        # pd_op.transpose: (1x1x9x9xf32) <- (9x9x1x1xf32)
+        transpose_4 = paddle._C_ops.transpose(cast_3, [2, 3, 0, 1])
+        del cast_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_17 = paddle._C_ops.full(
+            [1], float("1e+30"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x9x9xf32) <- (1x1x9x9xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(transpose_4, full_17, float("0"), True)
+        del full_17, transpose_4
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_0 = paddle._C_ops.subtract(scale_3, scale_4)
+        del scale_3
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_0 = paddle._C_ops.softmax(subtract_0, 3)
+        del subtract_0
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_6 = [dropout_4, reshape_2]
+        del dropout_4, reshape_2
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_15, einsum_16, einsum_17 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_6, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_6
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_20,
+            split_21,
+        ) = einsum_16
+        del einsum_16
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_22,
+            split_23,
+        ) = einsum_17
+        del einsum_17
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_10 = [9, 1, 768]
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_6 = paddle._C_ops.reshape(einsum_15, full_int_array_10)
+        del einsum_15
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_7 = [reshape_6, parameter_200]
+        del parameter_200, reshape_6
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_18, einsum_19, einsum_20 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_7, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_7
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_24,
+            split_25,
+        ) = einsum_19
+        del einsum_19
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_26,
+            split_27,
+        ) = einsum_20
+        del einsum_20
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_18
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_6 = paddle._C_ops.add(dropout_6, dropout_0)
+        del dropout_0, dropout_6
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_6, parameter_194, parameter_193, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_6, parameter_193, parameter_194
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_4 = paddle._C_ops.matmul(layer_norm_0, parameter_190, False, False)
+        del parameter_190
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_7 = paddle._C_ops.add(matmul_4, parameter_189)
+        del matmul_4, parameter_189
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_0 = paddle._C_ops.relu(add_7)
+        del add_7
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_0
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_8, parameter_188, False, False)
+        del dropout_8, parameter_188
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_8 = paddle._C_ops.add(matmul_5, parameter_187)
+        del matmul_5, parameter_187
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_8
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_9 = paddle._C_ops.add(dropout_10, layer_norm_0)
+        del dropout_10, layer_norm_0
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_9, parameter_192, parameter_191, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_9, parameter_191, parameter_192
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_6 = paddle._C_ops.matmul(layer_norm_3, parameter_186, False, False)
+        del parameter_186
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_7 = paddle._C_ops.reshape(matmul_6, full_int_array_5)
+        del matmul_6
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_185, False, False)
+        del parameter_185
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(matmul_7, full_int_array_5)
+        del matmul_7
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_8 = paddle._C_ops.matmul(layer_norm_3, parameter_184, False, False)
+        del parameter_184
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(matmul_8, full_int_array_5)
+        del matmul_8
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(dropout_2, parameter_182, False, False)
+        del parameter_182
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(matmul_9, full_int_array_6)
+        del matmul_9
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_10 = paddle._C_ops.add(reshape_7, parameter_179)
+        del parameter_179
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_8 = [add_10, reshape_8]
+        del add_10, reshape_8
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_21, einsum_22, einsum_23 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_8, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_8
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_28,
+            split_29,
+        ) = einsum_22
+        del einsum_22
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_30,
+            split_31,
+        ) = einsum_23
+        del einsum_23
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_11 = paddle._C_ops.add(reshape_7, parameter_181)
+        del parameter_181
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_9 = [add_11, reshape_10]
+        del add_11, reshape_10
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_24, einsum_25, einsum_26 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_9, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_9
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_32,
+            split_33,
+        ) = einsum_25
+        del einsum_25
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_34,
+            split_35,
+        ) = einsum_26
+        del einsum_26
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_11 = paddle._C_ops.reshape(einsum_24, full_int_array_7)
+        del einsum_24
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            reshape_11, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_11
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(slice_1, full_int_array_9)
+        del slice_1
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_1 = paddle._C_ops.index_select(reshape_12, arange_2, 3)
+        del reshape_12
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_12 = paddle._C_ops.add(reshape_7, parameter_180)
+        del parameter_180, reshape_7
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_10 = [add_12, parameter_178]
+        del add_12, parameter_178
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_27, einsum_28, einsum_29 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_10, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_10
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_36,
+            split_37,
+        ) = einsum_28
+        del einsum_28
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_38,
+            split_39,
+        ) = einsum_29
+        del einsum_29
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_11 = [cast_5, einsum_27]
+        del einsum_27
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_30, einsum_31, einsum_32 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_11, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_11
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_40,
+            split_41,
+        ) = einsum_31
+        del einsum_31
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_42,
+            split_43,
+        ) = einsum_32
+        del einsum_32
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_13 = paddle._C_ops.add(einsum_21, index_select_1)
+        del einsum_21, index_select_1
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_14 = paddle._C_ops.add(add_13, einsum_30)
+        del add_13, einsum_30
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(add_14, full_16, float("0"), True)
+        del add_14
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_1 = paddle._C_ops.subtract(scale_5, scale_4)
+        del scale_5
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_1 = paddle._C_ops.softmax(subtract_1, 3)
+        del subtract_1
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_12 = [dropout_12, reshape_9]
+        del dropout_12, reshape_9
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_33, einsum_34, einsum_35 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_12, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_12
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_44,
+            split_45,
+        ) = einsum_34
+        del einsum_34
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_46,
+            split_47,
+        ) = einsum_35
+        del einsum_35
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_13 = paddle._C_ops.reshape(einsum_33, full_int_array_10)
+        del einsum_33
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_13 = [reshape_13, parameter_183]
+        del parameter_183, reshape_13
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_36, einsum_37, einsum_38 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_13, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_13
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_48,
+            split_49,
+        ) = einsum_37
+        del einsum_37
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_50,
+            split_51,
+        ) = einsum_38
+        del einsum_38
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_36, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_36
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_15 = paddle._C_ops.add(dropout_14, layer_norm_3)
+        del dropout_14, layer_norm_3
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_15, parameter_177, parameter_176, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_15, parameter_176, parameter_177
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_173, False, False)
+        del parameter_173
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_16 = paddle._C_ops.add(matmul_10, parameter_172)
+        del matmul_10, parameter_172
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_1 = paddle._C_ops.relu(add_16)
+        del add_16
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_1
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_11 = paddle._C_ops.matmul(dropout_16, parameter_171, False, False)
+        del dropout_16, parameter_171
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_17 = paddle._C_ops.add(matmul_11, parameter_170)
+        del matmul_11, parameter_170
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_17
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_18 = paddle._C_ops.add(dropout_18, layer_norm_6)
+        del dropout_18, layer_norm_6
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_175, parameter_174, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18, parameter_174, parameter_175
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_12 = paddle._C_ops.matmul(layer_norm_9, parameter_169, False, False)
+        del parameter_169
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(matmul_12, full_int_array_5)
+        del matmul_12
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_13 = paddle._C_ops.matmul(layer_norm_9, parameter_168, False, False)
+        del parameter_168
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_15 = paddle._C_ops.reshape(matmul_13, full_int_array_5)
+        del matmul_13
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_14 = paddle._C_ops.matmul(layer_norm_9, parameter_167, False, False)
+        del parameter_167
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(matmul_14, full_int_array_5)
+        del matmul_14
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_15 = paddle._C_ops.matmul(dropout_2, parameter_165, False, False)
+        del parameter_165
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(matmul_15, full_int_array_6)
+        del matmul_15
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_19 = paddle._C_ops.add(reshape_14, parameter_162)
+        del parameter_162
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_14 = [add_19, reshape_15]
+        del add_19, reshape_15
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_39, einsum_40, einsum_41 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_14, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_14
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_52,
+            split_53,
+        ) = einsum_40
+        del einsum_40
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_54,
+            split_55,
+        ) = einsum_41
+        del einsum_41
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_20 = paddle._C_ops.add(reshape_14, parameter_164)
+        del parameter_164
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_15 = [add_20, reshape_17]
+        del add_20, reshape_17
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_42, einsum_43, einsum_44 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_15, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_15
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_56,
+            split_57,
+        ) = einsum_43
+        del einsum_43
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_58,
+            split_59,
+        ) = einsum_44
+        del einsum_44
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(einsum_42, full_int_array_7)
+        del einsum_42
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_2 = paddle._C_ops.slice(
+            reshape_18, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_18
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_19 = paddle._C_ops.reshape(slice_2, full_int_array_9)
+        del slice_2
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_2 = paddle._C_ops.index_select(reshape_19, arange_2, 3)
+        del reshape_19
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_21 = paddle._C_ops.add(reshape_14, parameter_163)
+        del parameter_163, reshape_14
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_16 = [add_21, parameter_161]
+        del add_21, parameter_161
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_45, einsum_46, einsum_47 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_16, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_16
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_60,
+            split_61,
+        ) = einsum_46
+        del einsum_46
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_62,
+            split_63,
+        ) = einsum_47
+        del einsum_47
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_17 = [cast_5, einsum_45]
+        del einsum_45
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_48, einsum_49, einsum_50 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_17, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_17
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_64,
+            split_65,
+        ) = einsum_49
+        del einsum_49
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_66,
+            split_67,
+        ) = einsum_50
+        del einsum_50
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_22 = paddle._C_ops.add(einsum_39, index_select_2)
+        del einsum_39, index_select_2
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_23 = paddle._C_ops.add(add_22, einsum_48)
+        del add_22, einsum_48
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(add_23, full_16, float("0"), True)
+        del add_23
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_2 = paddle._C_ops.subtract(scale_6, scale_4)
+        del scale_6
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_2 = paddle._C_ops.softmax(subtract_2, 3)
+        del subtract_2
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_18 = [dropout_20, reshape_16]
+        del dropout_20, reshape_16
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_51, einsum_52, einsum_53 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_18, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_18
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_68,
+            split_69,
+        ) = einsum_52
+        del einsum_52
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_70,
+            split_71,
+        ) = einsum_53
+        del einsum_53
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_20 = paddle._C_ops.reshape(einsum_51, full_int_array_10)
+        del einsum_51
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_19 = [reshape_20, parameter_166]
+        del parameter_166, reshape_20
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_54, einsum_55, einsum_56 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_19, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_19
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_72,
+            split_73,
+        ) = einsum_55
+        del einsum_55
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_74,
+            split_75,
+        ) = einsum_56
+        del einsum_56
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_54, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_54
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_24 = paddle._C_ops.add(dropout_22, layer_norm_9)
+        del dropout_22, layer_norm_9
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_24, parameter_160, parameter_159, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_24, parameter_159, parameter_160
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_16 = paddle._C_ops.matmul(layer_norm_12, parameter_156, False, False)
+        del parameter_156
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_25 = paddle._C_ops.add(matmul_16, parameter_155)
+        del matmul_16, parameter_155
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_2 = paddle._C_ops.relu(add_25)
+        del add_25
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_2
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_17 = paddle._C_ops.matmul(dropout_24, parameter_154, False, False)
+        del dropout_24, parameter_154
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_26 = paddle._C_ops.add(matmul_17, parameter_153)
+        del matmul_17, parameter_153
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_26, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_26
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_27 = paddle._C_ops.add(dropout_26, layer_norm_12)
+        del dropout_26, layer_norm_12
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_27, parameter_158, parameter_157, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_27, parameter_157, parameter_158
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_15, parameter_152, False, False)
+        del parameter_152
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(matmul_18, full_int_array_5)
+        del matmul_18
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_15, parameter_151, False, False)
+        del parameter_151
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(matmul_19, full_int_array_5)
+        del matmul_19
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_20 = paddle._C_ops.matmul(layer_norm_15, parameter_150, False, False)
+        del parameter_150
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_23 = paddle._C_ops.reshape(matmul_20, full_int_array_5)
+        del matmul_20
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_2, parameter_148, False, False)
+        del parameter_148
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(matmul_21, full_int_array_6)
+        del matmul_21
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_28 = paddle._C_ops.add(reshape_21, parameter_145)
+        del parameter_145
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_20 = [add_28, reshape_22]
+        del add_28, reshape_22
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_57, einsum_58, einsum_59 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_20, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_20
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_76,
+            split_77,
+        ) = einsum_58
+        del einsum_58
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_78,
+            split_79,
+        ) = einsum_59
+        del einsum_59
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_29 = paddle._C_ops.add(reshape_21, parameter_147)
+        del parameter_147
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_21 = [add_29, reshape_24]
+        del add_29, reshape_24
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_60, einsum_61, einsum_62 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_21, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_21
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_80,
+            split_81,
+        ) = einsum_61
+        del einsum_61
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_82,
+            split_83,
+        ) = einsum_62
+        del einsum_62
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(einsum_60, full_int_array_7)
+        del einsum_60
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_3 = paddle._C_ops.slice(
+            reshape_25, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_25
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(slice_3, full_int_array_9)
+        del slice_3
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_3 = paddle._C_ops.index_select(reshape_26, arange_2, 3)
+        del reshape_26
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_30 = paddle._C_ops.add(reshape_21, parameter_146)
+        del parameter_146, reshape_21
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_22 = [add_30, parameter_144]
+        del add_30, parameter_144
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_63, einsum_64, einsum_65 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_22, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_22
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_84,
+            split_85,
+        ) = einsum_64
+        del einsum_64
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_86,
+            split_87,
+        ) = einsum_65
+        del einsum_65
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_23 = [cast_5, einsum_63]
+        del einsum_63
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_66, einsum_67, einsum_68 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_23, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_23
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_88,
+            split_89,
+        ) = einsum_67
+        del einsum_67
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_90,
+            split_91,
+        ) = einsum_68
+        del einsum_68
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_31 = paddle._C_ops.add(einsum_57, index_select_3)
+        del einsum_57, index_select_3
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_32 = paddle._C_ops.add(add_31, einsum_66)
+        del add_31, einsum_66
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(add_32, full_16, float("0"), True)
+        del add_32
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_3 = paddle._C_ops.subtract(scale_7, scale_4)
+        del scale_7
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_3 = paddle._C_ops.softmax(subtract_3, 3)
+        del subtract_3
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_24 = [dropout_28, reshape_23]
+        del dropout_28, reshape_23
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_69, einsum_70, einsum_71 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_24, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_24
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_92,
+            split_93,
+        ) = einsum_70
+        del einsum_70
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_94,
+            split_95,
+        ) = einsum_71
+        del einsum_71
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(einsum_69, full_int_array_10)
+        del einsum_69
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_25 = [reshape_27, parameter_149]
+        del parameter_149, reshape_27
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_72, einsum_73, einsum_74 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_25, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_25
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_96,
+            split_97,
+        ) = einsum_73
+        del einsum_73
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_98,
+            split_99,
+        ) = einsum_74
+        del einsum_74
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_72, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_72
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_33 = paddle._C_ops.add(dropout_30, layer_norm_15)
+        del dropout_30, layer_norm_15
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_33, parameter_143, parameter_142, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_33, parameter_142, parameter_143
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_22 = paddle._C_ops.matmul(layer_norm_18, parameter_139, False, False)
+        del parameter_139
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_34 = paddle._C_ops.add(matmul_22, parameter_138)
+        del matmul_22, parameter_138
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_3 = paddle._C_ops.relu(add_34)
+        del add_34
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_3
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_23 = paddle._C_ops.matmul(dropout_32, parameter_137, False, False)
+        del dropout_32, parameter_137
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_35 = paddle._C_ops.add(matmul_23, parameter_136)
+        del matmul_23, parameter_136
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_35, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_35
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_36 = paddle._C_ops.add(dropout_34, layer_norm_18)
+        del dropout_34, layer_norm_18
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_36, parameter_141, parameter_140, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_36, parameter_140, parameter_141
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_24 = paddle._C_ops.matmul(layer_norm_21, parameter_135, False, False)
+        del parameter_135
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(matmul_24, full_int_array_5)
+        del matmul_24
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_21, parameter_134, False, False)
+        del parameter_134
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(matmul_25, full_int_array_5)
+        del matmul_25
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_21, parameter_133, False, False)
+        del parameter_133
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(matmul_26, full_int_array_5)
+        del matmul_26
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_27 = paddle._C_ops.matmul(dropout_2, parameter_131, False, False)
+        del parameter_131
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_31 = paddle._C_ops.reshape(matmul_27, full_int_array_6)
+        del matmul_27
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_37 = paddle._C_ops.add(reshape_28, parameter_128)
+        del parameter_128
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_26 = [add_37, reshape_29]
+        del add_37, reshape_29
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_75, einsum_76, einsum_77 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_26, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_26
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_100,
+            split_101,
+        ) = einsum_76
+        del einsum_76
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_102,
+            split_103,
+        ) = einsum_77
+        del einsum_77
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_38 = paddle._C_ops.add(reshape_28, parameter_130)
+        del parameter_130
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_27 = [add_38, reshape_31]
+        del add_38, reshape_31
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_78, einsum_79, einsum_80 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_27, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_27
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_104,
+            split_105,
+        ) = einsum_79
+        del einsum_79
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_106,
+            split_107,
+        ) = einsum_80
+        del einsum_80
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(einsum_78, full_int_array_7)
+        del einsum_78
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_4 = paddle._C_ops.slice(
+            reshape_32, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_32
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(slice_4, full_int_array_9)
+        del slice_4
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_4 = paddle._C_ops.index_select(reshape_33, arange_2, 3)
+        del reshape_33
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_39 = paddle._C_ops.add(reshape_28, parameter_129)
+        del parameter_129, reshape_28
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_28 = [add_39, parameter_127]
+        del add_39, parameter_127
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_81, einsum_82, einsum_83 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_28, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_28
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_108,
+            split_109,
+        ) = einsum_82
+        del einsum_82
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_110,
+            split_111,
+        ) = einsum_83
+        del einsum_83
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_29 = [cast_5, einsum_81]
+        del einsum_81
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_84, einsum_85, einsum_86 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_29, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_29
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_112,
+            split_113,
+        ) = einsum_85
+        del einsum_85
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_114,
+            split_115,
+        ) = einsum_86
+        del einsum_86
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_40 = paddle._C_ops.add(einsum_75, index_select_4)
+        del einsum_75, index_select_4
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_41 = paddle._C_ops.add(add_40, einsum_84)
+        del add_40, einsum_84
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(add_41, full_16, float("0"), True)
+        del add_41
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_4 = paddle._C_ops.subtract(scale_8, scale_4)
+        del scale_8
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_4 = paddle._C_ops.softmax(subtract_4, 3)
+        del subtract_4
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_30 = [dropout_36, reshape_30]
+        del dropout_36, reshape_30
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_87, einsum_88, einsum_89 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_30, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_30
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_116,
+            split_117,
+        ) = einsum_88
+        del einsum_88
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_118,
+            split_119,
+        ) = einsum_89
+        del einsum_89
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_34 = paddle._C_ops.reshape(einsum_87, full_int_array_10)
+        del einsum_87
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_31 = [reshape_34, parameter_132]
+        del parameter_132, reshape_34
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_90, einsum_91, einsum_92 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_31, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_31
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_120,
+            split_121,
+        ) = einsum_91
+        del einsum_91
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_122,
+            split_123,
+        ) = einsum_92
+        del einsum_92
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_90, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_90
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_42 = paddle._C_ops.add(dropout_38, layer_norm_21)
+        del dropout_38, layer_norm_21
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_126, parameter_125, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42, parameter_125, parameter_126
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_28 = paddle._C_ops.matmul(layer_norm_24, parameter_122, False, False)
+        del parameter_122
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_43 = paddle._C_ops.add(matmul_28, parameter_121)
+        del matmul_28, parameter_121
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_4 = paddle._C_ops.relu(add_43)
+        del add_43
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_4
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_40, parameter_120, False, False)
+        del dropout_40, parameter_120
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_44 = paddle._C_ops.add(matmul_29, parameter_119)
+        del matmul_29, parameter_119
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_44, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_44
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_45 = paddle._C_ops.add(dropout_42, layer_norm_24)
+        del dropout_42, layer_norm_24
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_45, parameter_124, parameter_123, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_45, parameter_123, parameter_124
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_30 = paddle._C_ops.matmul(layer_norm_27, parameter_118, False, False)
+        del parameter_118
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_35 = paddle._C_ops.reshape(matmul_30, full_int_array_5)
+        del matmul_30
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_27, parameter_117, False, False)
+        del parameter_117
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(matmul_31, full_int_array_5)
+        del matmul_31
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_32 = paddle._C_ops.matmul(layer_norm_27, parameter_116, False, False)
+        del parameter_116
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(matmul_32, full_int_array_5)
+        del matmul_32
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(dropout_2, parameter_114, False, False)
+        del parameter_114
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(matmul_33, full_int_array_6)
+        del matmul_33
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_46 = paddle._C_ops.add(reshape_35, parameter_111)
+        del parameter_111
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_32 = [add_46, reshape_36]
+        del add_46, reshape_36
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_93, einsum_94, einsum_95 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_32, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_32
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_124,
+            split_125,
+        ) = einsum_94
+        del einsum_94
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_126,
+            split_127,
+        ) = einsum_95
+        del einsum_95
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_47 = paddle._C_ops.add(reshape_35, parameter_113)
+        del parameter_113
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_33 = [add_47, reshape_38]
+        del add_47, reshape_38
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_96, einsum_97, einsum_98 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_33, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_33
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_128,
+            split_129,
+        ) = einsum_97
+        del einsum_97
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_130,
+            split_131,
+        ) = einsum_98
+        del einsum_98
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_39 = paddle._C_ops.reshape(einsum_96, full_int_array_7)
+        del einsum_96
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_5 = paddle._C_ops.slice(
+            reshape_39, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_39
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(slice_5, full_int_array_9)
+        del slice_5
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_5 = paddle._C_ops.index_select(reshape_40, arange_2, 3)
+        del reshape_40
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_48 = paddle._C_ops.add(reshape_35, parameter_112)
+        del parameter_112, reshape_35
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_34 = [add_48, parameter_110]
+        del add_48, parameter_110
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_99, einsum_100, einsum_101 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_34, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_34
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_132,
+            split_133,
+        ) = einsum_100
+        del einsum_100
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_134,
+            split_135,
+        ) = einsum_101
+        del einsum_101
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_35 = [cast_5, einsum_99]
+        del einsum_99
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_102, einsum_103, einsum_104 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_35, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_35
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_136,
+            split_137,
+        ) = einsum_103
+        del einsum_103
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_138,
+            split_139,
+        ) = einsum_104
+        del einsum_104
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_49 = paddle._C_ops.add(einsum_93, index_select_5)
+        del einsum_93, index_select_5
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_50 = paddle._C_ops.add(add_49, einsum_102)
+        del add_49, einsum_102
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(add_50, full_16, float("0"), True)
+        del add_50
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_5 = paddle._C_ops.subtract(scale_9, scale_4)
+        del scale_9
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_5 = paddle._C_ops.softmax(subtract_5, 3)
+        del subtract_5
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_36 = [dropout_44, reshape_37]
+        del dropout_44, reshape_37
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_105, einsum_106, einsum_107 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_36, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_36
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_140,
+            split_141,
+        ) = einsum_106
+        del einsum_106
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_142,
+            split_143,
+        ) = einsum_107
+        del einsum_107
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_41 = paddle._C_ops.reshape(einsum_105, full_int_array_10)
+        del einsum_105
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_37 = [reshape_41, parameter_115]
+        del parameter_115, reshape_41
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_108, einsum_109, einsum_110 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_37, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_37
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_144,
+            split_145,
+        ) = einsum_109
+        del einsum_109
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_146,
+            split_147,
+        ) = einsum_110
+        del einsum_110
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_108, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_108
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_51 = paddle._C_ops.add(dropout_46, layer_norm_27)
+        del dropout_46, layer_norm_27
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_51, parameter_109, parameter_108, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_51, parameter_108, parameter_109
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_30, parameter_105, False, False)
+        del parameter_105
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_52 = paddle._C_ops.add(matmul_34, parameter_104)
+        del matmul_34, parameter_104
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_5 = paddle._C_ops.relu(add_52)
+        del add_52
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_5
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_35 = paddle._C_ops.matmul(dropout_48, parameter_103, False, False)
+        del dropout_48, parameter_103
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_53 = paddle._C_ops.add(matmul_35, parameter_102)
+        del matmul_35, parameter_102
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_53, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_53
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_54 = paddle._C_ops.add(dropout_50, layer_norm_30)
+        del dropout_50, layer_norm_30
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_54, parameter_107, parameter_106, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_54, parameter_106, parameter_107
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_36 = paddle._C_ops.matmul(layer_norm_33, parameter_101, False, False)
+        del parameter_101
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(matmul_36, full_int_array_5)
+        del matmul_36
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_37 = paddle._C_ops.matmul(layer_norm_33, parameter_100, False, False)
+        del parameter_100
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_43 = paddle._C_ops.reshape(matmul_37, full_int_array_5)
+        del matmul_37
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_38 = paddle._C_ops.matmul(layer_norm_33, parameter_99, False, False)
+        del parameter_99
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(matmul_38, full_int_array_5)
+        del matmul_38
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_39 = paddle._C_ops.matmul(dropout_2, parameter_97, False, False)
+        del parameter_97
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(matmul_39, full_int_array_6)
+        del matmul_39
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_55 = paddle._C_ops.add(reshape_42, parameter_94)
+        del parameter_94
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_38 = [add_55, reshape_43]
+        del add_55, reshape_43
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_111, einsum_112, einsum_113 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_38, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_38
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_148,
+            split_149,
+        ) = einsum_112
+        del einsum_112
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_150,
+            split_151,
+        ) = einsum_113
+        del einsum_113
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_56 = paddle._C_ops.add(reshape_42, parameter_96)
+        del parameter_96
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_39 = [add_56, reshape_45]
+        del add_56, reshape_45
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_114, einsum_115, einsum_116 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_39, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_39
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_152,
+            split_153,
+        ) = einsum_115
+        del einsum_115
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_154,
+            split_155,
+        ) = einsum_116
+        del einsum_116
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(einsum_114, full_int_array_7)
+        del einsum_114
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_6 = paddle._C_ops.slice(
+            reshape_46, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_46
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_47 = paddle._C_ops.reshape(slice_6, full_int_array_9)
+        del slice_6
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_6 = paddle._C_ops.index_select(reshape_47, arange_2, 3)
+        del reshape_47
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_57 = paddle._C_ops.add(reshape_42, parameter_95)
+        del parameter_95, reshape_42
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_40 = [add_57, parameter_93]
+        del add_57, parameter_93
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_117, einsum_118, einsum_119 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_40, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_40
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_156,
+            split_157,
+        ) = einsum_118
+        del einsum_118
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_158,
+            split_159,
+        ) = einsum_119
+        del einsum_119
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_41 = [cast_5, einsum_117]
+        del einsum_117
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_120, einsum_121, einsum_122 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_41, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_41
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_160,
+            split_161,
+        ) = einsum_121
+        del einsum_121
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_162,
+            split_163,
+        ) = einsum_122
+        del einsum_122
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_58 = paddle._C_ops.add(einsum_111, index_select_6)
+        del einsum_111, index_select_6
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_59 = paddle._C_ops.add(add_58, einsum_120)
+        del add_58, einsum_120
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(add_59, full_16, float("0"), True)
+        del add_59
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_6 = paddle._C_ops.subtract(scale_10, scale_4)
+        del scale_10
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_6 = paddle._C_ops.softmax(subtract_6, 3)
+        del subtract_6
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_42 = [dropout_52, reshape_44]
+        del dropout_52, reshape_44
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_123, einsum_124, einsum_125 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_42, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_42
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_164,
+            split_165,
+        ) = einsum_124
+        del einsum_124
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_166,
+            split_167,
+        ) = einsum_125
+        del einsum_125
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_48 = paddle._C_ops.reshape(einsum_123, full_int_array_10)
+        del einsum_123
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_43 = [reshape_48, parameter_98]
+        del parameter_98, reshape_48
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_126, einsum_127, einsum_128 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_43, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_43
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_168,
+            split_169,
+        ) = einsum_127
+        del einsum_127
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_170,
+            split_171,
+        ) = einsum_128
+        del einsum_128
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_126, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_126
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_60 = paddle._C_ops.add(dropout_54, layer_norm_33)
+        del dropout_54, layer_norm_33
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_60, parameter_92, parameter_91, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_60, parameter_91, parameter_92
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_40 = paddle._C_ops.matmul(layer_norm_36, parameter_88, False, False)
+        del parameter_88
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_61 = paddle._C_ops.add(matmul_40, parameter_87)
+        del matmul_40, parameter_87
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_6 = paddle._C_ops.relu(add_61)
+        del add_61
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_6
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_41 = paddle._C_ops.matmul(dropout_56, parameter_86, False, False)
+        del dropout_56, parameter_86
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_62 = paddle._C_ops.add(matmul_41, parameter_85)
+        del matmul_41, parameter_85
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_62, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_62
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_63 = paddle._C_ops.add(dropout_58, layer_norm_36)
+        del dropout_58, layer_norm_36
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_63, parameter_90, parameter_89, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_63, parameter_89, parameter_90
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_39, parameter_84, False, False)
+        del parameter_84
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_49 = paddle._C_ops.reshape(matmul_42, full_int_array_5)
+        del matmul_42
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_39, parameter_83, False, False)
+        del parameter_83
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(matmul_43, full_int_array_5)
+        del matmul_43
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_44 = paddle._C_ops.matmul(layer_norm_39, parameter_82, False, False)
+        del parameter_82
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_51 = paddle._C_ops.reshape(matmul_44, full_int_array_5)
+        del matmul_44
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_2, parameter_80, False, False)
+        del parameter_80
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(matmul_45, full_int_array_6)
+        del matmul_45
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_64 = paddle._C_ops.add(reshape_49, parameter_77)
+        del parameter_77
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_44 = [add_64, reshape_50]
+        del add_64, reshape_50
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_129, einsum_130, einsum_131 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_44, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_44
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_172,
+            split_173,
+        ) = einsum_130
+        del einsum_130
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_174,
+            split_175,
+        ) = einsum_131
+        del einsum_131
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_65 = paddle._C_ops.add(reshape_49, parameter_79)
+        del parameter_79
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_45 = [add_65, reshape_52]
+        del add_65, reshape_52
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_132, einsum_133, einsum_134 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_45, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_45
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_176,
+            split_177,
+        ) = einsum_133
+        del einsum_133
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_178,
+            split_179,
+        ) = einsum_134
+        del einsum_134
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_53 = paddle._C_ops.reshape(einsum_132, full_int_array_7)
+        del einsum_132
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_7 = paddle._C_ops.slice(
+            reshape_53, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_53
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(slice_7, full_int_array_9)
+        del slice_7
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_7 = paddle._C_ops.index_select(reshape_54, arange_2, 3)
+        del reshape_54
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_66 = paddle._C_ops.add(reshape_49, parameter_78)
+        del parameter_78, reshape_49
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_46 = [add_66, parameter_76]
+        del add_66, parameter_76
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_135, einsum_136, einsum_137 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_46, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_46
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_180,
+            split_181,
+        ) = einsum_136
+        del einsum_136
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_182,
+            split_183,
+        ) = einsum_137
+        del einsum_137
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_47 = [cast_5, einsum_135]
+        del einsum_135
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_138, einsum_139, einsum_140 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_47, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_47
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_184,
+            split_185,
+        ) = einsum_139
+        del einsum_139
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_186,
+            split_187,
+        ) = einsum_140
+        del einsum_140
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_67 = paddle._C_ops.add(einsum_129, index_select_7)
+        del einsum_129, index_select_7
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_68 = paddle._C_ops.add(add_67, einsum_138)
+        del add_67, einsum_138
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(add_68, full_16, float("0"), True)
+        del add_68
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_7 = paddle._C_ops.subtract(scale_11, scale_4)
+        del scale_11
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_7 = paddle._C_ops.softmax(subtract_7, 3)
+        del subtract_7
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_48 = [dropout_60, reshape_51]
+        del dropout_60, reshape_51
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_141, einsum_142, einsum_143 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_48, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_48
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_188,
+            split_189,
+        ) = einsum_142
+        del einsum_142
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_190,
+            split_191,
+        ) = einsum_143
+        del einsum_143
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_55 = paddle._C_ops.reshape(einsum_141, full_int_array_10)
+        del einsum_141
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_49 = [reshape_55, parameter_81]
+        del parameter_81, reshape_55
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_144, einsum_145, einsum_146 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_49, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_49
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_192,
+            split_193,
+        ) = einsum_145
+        del einsum_145
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_194,
+            split_195,
+        ) = einsum_146
+        del einsum_146
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_144, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_144
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_69 = paddle._C_ops.add(dropout_62, layer_norm_39)
+        del dropout_62, layer_norm_39
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_69, parameter_75, parameter_74, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_69, parameter_74, parameter_75
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_46 = paddle._C_ops.matmul(layer_norm_42, parameter_71, False, False)
+        del parameter_71
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_70 = paddle._C_ops.add(matmul_46, parameter_70)
+        del matmul_46, parameter_70
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_7 = paddle._C_ops.relu(add_70)
+        del add_70
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_7
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_47 = paddle._C_ops.matmul(dropout_64, parameter_69, False, False)
+        del dropout_64, parameter_69
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_71 = paddle._C_ops.add(matmul_47, parameter_68)
+        del matmul_47, parameter_68
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_71, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_71
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_72 = paddle._C_ops.add(dropout_66, layer_norm_42)
+        del dropout_66, layer_norm_42
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_72, parameter_73, parameter_72, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_72, parameter_72, parameter_73
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_48 = paddle._C_ops.matmul(layer_norm_45, parameter_67, False, False)
+        del parameter_67
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(matmul_48, full_int_array_5)
+        del matmul_48
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_45, parameter_66, False, False)
+        del parameter_66
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_57 = paddle._C_ops.reshape(matmul_49, full_int_array_5)
+        del matmul_49
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_45, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(matmul_50, full_int_array_5)
+        del matmul_50
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_51 = paddle._C_ops.matmul(dropout_2, parameter_63, False, False)
+        del parameter_63
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_59 = paddle._C_ops.reshape(matmul_51, full_int_array_6)
+        del matmul_51
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_73 = paddle._C_ops.add(reshape_56, parameter_60)
+        del parameter_60
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_50 = [add_73, reshape_57]
+        del add_73, reshape_57
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_147, einsum_148, einsum_149 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_50, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_50
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_196,
+            split_197,
+        ) = einsum_148
+        del einsum_148
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_198,
+            split_199,
+        ) = einsum_149
+        del einsum_149
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_74 = paddle._C_ops.add(reshape_56, parameter_62)
+        del parameter_62
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_51 = [add_74, reshape_59]
+        del add_74, reshape_59
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_150, einsum_151, einsum_152 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_51, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_51
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_200,
+            split_201,
+        ) = einsum_151
+        del einsum_151
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_202,
+            split_203,
+        ) = einsum_152
+        del einsum_152
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(einsum_150, full_int_array_7)
+        del einsum_150
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_8 = paddle._C_ops.slice(
+            reshape_60, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_60
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_61 = paddle._C_ops.reshape(slice_8, full_int_array_9)
+        del slice_8
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_8 = paddle._C_ops.index_select(reshape_61, arange_2, 3)
+        del reshape_61
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_75 = paddle._C_ops.add(reshape_56, parameter_61)
+        del parameter_61, reshape_56
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_52 = [add_75, parameter_59]
+        del add_75, parameter_59
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_153, einsum_154, einsum_155 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_52, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_52
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_204,
+            split_205,
+        ) = einsum_154
+        del einsum_154
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_206,
+            split_207,
+        ) = einsum_155
+        del einsum_155
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_53 = [cast_5, einsum_153]
+        del einsum_153
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_156, einsum_157, einsum_158 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_53, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_53
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_208,
+            split_209,
+        ) = einsum_157
+        del einsum_157
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_210,
+            split_211,
+        ) = einsum_158
+        del einsum_158
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_76 = paddle._C_ops.add(einsum_147, index_select_8)
+        del einsum_147, index_select_8
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_77 = paddle._C_ops.add(add_76, einsum_156)
+        del add_76, einsum_156
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(add_77, full_16, float("0"), True)
+        del add_77
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_8 = paddle._C_ops.subtract(scale_12, scale_4)
+        del scale_12
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_8 = paddle._C_ops.softmax(subtract_8, 3)
+        del subtract_8
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_54 = [dropout_68, reshape_58]
+        del dropout_68, reshape_58
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_159, einsum_160, einsum_161 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_54, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_54
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_212,
+            split_213,
+        ) = einsum_160
+        del einsum_160
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_214,
+            split_215,
+        ) = einsum_161
+        del einsum_161
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_62 = paddle._C_ops.reshape(einsum_159, full_int_array_10)
+        del einsum_159
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_55 = [reshape_62, parameter_64]
+        del parameter_64, reshape_62
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_162, einsum_163, einsum_164 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_55, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_55
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_216,
+            split_217,
+        ) = einsum_163
+        del einsum_163
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_218,
+            split_219,
+        ) = einsum_164
+        del einsum_164
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_162, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_162
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_78 = paddle._C_ops.add(dropout_70, layer_norm_45)
+        del dropout_70, layer_norm_45
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_78, parameter_58, parameter_57, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_78, parameter_57, parameter_58
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_52 = paddle._C_ops.matmul(layer_norm_48, parameter_54, False, False)
+        del parameter_54
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_79 = paddle._C_ops.add(matmul_52, parameter_53)
+        del matmul_52, parameter_53
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_8 = paddle._C_ops.relu(add_79)
+        del add_79
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_8
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_72, parameter_52, False, False)
+        del dropout_72, parameter_52
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_80 = paddle._C_ops.add(matmul_53, parameter_51)
+        del matmul_53, parameter_51
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_80, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_80
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_81 = paddle._C_ops.add(dropout_74, layer_norm_48)
+        del dropout_74, layer_norm_48
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_81, parameter_56, parameter_55, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_81, parameter_55, parameter_56
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(layer_norm_51, parameter_50, False, False)
+        del parameter_50
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_63 = paddle._C_ops.reshape(matmul_54, full_int_array_5)
+        del matmul_54
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_51, parameter_49, False, False)
+        del parameter_49
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(matmul_55, full_int_array_5)
+        del matmul_55
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_56 = paddle._C_ops.matmul(layer_norm_51, parameter_48, False, False)
+        del parameter_48
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_65 = paddle._C_ops.reshape(matmul_56, full_int_array_5)
+        del matmul_56
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_57 = paddle._C_ops.matmul(dropout_2, parameter_46, False, False)
+        del parameter_46
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(matmul_57, full_int_array_6)
+        del matmul_57
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_82 = paddle._C_ops.add(reshape_63, parameter_43)
+        del parameter_43
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_56 = [add_82, reshape_64]
+        del add_82, reshape_64
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_165, einsum_166, einsum_167 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_56, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_56
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_220,
+            split_221,
+        ) = einsum_166
+        del einsum_166
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_222,
+            split_223,
+        ) = einsum_167
+        del einsum_167
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_83 = paddle._C_ops.add(reshape_63, parameter_45)
+        del parameter_45
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_57 = [add_83, reshape_66]
+        del add_83, reshape_66
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_168, einsum_169, einsum_170 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_57, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_57
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_224,
+            split_225,
+        ) = einsum_169
+        del einsum_169
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_226,
+            split_227,
+        ) = einsum_170
+        del einsum_170
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_67 = paddle._C_ops.reshape(einsum_168, full_int_array_7)
+        del einsum_168
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_9 = paddle._C_ops.slice(
+            reshape_67, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_67
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(slice_9, full_int_array_9)
+        del slice_9
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_9 = paddle._C_ops.index_select(reshape_68, arange_2, 3)
+        del reshape_68
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_84 = paddle._C_ops.add(reshape_63, parameter_44)
+        del parameter_44, reshape_63
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_58 = [add_84, parameter_42]
+        del add_84, parameter_42
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_171, einsum_172, einsum_173 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_58, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_58
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_228,
+            split_229,
+        ) = einsum_172
+        del einsum_172
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_230,
+            split_231,
+        ) = einsum_173
+        del einsum_173
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_59 = [cast_5, einsum_171]
+        del einsum_171
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_174, einsum_175, einsum_176 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_59, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_59
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_232,
+            split_233,
+        ) = einsum_175
+        del einsum_175
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_234,
+            split_235,
+        ) = einsum_176
+        del einsum_176
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_85 = paddle._C_ops.add(einsum_165, index_select_9)
+        del einsum_165, index_select_9
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_86 = paddle._C_ops.add(add_85, einsum_174)
+        del add_85, einsum_174
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(add_86, full_16, float("0"), True)
+        del add_86
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_9 = paddle._C_ops.subtract(scale_13, scale_4)
+        del scale_13
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_9 = paddle._C_ops.softmax(subtract_9, 3)
+        del subtract_9
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_60 = [dropout_76, reshape_65]
+        del dropout_76, reshape_65
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_177, einsum_178, einsum_179 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_60, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_60
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_236,
+            split_237,
+        ) = einsum_178
+        del einsum_178
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_238,
+            split_239,
+        ) = einsum_179
+        del einsum_179
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_69 = paddle._C_ops.reshape(einsum_177, full_int_array_10)
+        del einsum_177
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_61 = [reshape_69, parameter_47]
+        del parameter_47, reshape_69
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_180, einsum_181, einsum_182 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_61, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_61
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_240,
+            split_241,
+        ) = einsum_181
+        del einsum_181
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_242,
+            split_243,
+        ) = einsum_182
+        del einsum_182
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_180, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_180
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_87 = paddle._C_ops.add(dropout_78, layer_norm_51)
+        del dropout_78, layer_norm_51
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_87, parameter_41, parameter_40, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_87, parameter_40, parameter_41
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_54, parameter_37, False, False)
+        del parameter_37
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_88 = paddle._C_ops.add(matmul_58, parameter_36)
+        del matmul_58, parameter_36
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_9 = paddle._C_ops.relu(add_88)
+        del add_88
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_9
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_59 = paddle._C_ops.matmul(dropout_80, parameter_35, False, False)
+        del dropout_80, parameter_35
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_89 = paddle._C_ops.add(matmul_59, parameter_34)
+        del matmul_59, parameter_34
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_89, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_89
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_90 = paddle._C_ops.add(dropout_82, layer_norm_54)
+        del dropout_82, layer_norm_54
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_90, parameter_39, parameter_38, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_90, parameter_38, parameter_39
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_60 = paddle._C_ops.matmul(layer_norm_57, parameter_33, False, False)
+        del parameter_33
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(matmul_60, full_int_array_5)
+        del matmul_60
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_61 = paddle._C_ops.matmul(layer_norm_57, parameter_32, False, False)
+        del parameter_32
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_71 = paddle._C_ops.reshape(matmul_61, full_int_array_5)
+        del matmul_61
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(layer_norm_57, parameter_31, False, False)
+        del parameter_31
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_72 = paddle._C_ops.reshape(matmul_62, full_int_array_5)
+        del matmul_62
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_63 = paddle._C_ops.matmul(dropout_2, parameter_29, False, False)
+        del parameter_29
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_73 = paddle._C_ops.reshape(matmul_63, full_int_array_6)
+        del matmul_63
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_91 = paddle._C_ops.add(reshape_70, parameter_26)
+        del parameter_26
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_62 = [add_91, reshape_71]
+        del add_91, reshape_71
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_183, einsum_184, einsum_185 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_62, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_62
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_244,
+            split_245,
+        ) = einsum_184
+        del einsum_184
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_246,
+            split_247,
+        ) = einsum_185
+        del einsum_185
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_92 = paddle._C_ops.add(reshape_70, parameter_28)
+        del parameter_28
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_63 = [add_92, reshape_73]
+        del add_92, reshape_73
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_186, einsum_187, einsum_188 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_63, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_63
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_248,
+            split_249,
+        ) = einsum_187
+        del einsum_187
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_250,
+            split_251,
+        ) = einsum_188
+        del einsum_188
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_74 = paddle._C_ops.reshape(einsum_186, full_int_array_7)
+        del einsum_186
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_10 = paddle._C_ops.slice(
+            reshape_74, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_74
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_75 = paddle._C_ops.reshape(slice_10, full_int_array_9)
+        del slice_10
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_10 = paddle._C_ops.index_select(reshape_75, arange_2, 3)
+        del reshape_75
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_93 = paddle._C_ops.add(reshape_70, parameter_27)
+        del parameter_27, reshape_70
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_64 = [add_93, parameter_25]
+        del add_93, parameter_25
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_189, einsum_190, einsum_191 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_64, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_64
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_252,
+            split_253,
+        ) = einsum_190
+        del einsum_190
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_254,
+            split_255,
+        ) = einsum_191
+        del einsum_191
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_65 = [cast_5, einsum_189]
+        del einsum_189
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_192, einsum_193, einsum_194 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_65, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_65
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_256,
+            split_257,
+        ) = einsum_193
+        del einsum_193
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_258,
+            split_259,
+        ) = einsum_194
+        del einsum_194
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_94 = paddle._C_ops.add(einsum_183, index_select_10)
+        del einsum_183, index_select_10
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_95 = paddle._C_ops.add(add_94, einsum_192)
+        del add_94, einsum_192
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(add_95, full_16, float("0"), True)
+        del add_95
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_10 = paddle._C_ops.subtract(scale_14, scale_4)
+        del scale_14
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_10 = paddle._C_ops.softmax(subtract_10, 3)
+        del subtract_10
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_66 = [dropout_84, reshape_72]
+        del dropout_84, reshape_72
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_195, einsum_196, einsum_197 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_66, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_66
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_260,
+            split_261,
+        ) = einsum_196
+        del einsum_196
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_262,
+            split_263,
+        ) = einsum_197
+        del einsum_197
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_76 = paddle._C_ops.reshape(einsum_195, full_int_array_10)
+        del einsum_195
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_67 = [reshape_76, parameter_30]
+        del parameter_30, reshape_76
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_198, einsum_199, einsum_200 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_67, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_67
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_264,
+            split_265,
+        ) = einsum_199
+        del einsum_199
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_266,
+            split_267,
+        ) = einsum_200
+        del einsum_200
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_198, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_198
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_96 = paddle._C_ops.add(dropout_86, layer_norm_57)
+        del dropout_86, layer_norm_57
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_96, parameter_24, parameter_23, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_96, parameter_23, parameter_24
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_64 = paddle._C_ops.matmul(layer_norm_60, parameter_20, False, False)
+        del parameter_20
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_97 = paddle._C_ops.add(matmul_64, parameter_19)
+        del matmul_64, parameter_19
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_10 = paddle._C_ops.relu(add_97)
+        del add_97
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_10
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_65 = paddle._C_ops.matmul(dropout_88, parameter_18, False, False)
+        del dropout_88, parameter_18
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_98 = paddle._C_ops.add(matmul_65, parameter_17)
+        del matmul_65, parameter_17
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_98, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_98
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_99 = paddle._C_ops.add(dropout_90, layer_norm_60)
+        del dropout_90, layer_norm_60
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_99, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_99, parameter_21, parameter_22
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_63, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_77 = paddle._C_ops.reshape(matmul_66, full_int_array_5)
+        del matmul_66
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_63, parameter_15, False, False)
+        del parameter_15
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_78 = paddle._C_ops.reshape(matmul_67, full_int_array_5)
+        del matmul_67
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_68 = paddle._C_ops.matmul(layer_norm_63, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_79 = paddle._C_ops.reshape(matmul_68, full_int_array_5)
+        del full_int_array_5, matmul_68
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_69 = paddle._C_ops.matmul(dropout_2, parameter_12, False, False)
+        del dropout_2, parameter_12
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_80 = paddle._C_ops.reshape(matmul_69, full_int_array_6)
+        del full_int_array_6, matmul_69
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_100 = paddle._C_ops.add(reshape_77, parameter_9)
+        del parameter_9
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_68 = [add_100, reshape_78]
+        del add_100, reshape_78
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_201, einsum_202, einsum_203 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_68, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_68
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_268,
+            split_269,
+        ) = einsum_202
+        del einsum_202
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_270,
+            split_271,
+        ) = einsum_203
+        del einsum_203
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_101 = paddle._C_ops.add(reshape_77, parameter_11)
+        del parameter_11
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_69 = [add_101, reshape_80]
+        del add_101, reshape_80
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_204, einsum_205, einsum_206 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_69, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_69
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_272,
+            split_273,
+        ) = einsum_205
+        del einsum_205
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_274,
+            split_275,
+        ) = einsum_206
+        del einsum_206
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_81 = paddle._C_ops.reshape(einsum_204, full_int_array_7)
+        del einsum_204, full_int_array_7
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_11 = paddle._C_ops.slice(
+            reshape_81, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del full_int_array_3, full_int_array_8, reshape_81
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_82 = paddle._C_ops.reshape(slice_11, full_int_array_9)
+        del full_int_array_9, slice_11
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_11 = paddle._C_ops.index_select(reshape_82, arange_2, 3)
+        del arange_2, reshape_82
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_102 = paddle._C_ops.add(reshape_77, parameter_10)
+        del parameter_10, reshape_77
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_70 = [add_102, parameter_8]
+        del add_102, parameter_8
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_207, einsum_208, einsum_209 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_70, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_70
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_276,
+            split_277,
+        ) = einsum_208
+        del einsum_208
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_278,
+            split_279,
+        ) = einsum_209
+        del einsum_209
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_71 = [cast_5, einsum_207]
+        del cast_5, einsum_207
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_210, einsum_211, einsum_212 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_71, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_71
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_280,
+            split_281,
+        ) = einsum_211
+        del einsum_211
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_282,
+            split_283,
+        ) = einsum_212
+        del einsum_212
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_103 = paddle._C_ops.add(einsum_201, index_select_11)
+        del einsum_201, index_select_11
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_104 = paddle._C_ops.add(add_103, einsum_210)
+        del add_103, einsum_210
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(add_104, full_16, float("0"), True)
+        del add_104, full_16
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_11 = paddle._C_ops.subtract(scale_15, scale_4)
+        del scale_15, scale_4
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_11 = paddle._C_ops.softmax(subtract_11, 3)
+        del subtract_11
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_72 = [dropout_92, reshape_79]
+        del dropout_92, reshape_79
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_213, einsum_214, einsum_215 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_72, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_72
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_284,
+            split_285,
+        ) = einsum_214
+        del einsum_214
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_286,
+            split_287,
+        ) = einsum_215
+        del einsum_215
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_83 = paddle._C_ops.reshape(einsum_213, full_int_array_10)
+        del einsum_213, full_int_array_10
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_73 = [reshape_83, parameter_13]
+        del parameter_13, reshape_83
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_216, einsum_217, einsum_218 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_73, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_73
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_288,
+            split_289,
+        ) = einsum_217
+        del einsum_217
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_290,
+            split_291,
+        ) = einsum_218
+        del einsum_218
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_216, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_216
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_105 = paddle._C_ops.add(dropout_94, layer_norm_63)
+        del dropout_94, layer_norm_63
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_105, parameter_7, parameter_6, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_105, parameter_6, parameter_7
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_70 = paddle._C_ops.matmul(layer_norm_66, parameter_3, False, False)
+        del parameter_3
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_106 = paddle._C_ops.add(matmul_70, parameter_2)
+        del matmul_70, parameter_2
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_11 = paddle._C_ops.relu(add_106)
+        del add_106
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_11
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_71 = paddle._C_ops.matmul(dropout_96, parameter_1, False, False)
+        del dropout_96, parameter_1
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_107 = paddle._C_ops.add(matmul_71, parameter_0)
+        del matmul_71, parameter_0
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_107, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_107
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_108 = paddle._C_ops.add(dropout_98, layer_norm_66)
+        del dropout_98, layer_norm_66
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_108, parameter_5, parameter_4, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_108, parameter_4, parameter_5
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_69, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del full_3, layer_norm_69
+
+        # pd_op.transpose: (1x9x768xf32) <- (9x1x768xf32)
+        transpose_0 = paddle._C_ops.transpose(dropout_100, [1, 0, 2])
+        del dropout_100
+
+        return transpose_0
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-base/weight_meta.py b/paddle_samples/PaddleNLP/chinese-xlnet-base/weight_meta.py
new file mode 100644
index 000000000..5dbd9da9b
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-base/weight_meta.py
@@ -0,0 +1,2048 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0998714")
+    max_val = float("0.0984303")
+    mean = float("-3.03961e-06")
+    std = float("0.0200015")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.101221")
+    max_val = float("0.0987751")
+    mean = float("1.26591e-05")
+    std = float("0.0200075")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.056897")
+    max_val = float("0.0570167")
+    mean = float("-0.00052722")
+    std = float("0.0200559")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0533698")
+    max_val = float("0.0668244")
+    mean = float("0.00017133")
+    std = float("0.0196393")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0652753")
+    max_val = float("0.0590219")
+    mean = float("0.000199135")
+    std = float("0.0198334")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0638812")
+    max_val = float("0.0620904")
+    mean = float("-0.00144631")
+    std = float("0.0201184")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0939125")
+    max_val = float("0.0880784")
+    mean = float("-6.9476e-06")
+    std = float("0.0200321")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0923562")
+    max_val = float("0.0888936")
+    mean = float("-2.9202e-06")
+    std = float("0.0199792")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101688")
+    max_val = float("0.0916663")
+    mean = float("-2.96884e-05")
+    std = float("0.0199948")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0873907")
+    max_val = float("0.104748")
+    mean = float("3.4997e-05")
+    std = float("0.020015")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.092884")
+    max_val = float("0.0915055")
+    mean = float("1.42978e-05")
+    std = float("0.0199916")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0989158")
+    max_val = float("0.0984304")
+    mean = float("-4.01166e-06")
+    std = float("0.0199911")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0962958")
+    max_val = float("0.105661")
+    mean = float("1.3769e-06")
+    std = float("0.02")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.063664")
+    max_val = float("0.0613667")
+    mean = float("0.000547421")
+    std = float("0.0200497")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0671897")
+    max_val = float("0.0783652")
+    mean = float("0.000622001")
+    std = float("0.0202513")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0725683")
+    max_val = float("0.0650216")
+    mean = float("-0.000784367")
+    std = float("0.0203172")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0685982")
+    max_val = float("0.0590483")
+    mean = float("-0.000134032")
+    std = float("0.020622")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0935638")
+    max_val = float("0.0981755")
+    mean = float("-1.56042e-05")
+    std = float("0.0200175")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0918844")
+    max_val = float("0.0967983")
+    mean = float("5.58553e-05")
+    std = float("0.0200067")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0910037")
+    max_val = float("0.104207")
+    mean = float("6.23767e-06")
+    std = float("0.0199933")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0933517")
+    max_val = float("0.0979902")
+    mean = float("2.63145e-05")
+    std = float("0.0200088")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0989605")
+    max_val = float("0.100781")
+    mean = float("-5.33705e-05")
+    std = float("0.020009")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.10758")
+    max_val = float("0.10164")
+    mean = float("-1.782e-05")
+    std = float("0.0199951")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.101564")
+    max_val = float("0.105164")
+    mean = float("3.94919e-06")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0743405")
+    max_val = float("0.0580922")
+    mean = float("-0.000553093")
+    std = float("0.0199521")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0554053")
+    max_val = float("0.056885")
+    mean = float("0.00102951")
+    std = float("0.0197574")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0579967")
+    max_val = float("0.080138")
+    mean = float("0.000286156")
+    std = float("0.0195506")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0635722")
+    max_val = float("0.054928")
+    mean = float("-0.00117185")
+    std = float("0.0204126")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.097754")
+    max_val = float("0.100356")
+    mean = float("1.25839e-05")
+    std = float("0.0200035")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0978946")
+    max_val = float("0.0969981")
+    mean = float("3.99889e-06")
+    std = float("0.0200023")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0894704")
+    max_val = float("0.0943186")
+    mean = float("-1.14581e-05")
+    std = float("0.0200001")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0902368")
+    max_val = float("0.0955309")
+    mean = float("-1.5804e-06")
+    std = float("0.0200062")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0894124")
+    max_val = float("0.0937951")
+    mean = float("-6.2392e-05")
+    std = float("0.0200184")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0979337")
+    max_val = float("0.0947495")
+    mean = float("9.3209e-06")
+    std = float("0.0199896")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0956268")
+    max_val = float("0.102042")
+    mean = float("-3.11307e-06")
+    std = float("0.0199991")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0600446")
+    max_val = float("0.0551249")
+    mean = float("0.000668725")
+    std = float("0.0200145")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0603542")
+    max_val = float("0.0639203")
+    mean = float("0.000876608")
+    std = float("0.0203053")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0648046")
+    max_val = float("0.0670915")
+    mean = float("5.73312e-05")
+    std = float("0.0199847")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0718373")
+    max_val = float("0.0617058")
+    mean = float("9.69534e-05")
+    std = float("0.0200235")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0998177")
+    max_val = float("0.100939")
+    mean = float("-2.60804e-05")
+    std = float("0.0200047")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0948367")
+    max_val = float("0.0902295")
+    mean = float("5.64852e-05")
+    std = float("0.0200064")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0975177")
+    max_val = float("0.0945639")
+    mean = float("3.33007e-05")
+    std = float("0.0200165")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0932207")
+    max_val = float("0.0939244")
+    mean = float("-3.06693e-05")
+    std = float("0.0199697")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0910764")
+    max_val = float("0.0979006")
+    mean = float("3.5443e-05")
+    std = float("0.0199895")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.10246")
+    max_val = float("0.102432")
+    mean = float("2.51644e-05")
+    std = float("0.0200064")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0979288")
+    max_val = float("0.110267")
+    mean = float("-1.96634e-06")
+    std = float("0.0200013")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0639689")
+    max_val = float("0.0547493")
+    mean = float("-0.0010852")
+    std = float("0.0195674")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0639137")
+    max_val = float("0.0712681")
+    mean = float("0.000938638")
+    std = float("0.0206526")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0617112")
+    max_val = float("0.0581551")
+    mean = float("-0.000829282")
+    std = float("0.0186566")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0720919")
+    max_val = float("0.0569111")
+    mean = float("0.000933793")
+    std = float("0.0197778")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0901267")
+    max_val = float("0.0963059")
+    mean = float("-3.99305e-05")
+    std = float("0.0200038")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0919104")
+    max_val = float("0.0971055")
+    mean = float("2.00371e-05")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0979838")
+    max_val = float("0.0884718")
+    mean = float("-3.46436e-05")
+    std = float("0.0199821")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.10039")
+    max_val = float("0.0876718")
+    mean = float("-1.1278e-05")
+    std = float("0.0199789")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0914442")
+    max_val = float("0.111778")
+    mean = float("-2.99018e-05")
+    std = float("0.0200296")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0993673")
+    max_val = float("0.0954033")
+    mean = float("2.0312e-05")
+    std = float("0.0200069")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.109231")
+    max_val = float("0.104778")
+    mean = float("-1.63055e-06")
+    std = float("0.0199936")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0646925")
+    max_val = float("0.0661256")
+    mean = float("8.01721e-05")
+    std = float("0.0203713")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0627554")
+    max_val = float("0.0622741")
+    mean = float("0.000668161")
+    std = float("0.0203228")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0649528")
+    max_val = float("0.0575164")
+    mean = float("-0.00103082")
+    std = float("0.0206422")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0618572")
+    max_val = float("0.0741325")
+    mean = float("4.17254e-05")
+    std = float("0.0201355")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0940977")
+    max_val = float("0.0906086")
+    mean = float("2.22085e-05")
+    std = float("0.0199948")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0898018")
+    max_val = float("0.0921712")
+    mean = float("-7.91858e-06")
+    std = float("0.0200158")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0930985")
+    max_val = float("0.0904702")
+    mean = float("1.22401e-05")
+    std = float("0.0200171")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0916543")
+    max_val = float("0.0928527")
+    mean = float("-2.07254e-05")
+    std = float("0.0200222")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.091229")
+    max_val = float("0.0979338")
+    mean = float("-2.48638e-05")
+    std = float("0.0199802")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0977989")
+    max_val = float("0.0964952")
+    mean = float("-1.39183e-05")
+    std = float("0.0200033")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0993583")
+    max_val = float("0.0992722")
+    mean = float("-5.68789e-06")
+    std = float("0.0200234")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0626282")
+    max_val = float("0.0639193")
+    mean = float("-0.000134828")
+    std = float("0.0202296")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0610922")
+    max_val = float("0.08835")
+    mean = float("0.000520116")
+    std = float("0.0209154")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0630409")
+    max_val = float("0.0590075")
+    mean = float("0.000529107")
+    std = float("0.0189061")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0639899")
+    max_val = float("0.0549799")
+    mean = float("-0.000347124")
+    std = float("0.0196525")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0921856")
+    max_val = float("0.106383")
+    mean = float("-1.21788e-05")
+    std = float("0.0200174")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0951978")
+    max_val = float("0.0910537")
+    mean = float("-1.91198e-05")
+    std = float("0.0199829")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0934649")
+    max_val = float("0.0918414")
+    mean = float("2.66845e-05")
+    std = float("0.0200047")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0982122")
+    max_val = float("0.0951821")
+    mean = float("-1.94681e-05")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100677")
+    max_val = float("0.095596")
+    mean = float("4.64555e-05")
+    std = float("0.0200081")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.100304")
+    max_val = float("0.0992358")
+    mean = float("-4.44139e-06")
+    std = float("0.0200026")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0976562")
+    max_val = float("0.0991958")
+    mean = float("-2.74978e-05")
+    std = float("0.0199844")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.066919")
+    max_val = float("0.0784602")
+    mean = float("0.000554993")
+    std = float("0.0202607")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0579942")
+    max_val = float("0.0650711")
+    mean = float("-0.000921763")
+    std = float("0.0193981")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0578699")
+    max_val = float("0.0777063")
+    mean = float("0.000276614")
+    std = float("0.0196959")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0910947")
+    max_val = float("0.0641465")
+    mean = float("-0.000487076")
+    std = float("0.0202169")
+    data = None
+
+
+class Program_weight_tensor_parameter_131:
+    name = "parameter_131"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0952865")
+    max_val = float("0.0953403")
+    mean = float("-3.82647e-05")
+    std = float("0.0199912")
+    data = None
+
+
+class Program_weight_tensor_parameter_132:
+    name = "parameter_132"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0938449")
+    max_val = float("0.0938034")
+    mean = float("-5.93409e-06")
+    std = float("0.0199985")
+    data = None
+
+
+class Program_weight_tensor_parameter_133:
+    name = "parameter_133"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0966507")
+    max_val = float("0.101359")
+    mean = float("-4.50886e-05")
+    std = float("0.0200011")
+    data = None
+
+
+class Program_weight_tensor_parameter_134:
+    name = "parameter_134"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0974317")
+    max_val = float("0.088512")
+    mean = float("-1.53379e-05")
+    std = float("0.0199917")
+    data = None
+
+
+class Program_weight_tensor_parameter_135:
+    name = "parameter_135"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0947583")
+    max_val = float("0.097753")
+    mean = float("-3.77429e-05")
+    std = float("0.0199982")
+    data = None
+
+
+class Program_weight_tensor_parameter_136:
+    name = "parameter_136"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_137:
+    name = "parameter_137"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0966239")
+    max_val = float("0.0946495")
+    mean = float("7.38891e-06")
+    std = float("0.0200077")
+    data = None
+
+
+class Program_weight_tensor_parameter_138:
+    name = "parameter_138"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_139:
+    name = "parameter_139"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0990434")
+    max_val = float("0.0986182")
+    mean = float("1.60806e-06")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_140:
+    name = "parameter_140"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_141:
+    name = "parameter_141"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_142:
+    name = "parameter_142"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_143:
+    name = "parameter_143"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_144:
+    name = "parameter_144"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0660286")
+    max_val = float("0.0705801")
+    mean = float("-0.000426975")
+    std = float("0.0202835")
+    data = None
+
+
+class Program_weight_tensor_parameter_145:
+    name = "parameter_145"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0546884")
+    max_val = float("0.0597706")
+    mean = float("8.27615e-05")
+    std = float("0.01934")
+    data = None
+
+
+class Program_weight_tensor_parameter_146:
+    name = "parameter_146"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0521603")
+    max_val = float("0.0543209")
+    mean = float("0.000320002")
+    std = float("0.0189905")
+    data = None
+
+
+class Program_weight_tensor_parameter_147:
+    name = "parameter_147"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.051644")
+    max_val = float("0.0690596")
+    mean = float("-5.32648e-05")
+    std = float("0.0192358")
+    data = None
+
+
+class Program_weight_tensor_parameter_148:
+    name = "parameter_148"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101252")
+    max_val = float("0.0949966")
+    mean = float("-1.39793e-05")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_149:
+    name = "parameter_149"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0977555")
+    max_val = float("0.0929576")
+    mean = float("-2.29729e-06")
+    std = float("0.0199824")
+    data = None
+
+
+class Program_weight_tensor_parameter_150:
+    name = "parameter_150"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0889582")
+    max_val = float("0.0956246")
+    mean = float("-6.87214e-06")
+    std = float("0.0200277")
+    data = None
+
+
+class Program_weight_tensor_parameter_151:
+    name = "parameter_151"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0905439")
+    max_val = float("0.0928013")
+    mean = float("-1.9406e-05")
+    std = float("0.0200045")
+    data = None
+
+
+class Program_weight_tensor_parameter_152:
+    name = "parameter_152"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0915919")
+    max_val = float("0.0916602")
+    mean = float("2.62146e-05")
+    std = float("0.0200038")
+    data = None
+
+
+class Program_weight_tensor_parameter_153:
+    name = "parameter_153"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_154:
+    name = "parameter_154"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0979289")
+    max_val = float("0.0973227")
+    mean = float("-1.06642e-05")
+    std = float("0.0200054")
+    data = None
+
+
+class Program_weight_tensor_parameter_155:
+    name = "parameter_155"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_156:
+    name = "parameter_156"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0990122")
+    max_val = float("0.103254")
+    mean = float("-2.12928e-05")
+    std = float("0.0199923")
+    data = None
+
+
+class Program_weight_tensor_parameter_157:
+    name = "parameter_157"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_158:
+    name = "parameter_158"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_159:
+    name = "parameter_159"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_160:
+    name = "parameter_160"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_161:
+    name = "parameter_161"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.059159")
+    max_val = float("0.0750411")
+    mean = float("0.00127293")
+    std = float("0.0197122")
+    data = None
+
+
+class Program_weight_tensor_parameter_162:
+    name = "parameter_162"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0592058")
+    max_val = float("0.0590503")
+    mean = float("-0.00223688")
+    std = float("0.0198972")
+    data = None
+
+
+class Program_weight_tensor_parameter_163:
+    name = "parameter_163"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0645318")
+    max_val = float("0.0622872")
+    mean = float("-0.000444717")
+    std = float("0.020224")
+    data = None
+
+
+class Program_weight_tensor_parameter_164:
+    name = "parameter_164"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0533066")
+    max_val = float("0.066021")
+    mean = float("0.000328887")
+    std = float("0.0192417")
+    data = None
+
+
+class Program_weight_tensor_parameter_165:
+    name = "parameter_165"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.09613")
+    max_val = float("0.0914884")
+    mean = float("1.30753e-05")
+    std = float("0.0199968")
+    data = None
+
+
+class Program_weight_tensor_parameter_166:
+    name = "parameter_166"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0950543")
+    max_val = float("0.0963969")
+    mean = float("1.61388e-05")
+    std = float("0.0199462")
+    data = None
+
+
+class Program_weight_tensor_parameter_167:
+    name = "parameter_167"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0943406")
+    max_val = float("0.102827")
+    mean = float("2.36727e-05")
+    std = float("0.0200222")
+    data = None
+
+
+class Program_weight_tensor_parameter_168:
+    name = "parameter_168"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.098155")
+    max_val = float("0.0884917")
+    mean = float("3.13806e-05")
+    std = float("0.0200088")
+    data = None
+
+
+class Program_weight_tensor_parameter_169:
+    name = "parameter_169"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0911829")
+    max_val = float("0.100784")
+    mean = float("2.40136e-05")
+    std = float("0.0199743")
+    data = None
+
+
+class Program_weight_tensor_parameter_170:
+    name = "parameter_170"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_171:
+    name = "parameter_171"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.105951")
+    max_val = float("0.101723")
+    mean = float("-5.68485e-06")
+    std = float("0.0200186")
+    data = None
+
+
+class Program_weight_tensor_parameter_172:
+    name = "parameter_172"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_173:
+    name = "parameter_173"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.098726")
+    max_val = float("0.0996512")
+    mean = float("-4.4365e-06")
+    std = float("0.0200118")
+    data = None
+
+
+class Program_weight_tensor_parameter_174:
+    name = "parameter_174"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_175:
+    name = "parameter_175"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_176:
+    name = "parameter_176"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_177:
+    name = "parameter_177"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_178:
+    name = "parameter_178"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0718335")
+    max_val = float("0.065082")
+    mean = float("-5.04988e-05")
+    std = float("0.0197697")
+    data = None
+
+
+class Program_weight_tensor_parameter_179:
+    name = "parameter_179"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.056934")
+    max_val = float("0.0580286")
+    mean = float("9.49533e-05")
+    std = float("0.0193841")
+    data = None
+
+
+class Program_weight_tensor_parameter_180:
+    name = "parameter_180"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0595877")
+    max_val = float("0.0660135")
+    mean = float("0.0004164")
+    std = float("0.0194652")
+    data = None
+
+
+class Program_weight_tensor_parameter_181:
+    name = "parameter_181"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0612096")
+    max_val = float("0.0827676")
+    mean = float("0.000251395")
+    std = float("0.0197037")
+    data = None
+
+
+class Program_weight_tensor_parameter_182:
+    name = "parameter_182"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0846039")
+    max_val = float("0.0922142")
+    mean = float("-3.84193e-05")
+    std = float("0.019995")
+    data = None
+
+
+class Program_weight_tensor_parameter_183:
+    name = "parameter_183"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0926911")
+    max_val = float("0.0922433")
+    mean = float("-1.6358e-05")
+    std = float("0.0200194")
+    data = None
+
+
+class Program_weight_tensor_parameter_184:
+    name = "parameter_184"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0978895")
+    max_val = float("0.0956747")
+    mean = float("-3.46703e-05")
+    std = float("0.020004")
+    data = None
+
+
+class Program_weight_tensor_parameter_185:
+    name = "parameter_185"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102104")
+    max_val = float("0.0910908")
+    mean = float("-5e-06")
+    std = float("0.0199864")
+    data = None
+
+
+class Program_weight_tensor_parameter_186:
+    name = "parameter_186"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0892603")
+    max_val = float("0.100866")
+    mean = float("-5.00942e-05")
+    std = float("0.0200319")
+    data = None
+
+
+class Program_weight_tensor_parameter_187:
+    name = "parameter_187"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_188:
+    name = "parameter_188"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.106639")
+    max_val = float("0.108845")
+    mean = float("7.08545e-06")
+    std = float("0.0200057")
+    data = None
+
+
+class Program_weight_tensor_parameter_189:
+    name = "parameter_189"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_190:
+    name = "parameter_190"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.105093")
+    max_val = float("0.112508")
+    mean = float("-3.25768e-05")
+    std = float("0.0200009")
+    data = None
+
+
+class Program_weight_tensor_parameter_191:
+    name = "parameter_191"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_192:
+    name = "parameter_192"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_193:
+    name = "parameter_193"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_194:
+    name = "parameter_194"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_195:
+    name = "parameter_195"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0577629")
+    max_val = float("0.065642")
+    mean = float("0.000204452")
+    std = float("0.0199027")
+    data = None
+
+
+class Program_weight_tensor_parameter_196:
+    name = "parameter_196"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.090207")
+    max_val = float("0.0611968")
+    mean = float("-0.00021308")
+    std = float("0.0197199")
+    data = None
+
+
+class Program_weight_tensor_parameter_197:
+    name = "parameter_197"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0570823")
+    max_val = float("0.0768646")
+    mean = float("0.000430867")
+    std = float("0.0196956")
+    data = None
+
+
+class Program_weight_tensor_parameter_198:
+    name = "parameter_198"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0662148")
+    max_val = float("0.0569416")
+    mean = float("0.000616465")
+    std = float("0.0208327")
+    data = None
+
+
+class Program_weight_tensor_parameter_199:
+    name = "parameter_199"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102001")
+    max_val = float("0.096987")
+    mean = float("-2.67564e-05")
+    std = float("0.0199838")
+    data = None
+
+
+class Program_weight_tensor_parameter_200:
+    name = "parameter_200"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0906012")
+    max_val = float("0.0902717")
+    mean = float("2.2588e-05")
+    std = float("0.0199944")
+    data = None
+
+
+class Program_weight_tensor_parameter_201:
+    name = "parameter_201"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.091711")
+    max_val = float("0.111538")
+    mean = float("-1.27522e-05")
+    std = float("0.0199702")
+    data = None
+
+
+class Program_weight_tensor_parameter_202:
+    name = "parameter_202"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0936883")
+    max_val = float("0.0957427")
+    mean = float("-2.19867e-05")
+    std = float("0.0200179")
+    data = None
+
+
+class Program_weight_tensor_parameter_203:
+    name = "parameter_203"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0904621")
+    max_val = float("0.0907767")
+    mean = float("-4.65418e-05")
+    std = float("0.0200181")
+    data = None
+
+
+class Program_weight_tensor_parameter_204:
+    name = "parameter_204"
+    shape = [32000, 768]
+    dtype = "float32"
+    min_val = float("-0.110721")
+    max_val = float("0.108419")
+    mean = float("3.69692e-06")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_205:
+    name = "parameter_205"
+    shape = [1, 1, 768]
+    dtype = "float32"
+    min_val = float("-0.0660402")
+    max_val = float("0.0637117")
+    mean = float("-0.000634141")
+    std = float("0.0197734")
+    data = None
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-large/graph_net.json b/paddle_samples/PaddleNLP/chinese-xlnet-large/graph_net.json
new file mode 100644
index 000000000..664b8cfff
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-large/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "chinese-xlnet-large",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-large/input_meta.py b/paddle_samples/PaddleNLP/chinese-xlnet-large/input_meta.py
new file mode 100644
index 000000000..9ea1655e0
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-large/input_meta.py
@@ -0,0 +1,19 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [19, 11684, 121, 15954, 2090, 21957, 1039, 4, 3]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 2]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1]
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-large/model.py b/paddle_samples/PaddleNLP/chinese-xlnet-large/model.py
new file mode 100644
index 000000000..c01bfa31f
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-large/model.py
@@ -0,0 +1,8389 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        parameter_131,
+        parameter_132,
+        parameter_133,
+        parameter_134,
+        parameter_135,
+        parameter_136,
+        parameter_137,
+        parameter_138,
+        parameter_139,
+        parameter_140,
+        parameter_141,
+        parameter_142,
+        parameter_143,
+        parameter_144,
+        parameter_145,
+        parameter_146,
+        parameter_147,
+        parameter_148,
+        parameter_149,
+        parameter_150,
+        parameter_151,
+        parameter_152,
+        parameter_153,
+        parameter_154,
+        parameter_155,
+        parameter_156,
+        parameter_157,
+        parameter_158,
+        parameter_159,
+        parameter_160,
+        parameter_161,
+        parameter_162,
+        parameter_163,
+        parameter_164,
+        parameter_165,
+        parameter_166,
+        parameter_167,
+        parameter_168,
+        parameter_169,
+        parameter_170,
+        parameter_171,
+        parameter_172,
+        parameter_173,
+        parameter_174,
+        parameter_175,
+        parameter_176,
+        parameter_177,
+        parameter_178,
+        parameter_179,
+        parameter_180,
+        parameter_181,
+        parameter_182,
+        parameter_183,
+        parameter_184,
+        parameter_185,
+        parameter_186,
+        parameter_187,
+        parameter_188,
+        parameter_189,
+        parameter_190,
+        parameter_191,
+        parameter_192,
+        parameter_193,
+        parameter_194,
+        parameter_195,
+        parameter_196,
+        parameter_197,
+        parameter_198,
+        parameter_199,
+        parameter_200,
+        parameter_201,
+        parameter_202,
+        parameter_203,
+        parameter_204,
+        parameter_205,
+        parameter_206,
+        parameter_207,
+        parameter_208,
+        parameter_209,
+        parameter_210,
+        parameter_211,
+        parameter_212,
+        parameter_213,
+        parameter_214,
+        parameter_215,
+        parameter_216,
+        parameter_217,
+        parameter_218,
+        parameter_219,
+        parameter_220,
+        parameter_221,
+        parameter_222,
+        parameter_223,
+        parameter_224,
+        parameter_225,
+        parameter_226,
+        parameter_227,
+        parameter_228,
+        parameter_229,
+        parameter_230,
+        parameter_231,
+        parameter_232,
+        parameter_233,
+        parameter_234,
+        parameter_235,
+        parameter_236,
+        parameter_237,
+        parameter_238,
+        parameter_239,
+        parameter_240,
+        parameter_241,
+        parameter_242,
+        parameter_243,
+        parameter_244,
+        parameter_245,
+        parameter_246,
+        parameter_247,
+        parameter_248,
+        parameter_249,
+        parameter_250,
+        parameter_251,
+        parameter_252,
+        parameter_253,
+        parameter_254,
+        parameter_255,
+        parameter_256,
+        parameter_257,
+        parameter_258,
+        parameter_259,
+        parameter_260,
+        parameter_261,
+        parameter_262,
+        parameter_263,
+        parameter_264,
+        parameter_265,
+        parameter_266,
+        parameter_267,
+        parameter_268,
+        parameter_269,
+        parameter_270,
+        parameter_271,
+        parameter_272,
+        parameter_273,
+        parameter_274,
+        parameter_275,
+        parameter_276,
+        parameter_277,
+        parameter_278,
+        parameter_279,
+        parameter_280,
+        parameter_281,
+        parameter_282,
+        parameter_283,
+        parameter_284,
+        parameter_285,
+        parameter_286,
+        parameter_287,
+        parameter_288,
+        parameter_289,
+        parameter_290,
+        parameter_291,
+        parameter_292,
+        parameter_293,
+        parameter_294,
+        parameter_295,
+        parameter_296,
+        parameter_297,
+        parameter_298,
+        parameter_299,
+        parameter_300,
+        parameter_301,
+        parameter_302,
+        parameter_303,
+        parameter_304,
+        parameter_305,
+        parameter_306,
+        parameter_307,
+        parameter_308,
+        parameter_309,
+        parameter_310,
+        parameter_311,
+        parameter_312,
+        parameter_313,
+        parameter_314,
+        parameter_315,
+        parameter_316,
+        parameter_317,
+        parameter_318,
+        parameter_319,
+        parameter_320,
+        parameter_321,
+        parameter_322,
+        parameter_323,
+        parameter_324,
+        parameter_325,
+        parameter_326,
+        parameter_327,
+        parameter_328,
+        parameter_329,
+        parameter_330,
+        parameter_331,
+        parameter_332,
+        parameter_333,
+        parameter_334,
+        parameter_335,
+        parameter_336,
+        parameter_337,
+        parameter_338,
+        parameter_339,
+        parameter_340,
+        parameter_341,
+        parameter_342,
+        parameter_343,
+        parameter_344,
+        parameter_345,
+        parameter_346,
+        parameter_347,
+        parameter_348,
+        parameter_349,
+        parameter_350,
+        parameter_351,
+        parameter_352,
+        parameter_353,
+        parameter_354,
+        parameter_355,
+        parameter_356,
+        parameter_357,
+        parameter_358,
+        parameter_359,
+        parameter_360,
+        parameter_361,
+        parameter_362,
+        parameter_363,
+        parameter_364,
+        parameter_365,
+        parameter_366,
+        parameter_367,
+        parameter_368,
+        parameter_369,
+        parameter_370,
+        parameter_371,
+        parameter_372,
+        parameter_373,
+        parameter_374,
+        parameter_375,
+        parameter_376,
+        parameter_377,
+        parameter_378,
+        parameter_379,
+        parameter_380,
+        parameter_381,
+        parameter_382,
+        parameter_383,
+        parameter_384,
+        parameter_385,
+        parameter_386,
+        parameter_387,
+        parameter_388,
+        parameter_389,
+        parameter_390,
+        parameter_391,
+        parameter_392,
+        parameter_393,
+        parameter_394,
+        parameter_395,
+        parameter_396,
+        parameter_397,
+        parameter_398,
+        parameter_399,
+        parameter_400,
+        parameter_401,
+        parameter_402,
+        parameter_403,
+        parameter_404,
+        parameter_405,
+        parameter_406,
+        parameter_407,
+        parameter_408,
+        parameter_409,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_1 = paddle._C_ops.transpose(data_0, [1, 0])
+        del data_0
+
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_2 = paddle._C_ops.transpose(data_1, [1, 0])
+        del data_1
+
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_3 = paddle._C_ops.transpose(data_2, [1, 0])
+        del data_2
+
+        # pd_op.cast: (9x1xf32) <- (9x1xi64)
+        cast_0 = paddle._C_ops.cast(transpose_3, paddle.float32)
+        del transpose_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (9x1xf32) <- (9x1xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [0]
+
+        # pd_op.unsqueeze: (1x9x1xf32) <- (9x1xf32, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(scale_0, full_int_array_0)
+        del scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.unsqueeze: (1x9x1x1xf32) <- (1x9x1xf32, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.full: (xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [], float("0"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.greater_than: (1x9x1x1xb) <- (1x9x1x1xf32, xf32)
+        greater_than_0 = paddle._C_ops.greater_than(unsqueeze_1, full_1)
+        del unsqueeze_1
+
+        # pd_op.cast: (1x9x1x1xf32) <- (1x9x1x1xb)
+        cast_1 = paddle._C_ops.cast(greater_than_0, paddle.float32)
+        del greater_than_0
+
+        # pd_op.full: (9xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [9], float("1"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.diag: (9x9xf32) <- (9xf32)
+        diag_0 = paddle._C_ops.diag(full_2, 0, float("0"))
+        del full_2
+
+        # pd_op.scale: (9x9xf32) <- (9x9xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(diag_0, full_0, float("0"), True)
+        del diag_0, full_0
+
+        # pd_op.cast: (9x9xf32) <- (9x9xf32)
+        cast_2 = paddle._C_ops.cast(scale_1, paddle.float32)
+        del scale_1
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_2 = [2, 3]
+
+        # pd_op.unsqueeze: (9x9x1x1xf32) <- (9x9xf32, 2xi64)
+        unsqueeze_2 = paddle._C_ops.unsqueeze(cast_2, full_int_array_2)
+        del cast_2, full_int_array_2
+
+        # pd_op.add: (9x9x1x1xf32) <- (1x9x1x1xf32, 9x9x1x1xf32)
+        add_0 = paddle._C_ops.add(cast_1, unsqueeze_2)
+        del cast_1, unsqueeze_2
+
+        # pd_op.greater_than: (9x9x1x1xb) <- (9x9x1x1xf32, xf32)
+        greater_than_1 = paddle._C_ops.greater_than(add_0, full_1)
+        del add_0, full_1
+
+        # pd_op.cast: (9x9x1x1xf32) <- (9x9x1x1xb)
+        cast_3 = paddle._C_ops.cast(greater_than_1, paddle.float32)
+        del greater_than_1
+
+        # pd_op.embedding: (9x1x1024xf32) <- (9x1xi64, 32000x1024xf32)
+        embedding_0 = paddle._C_ops.embedding(transpose_1, parameter_408, -1, False)
+        del parameter_408, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [1]
+
+        # pd_op.unsqueeze: (9x1x1xi64) <- (9x1xi64, 1xi64)
+        unsqueeze_3 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_3)
+
+        # pd_op.unsqueeze: (1x9x1xi64) <- (9x1xi64, 1xi64)
+        unsqueeze_4 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_0)
+        del full_int_array_0, transpose_2
+
+        # pd_op.not_equal: (9x9x1xb) <- (9x1x1xi64, 1x9x1xi64)
+        not_equal_0 = paddle._C_ops.not_equal(unsqueeze_3, unsqueeze_4)
+        del unsqueeze_3, unsqueeze_4
+
+        # pd_op.cast: (9x9x1xi64) <- (9x9x1xb)
+        cast_4 = paddle._C_ops.cast(not_equal_0, paddle.int64)
+        del not_equal_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("2"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.one_hot: (9x9x1x2xf32) <- (9x9x1xi64, 1xi32)
+        one_hot_0 = paddle._C_ops.one_hot(
+            cast_4 % paddle.cast(full_4, cast_4.dtype), full_4
+        )
+        del cast_4, full_4
+
+        # pd_op.cast: (9x9x1x2xf32) <- (9x9x1x2xf32)
+        cast_5 = paddle._C_ops.cast(one_hot_0, paddle.float32)
+        del one_hot_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("1024"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_7 = paddle._C_ops.full(
+            [1], float("2"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (512xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_5, full_6, full_7, dtype="float32")
+        del full_6, full_7
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("0.000976562"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (512xf32) <- (512xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(arange_0, full_8, float("0"), True)
+        del arange_0, full_8
+
+        # pd_op.full: (512xf32) <- ()
+        full_9 = paddle._C_ops.full(
+            [512],
+            float("10000"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.elementwise_pow: (512xf32) <- (512xf32, 512xf32)
+        elementwise_pow_0 = paddle._C_ops.elementwise_pow(full_9, scale_2)
+        del full_9, scale_2
+
+        # pd_op.full: (512xf32) <- ()
+        full_10 = paddle._C_ops.full(
+            [512],
+            float("1"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.divide: (512xf32) <- (512xf32, 512xf32)
+        divide_0 = paddle._C_ops.divide(full_10, elementwise_pow_0)
+        del elementwise_pow_0, full_10
+
+        # pd_op.full: (1xf64) <- ()
+        full_11 = paddle._C_ops.full(
+            [1], float("9"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_12 = paddle._C_ops.full(
+            [1], float("-9"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_13 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (18xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_1 = paddle.arange(full_11, full_12, full_13, dtype="float32")
+        del full_12, full_13
+
+        # builtin.combine: ([18xf32, 512xf32]) <- (18xf32, 512xf32)
+        combine_0 = [arange_1, divide_0]
+        del arange_1, divide_0
+
+        # pd_op.einsum: (18x512xf32, [0xf32, 0xf32], [18xf32, 512xf32]) <- ([18xf32, 512xf32])
+        einsum_0, einsum_1, einsum_2 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_0, "i,d->id"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_0
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_0,
+            split_1,
+        ) = einsum_1
+        del einsum_1
+
+        # builtin.split: (18xf32, 512xf32) <- ([18xf32, 512xf32])
+        (
+            split_2,
+            split_3,
+        ) = einsum_2
+        del einsum_2
+
+        # pd_op.sin: (18x512xf32) <- (18x512xf32)
+        sin_0 = paddle._C_ops.sin(einsum_0)
+
+        # pd_op.cos: (18x512xf32) <- (18x512xf32)
+        cos_0 = paddle._C_ops.cos(einsum_0)
+        del einsum_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_14 = paddle._C_ops.full(
+            [1], float("-1"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # builtin.combine: ([18x512xf32, 18x512xf32]) <- (18x512xf32, 18x512xf32)
+        combine_1 = [sin_0, cos_0]
+        del cos_0, sin_0
+
+        # pd_op.concat: (18x1024xf32) <- ([18x512xf32, 18x512xf32], 1xi32)
+        concat_0 = paddle._C_ops.concat(combine_1, full_14)
+        del combine_1, full_14
+
+        # pd_op.unsqueeze: (18x1x1024xf32) <- (18x1024xf32, 1xi64)
+        unsqueeze_5 = paddle._C_ops.unsqueeze(concat_0, full_int_array_3)
+        del concat_0
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_4 = [-1, 1, -1]
+
+        # pd_op.expand: (18x1x1024xf32) <- (18x1x1024xf32, 3xi64)
+        expand_0 = paddle._C_ops.expand(unsqueeze_5, full_int_array_4)
+        del full_int_array_4, unsqueeze_5
+
+        # pd_op.dropout: (18x1x1024xf32, 18x1x1024xui8) <- (18x1x1024xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                expand_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del expand_0
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_407, False, False)
+        del parameter_407
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_5 = [9, 1, 16, 64]
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(matmul_0, full_int_array_5)
+        del matmul_0
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_1 = paddle._C_ops.matmul(dropout_0, parameter_406, False, False)
+        del parameter_406
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(matmul_1, full_int_array_5)
+        del matmul_1
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_2 = paddle._C_ops.matmul(dropout_0, parameter_405, False, False)
+        del parameter_405
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(matmul_2, full_int_array_5)
+        del matmul_2
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_3 = paddle._C_ops.matmul(dropout_2, parameter_403, False, False)
+        del parameter_403
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_6 = [18, -1, 16, 64]
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_3 = paddle._C_ops.reshape(matmul_3, full_int_array_6)
+        del matmul_3
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_1 = paddle._C_ops.add(reshape_0, parameter_400)
+        del parameter_400
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_2 = [add_1, reshape_1]
+        del add_1, reshape_1
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_3, einsum_4, einsum_5 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_2, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_2
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_4,
+            split_5,
+        ) = einsum_4
+        del einsum_4
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_6,
+            split_7,
+        ) = einsum_5
+        del einsum_5
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_2 = paddle._C_ops.add(reshape_0, parameter_402)
+        del parameter_402
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_3 = [add_2, reshape_3]
+        del add_2, reshape_3
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_6, einsum_7, einsum_8 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_3, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_3
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_8,
+            split_9,
+        ) = einsum_7
+        del einsum_7
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_10,
+            split_11,
+        ) = einsum_8
+        del einsum_8
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_7 = [1, 16, 18, 9]
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(einsum_6, full_int_array_7)
+        del einsum_6
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_8 = [2147483647]
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            reshape_4, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_4
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_9 = [1, 16, 9, 17]
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(slice_0, full_int_array_9)
+        del slice_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_15 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (9xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_2 = paddle.arange(full_5, full_11, full_15, dtype="int64")
+        del full_11, full_15, full_5
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_0 = paddle._C_ops.index_select(reshape_5, arange_2, 3)
+        del reshape_5
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_3 = paddle._C_ops.add(reshape_0, parameter_401)
+        del parameter_401, reshape_0
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_4 = [add_3, parameter_399]
+        del add_3, parameter_399
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_9, einsum_10, einsum_11 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_4, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_4
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_12,
+            split_13,
+        ) = einsum_10
+        del einsum_10
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_14,
+            split_15,
+        ) = einsum_11
+        del einsum_11
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_5 = [cast_5, einsum_9]
+        del einsum_9
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_12, einsum_13, einsum_14 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_5, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_5
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_16,
+            split_17,
+        ) = einsum_13
+        del einsum_13
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_18,
+            split_19,
+        ) = einsum_14
+        del einsum_14
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_4 = paddle._C_ops.add(einsum_3, index_select_0)
+        del einsum_3, index_select_0
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_5 = paddle._C_ops.add(add_4, einsum_12)
+        del add_4, einsum_12
+
+        # pd_op.full: (1xf32) <- ()
+        full_16 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(add_5, full_16, float("0"), True)
+        del add_5
+
+        # pd_op.transpose: (1x1x9x9xf32) <- (9x9x1x1xf32)
+        transpose_4 = paddle._C_ops.transpose(cast_3, [2, 3, 0, 1])
+        del cast_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_17 = paddle._C_ops.full(
+            [1], float("1e+30"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x9x9xf32) <- (1x1x9x9xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(transpose_4, full_17, float("0"), True)
+        del full_17, transpose_4
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_0 = paddle._C_ops.subtract(scale_3, scale_4)
+        del scale_3
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_0 = paddle._C_ops.softmax(subtract_0, 3)
+        del subtract_0
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_6 = [dropout_4, reshape_2]
+        del dropout_4, reshape_2
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_15, einsum_16, einsum_17 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_6, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_6
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_20,
+            split_21,
+        ) = einsum_16
+        del einsum_16
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_22,
+            split_23,
+        ) = einsum_17
+        del einsum_17
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_10 = [9, 1, 1024]
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_6 = paddle._C_ops.reshape(einsum_15, full_int_array_10)
+        del einsum_15
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_7 = [reshape_6, parameter_404]
+        del parameter_404, reshape_6
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_18, einsum_19, einsum_20 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_7, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_7
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_24,
+            split_25,
+        ) = einsum_19
+        del einsum_19
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_26,
+            split_27,
+        ) = einsum_20
+        del einsum_20
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_18
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_6 = paddle._C_ops.add(dropout_6, dropout_0)
+        del dropout_0, dropout_6
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_6, parameter_398, parameter_397, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_6, parameter_397, parameter_398
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_4 = paddle._C_ops.matmul(layer_norm_0, parameter_394, False, False)
+        del parameter_394
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_7 = paddle._C_ops.add(matmul_4, parameter_393)
+        del matmul_4, parameter_393
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_0 = paddle._C_ops.relu(add_7)
+        del add_7
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_0
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_8, parameter_392, False, False)
+        del dropout_8, parameter_392
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_8 = paddle._C_ops.add(matmul_5, parameter_391)
+        del matmul_5, parameter_391
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_8
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_9 = paddle._C_ops.add(dropout_10, layer_norm_0)
+        del dropout_10, layer_norm_0
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_9, parameter_396, parameter_395, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_9, parameter_395, parameter_396
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_6 = paddle._C_ops.matmul(layer_norm_3, parameter_390, False, False)
+        del parameter_390
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_7 = paddle._C_ops.reshape(matmul_6, full_int_array_5)
+        del matmul_6
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_389, False, False)
+        del parameter_389
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(matmul_7, full_int_array_5)
+        del matmul_7
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_8 = paddle._C_ops.matmul(layer_norm_3, parameter_388, False, False)
+        del parameter_388
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(matmul_8, full_int_array_5)
+        del matmul_8
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_9 = paddle._C_ops.matmul(dropout_2, parameter_386, False, False)
+        del parameter_386
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(matmul_9, full_int_array_6)
+        del matmul_9
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_10 = paddle._C_ops.add(reshape_7, parameter_383)
+        del parameter_383
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_8 = [add_10, reshape_8]
+        del add_10, reshape_8
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_21, einsum_22, einsum_23 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_8, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_8
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_28,
+            split_29,
+        ) = einsum_22
+        del einsum_22
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_30,
+            split_31,
+        ) = einsum_23
+        del einsum_23
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_11 = paddle._C_ops.add(reshape_7, parameter_385)
+        del parameter_385
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_9 = [add_11, reshape_10]
+        del add_11, reshape_10
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_24, einsum_25, einsum_26 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_9, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_9
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_32,
+            split_33,
+        ) = einsum_25
+        del einsum_25
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_34,
+            split_35,
+        ) = einsum_26
+        del einsum_26
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_11 = paddle._C_ops.reshape(einsum_24, full_int_array_7)
+        del einsum_24
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            reshape_11, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_11
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(slice_1, full_int_array_9)
+        del slice_1
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_1 = paddle._C_ops.index_select(reshape_12, arange_2, 3)
+        del reshape_12
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_12 = paddle._C_ops.add(reshape_7, parameter_384)
+        del parameter_384, reshape_7
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_10 = [add_12, parameter_382]
+        del add_12, parameter_382
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_27, einsum_28, einsum_29 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_10, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_10
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_36,
+            split_37,
+        ) = einsum_28
+        del einsum_28
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_38,
+            split_39,
+        ) = einsum_29
+        del einsum_29
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_11 = [cast_5, einsum_27]
+        del einsum_27
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_30, einsum_31, einsum_32 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_11, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_11
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_40,
+            split_41,
+        ) = einsum_31
+        del einsum_31
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_42,
+            split_43,
+        ) = einsum_32
+        del einsum_32
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_13 = paddle._C_ops.add(einsum_21, index_select_1)
+        del einsum_21, index_select_1
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_14 = paddle._C_ops.add(add_13, einsum_30)
+        del add_13, einsum_30
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(add_14, full_16, float("0"), True)
+        del add_14
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_1 = paddle._C_ops.subtract(scale_5, scale_4)
+        del scale_5
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_1 = paddle._C_ops.softmax(subtract_1, 3)
+        del subtract_1
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_12 = [dropout_12, reshape_9]
+        del dropout_12, reshape_9
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_33, einsum_34, einsum_35 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_12, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_12
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_44,
+            split_45,
+        ) = einsum_34
+        del einsum_34
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_46,
+            split_47,
+        ) = einsum_35
+        del einsum_35
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_13 = paddle._C_ops.reshape(einsum_33, full_int_array_10)
+        del einsum_33
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_13 = [reshape_13, parameter_387]
+        del parameter_387, reshape_13
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_36, einsum_37, einsum_38 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_13, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_13
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_48,
+            split_49,
+        ) = einsum_37
+        del einsum_37
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_50,
+            split_51,
+        ) = einsum_38
+        del einsum_38
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_36, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_36
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_15 = paddle._C_ops.add(dropout_14, layer_norm_3)
+        del dropout_14, layer_norm_3
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_15, parameter_381, parameter_380, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_15, parameter_380, parameter_381
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_377, False, False)
+        del parameter_377
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_16 = paddle._C_ops.add(matmul_10, parameter_376)
+        del matmul_10, parameter_376
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_1 = paddle._C_ops.relu(add_16)
+        del add_16
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_1
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_11 = paddle._C_ops.matmul(dropout_16, parameter_375, False, False)
+        del dropout_16, parameter_375
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_17 = paddle._C_ops.add(matmul_11, parameter_374)
+        del matmul_11, parameter_374
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_17
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_18 = paddle._C_ops.add(dropout_18, layer_norm_6)
+        del dropout_18, layer_norm_6
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_379, parameter_378, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18, parameter_378, parameter_379
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_12 = paddle._C_ops.matmul(layer_norm_9, parameter_373, False, False)
+        del parameter_373
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(matmul_12, full_int_array_5)
+        del matmul_12
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_13 = paddle._C_ops.matmul(layer_norm_9, parameter_372, False, False)
+        del parameter_372
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_15 = paddle._C_ops.reshape(matmul_13, full_int_array_5)
+        del matmul_13
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_14 = paddle._C_ops.matmul(layer_norm_9, parameter_371, False, False)
+        del parameter_371
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(matmul_14, full_int_array_5)
+        del matmul_14
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_15 = paddle._C_ops.matmul(dropout_2, parameter_369, False, False)
+        del parameter_369
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(matmul_15, full_int_array_6)
+        del matmul_15
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_19 = paddle._C_ops.add(reshape_14, parameter_366)
+        del parameter_366
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_14 = [add_19, reshape_15]
+        del add_19, reshape_15
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_39, einsum_40, einsum_41 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_14, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_14
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_52,
+            split_53,
+        ) = einsum_40
+        del einsum_40
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_54,
+            split_55,
+        ) = einsum_41
+        del einsum_41
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_20 = paddle._C_ops.add(reshape_14, parameter_368)
+        del parameter_368
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_15 = [add_20, reshape_17]
+        del add_20, reshape_17
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_42, einsum_43, einsum_44 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_15, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_15
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_56,
+            split_57,
+        ) = einsum_43
+        del einsum_43
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_58,
+            split_59,
+        ) = einsum_44
+        del einsum_44
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(einsum_42, full_int_array_7)
+        del einsum_42
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_2 = paddle._C_ops.slice(
+            reshape_18, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_18
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_19 = paddle._C_ops.reshape(slice_2, full_int_array_9)
+        del slice_2
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_2 = paddle._C_ops.index_select(reshape_19, arange_2, 3)
+        del reshape_19
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_21 = paddle._C_ops.add(reshape_14, parameter_367)
+        del parameter_367, reshape_14
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_16 = [add_21, parameter_365]
+        del add_21, parameter_365
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_45, einsum_46, einsum_47 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_16, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_16
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_60,
+            split_61,
+        ) = einsum_46
+        del einsum_46
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_62,
+            split_63,
+        ) = einsum_47
+        del einsum_47
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_17 = [cast_5, einsum_45]
+        del einsum_45
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_48, einsum_49, einsum_50 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_17, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_17
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_64,
+            split_65,
+        ) = einsum_49
+        del einsum_49
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_66,
+            split_67,
+        ) = einsum_50
+        del einsum_50
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_22 = paddle._C_ops.add(einsum_39, index_select_2)
+        del einsum_39, index_select_2
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_23 = paddle._C_ops.add(add_22, einsum_48)
+        del add_22, einsum_48
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(add_23, full_16, float("0"), True)
+        del add_23
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_2 = paddle._C_ops.subtract(scale_6, scale_4)
+        del scale_6
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_2 = paddle._C_ops.softmax(subtract_2, 3)
+        del subtract_2
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_18 = [dropout_20, reshape_16]
+        del dropout_20, reshape_16
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_51, einsum_52, einsum_53 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_18, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_18
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_68,
+            split_69,
+        ) = einsum_52
+        del einsum_52
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_70,
+            split_71,
+        ) = einsum_53
+        del einsum_53
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_20 = paddle._C_ops.reshape(einsum_51, full_int_array_10)
+        del einsum_51
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_19 = [reshape_20, parameter_370]
+        del parameter_370, reshape_20
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_54, einsum_55, einsum_56 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_19, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_19
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_72,
+            split_73,
+        ) = einsum_55
+        del einsum_55
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_74,
+            split_75,
+        ) = einsum_56
+        del einsum_56
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_54, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_54
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_24 = paddle._C_ops.add(dropout_22, layer_norm_9)
+        del dropout_22, layer_norm_9
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_24, parameter_364, parameter_363, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_24, parameter_363, parameter_364
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_16 = paddle._C_ops.matmul(layer_norm_12, parameter_360, False, False)
+        del parameter_360
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_25 = paddle._C_ops.add(matmul_16, parameter_359)
+        del matmul_16, parameter_359
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_2 = paddle._C_ops.relu(add_25)
+        del add_25
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_2
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_17 = paddle._C_ops.matmul(dropout_24, parameter_358, False, False)
+        del dropout_24, parameter_358
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_26 = paddle._C_ops.add(matmul_17, parameter_357)
+        del matmul_17, parameter_357
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_26, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_26
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_27 = paddle._C_ops.add(dropout_26, layer_norm_12)
+        del dropout_26, layer_norm_12
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_27, parameter_362, parameter_361, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_27, parameter_361, parameter_362
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_15, parameter_356, False, False)
+        del parameter_356
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(matmul_18, full_int_array_5)
+        del matmul_18
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_15, parameter_355, False, False)
+        del parameter_355
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(matmul_19, full_int_array_5)
+        del matmul_19
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_20 = paddle._C_ops.matmul(layer_norm_15, parameter_354, False, False)
+        del parameter_354
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_23 = paddle._C_ops.reshape(matmul_20, full_int_array_5)
+        del matmul_20
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_2, parameter_352, False, False)
+        del parameter_352
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(matmul_21, full_int_array_6)
+        del matmul_21
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_28 = paddle._C_ops.add(reshape_21, parameter_349)
+        del parameter_349
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_20 = [add_28, reshape_22]
+        del add_28, reshape_22
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_57, einsum_58, einsum_59 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_20, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_20
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_76,
+            split_77,
+        ) = einsum_58
+        del einsum_58
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_78,
+            split_79,
+        ) = einsum_59
+        del einsum_59
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_29 = paddle._C_ops.add(reshape_21, parameter_351)
+        del parameter_351
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_21 = [add_29, reshape_24]
+        del add_29, reshape_24
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_60, einsum_61, einsum_62 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_21, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_21
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_80,
+            split_81,
+        ) = einsum_61
+        del einsum_61
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_82,
+            split_83,
+        ) = einsum_62
+        del einsum_62
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(einsum_60, full_int_array_7)
+        del einsum_60
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_3 = paddle._C_ops.slice(
+            reshape_25, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_25
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(slice_3, full_int_array_9)
+        del slice_3
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_3 = paddle._C_ops.index_select(reshape_26, arange_2, 3)
+        del reshape_26
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_30 = paddle._C_ops.add(reshape_21, parameter_350)
+        del parameter_350, reshape_21
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_22 = [add_30, parameter_348]
+        del add_30, parameter_348
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_63, einsum_64, einsum_65 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_22, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_22
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_84,
+            split_85,
+        ) = einsum_64
+        del einsum_64
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_86,
+            split_87,
+        ) = einsum_65
+        del einsum_65
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_23 = [cast_5, einsum_63]
+        del einsum_63
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_66, einsum_67, einsum_68 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_23, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_23
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_88,
+            split_89,
+        ) = einsum_67
+        del einsum_67
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_90,
+            split_91,
+        ) = einsum_68
+        del einsum_68
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_31 = paddle._C_ops.add(einsum_57, index_select_3)
+        del einsum_57, index_select_3
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_32 = paddle._C_ops.add(add_31, einsum_66)
+        del add_31, einsum_66
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(add_32, full_16, float("0"), True)
+        del add_32
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_3 = paddle._C_ops.subtract(scale_7, scale_4)
+        del scale_7
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_3 = paddle._C_ops.softmax(subtract_3, 3)
+        del subtract_3
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_24 = [dropout_28, reshape_23]
+        del dropout_28, reshape_23
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_69, einsum_70, einsum_71 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_24, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_24
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_92,
+            split_93,
+        ) = einsum_70
+        del einsum_70
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_94,
+            split_95,
+        ) = einsum_71
+        del einsum_71
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(einsum_69, full_int_array_10)
+        del einsum_69
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_25 = [reshape_27, parameter_353]
+        del parameter_353, reshape_27
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_72, einsum_73, einsum_74 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_25, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_25
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_96,
+            split_97,
+        ) = einsum_73
+        del einsum_73
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_98,
+            split_99,
+        ) = einsum_74
+        del einsum_74
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_72, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_72
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_33 = paddle._C_ops.add(dropout_30, layer_norm_15)
+        del dropout_30, layer_norm_15
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_33, parameter_347, parameter_346, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_33, parameter_346, parameter_347
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_22 = paddle._C_ops.matmul(layer_norm_18, parameter_343, False, False)
+        del parameter_343
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_34 = paddle._C_ops.add(matmul_22, parameter_342)
+        del matmul_22, parameter_342
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_3 = paddle._C_ops.relu(add_34)
+        del add_34
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_3
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_23 = paddle._C_ops.matmul(dropout_32, parameter_341, False, False)
+        del dropout_32, parameter_341
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_35 = paddle._C_ops.add(matmul_23, parameter_340)
+        del matmul_23, parameter_340
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_35, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_35
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_36 = paddle._C_ops.add(dropout_34, layer_norm_18)
+        del dropout_34, layer_norm_18
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_36, parameter_345, parameter_344, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_36, parameter_344, parameter_345
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_24 = paddle._C_ops.matmul(layer_norm_21, parameter_339, False, False)
+        del parameter_339
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(matmul_24, full_int_array_5)
+        del matmul_24
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_21, parameter_338, False, False)
+        del parameter_338
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(matmul_25, full_int_array_5)
+        del matmul_25
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_21, parameter_337, False, False)
+        del parameter_337
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(matmul_26, full_int_array_5)
+        del matmul_26
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_27 = paddle._C_ops.matmul(dropout_2, parameter_335, False, False)
+        del parameter_335
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_31 = paddle._C_ops.reshape(matmul_27, full_int_array_6)
+        del matmul_27
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_37 = paddle._C_ops.add(reshape_28, parameter_332)
+        del parameter_332
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_26 = [add_37, reshape_29]
+        del add_37, reshape_29
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_75, einsum_76, einsum_77 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_26, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_26
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_100,
+            split_101,
+        ) = einsum_76
+        del einsum_76
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_102,
+            split_103,
+        ) = einsum_77
+        del einsum_77
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_38 = paddle._C_ops.add(reshape_28, parameter_334)
+        del parameter_334
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_27 = [add_38, reshape_31]
+        del add_38, reshape_31
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_78, einsum_79, einsum_80 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_27, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_27
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_104,
+            split_105,
+        ) = einsum_79
+        del einsum_79
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_106,
+            split_107,
+        ) = einsum_80
+        del einsum_80
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(einsum_78, full_int_array_7)
+        del einsum_78
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_4 = paddle._C_ops.slice(
+            reshape_32, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_32
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(slice_4, full_int_array_9)
+        del slice_4
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_4 = paddle._C_ops.index_select(reshape_33, arange_2, 3)
+        del reshape_33
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_39 = paddle._C_ops.add(reshape_28, parameter_333)
+        del parameter_333, reshape_28
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_28 = [add_39, parameter_331]
+        del add_39, parameter_331
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_81, einsum_82, einsum_83 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_28, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_28
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_108,
+            split_109,
+        ) = einsum_82
+        del einsum_82
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_110,
+            split_111,
+        ) = einsum_83
+        del einsum_83
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_29 = [cast_5, einsum_81]
+        del einsum_81
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_84, einsum_85, einsum_86 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_29, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_29
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_112,
+            split_113,
+        ) = einsum_85
+        del einsum_85
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_114,
+            split_115,
+        ) = einsum_86
+        del einsum_86
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_40 = paddle._C_ops.add(einsum_75, index_select_4)
+        del einsum_75, index_select_4
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_41 = paddle._C_ops.add(add_40, einsum_84)
+        del add_40, einsum_84
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(add_41, full_16, float("0"), True)
+        del add_41
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_4 = paddle._C_ops.subtract(scale_8, scale_4)
+        del scale_8
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_4 = paddle._C_ops.softmax(subtract_4, 3)
+        del subtract_4
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_30 = [dropout_36, reshape_30]
+        del dropout_36, reshape_30
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_87, einsum_88, einsum_89 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_30, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_30
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_116,
+            split_117,
+        ) = einsum_88
+        del einsum_88
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_118,
+            split_119,
+        ) = einsum_89
+        del einsum_89
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_34 = paddle._C_ops.reshape(einsum_87, full_int_array_10)
+        del einsum_87
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_31 = [reshape_34, parameter_336]
+        del parameter_336, reshape_34
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_90, einsum_91, einsum_92 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_31, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_31
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_120,
+            split_121,
+        ) = einsum_91
+        del einsum_91
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_122,
+            split_123,
+        ) = einsum_92
+        del einsum_92
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_90, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_90
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_42 = paddle._C_ops.add(dropout_38, layer_norm_21)
+        del dropout_38, layer_norm_21
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_330, parameter_329, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42, parameter_329, parameter_330
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_28 = paddle._C_ops.matmul(layer_norm_24, parameter_326, False, False)
+        del parameter_326
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_43 = paddle._C_ops.add(matmul_28, parameter_325)
+        del matmul_28, parameter_325
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_4 = paddle._C_ops.relu(add_43)
+        del add_43
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_4
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_40, parameter_324, False, False)
+        del dropout_40, parameter_324
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_44 = paddle._C_ops.add(matmul_29, parameter_323)
+        del matmul_29, parameter_323
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_44, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_44
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_45 = paddle._C_ops.add(dropout_42, layer_norm_24)
+        del dropout_42, layer_norm_24
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_45, parameter_328, parameter_327, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_45, parameter_327, parameter_328
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_30 = paddle._C_ops.matmul(layer_norm_27, parameter_322, False, False)
+        del parameter_322
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_35 = paddle._C_ops.reshape(matmul_30, full_int_array_5)
+        del matmul_30
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_27, parameter_321, False, False)
+        del parameter_321
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(matmul_31, full_int_array_5)
+        del matmul_31
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_32 = paddle._C_ops.matmul(layer_norm_27, parameter_320, False, False)
+        del parameter_320
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(matmul_32, full_int_array_5)
+        del matmul_32
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_33 = paddle._C_ops.matmul(dropout_2, parameter_318, False, False)
+        del parameter_318
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(matmul_33, full_int_array_6)
+        del matmul_33
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_46 = paddle._C_ops.add(reshape_35, parameter_315)
+        del parameter_315
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_32 = [add_46, reshape_36]
+        del add_46, reshape_36
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_93, einsum_94, einsum_95 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_32, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_32
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_124,
+            split_125,
+        ) = einsum_94
+        del einsum_94
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_126,
+            split_127,
+        ) = einsum_95
+        del einsum_95
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_47 = paddle._C_ops.add(reshape_35, parameter_317)
+        del parameter_317
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_33 = [add_47, reshape_38]
+        del add_47, reshape_38
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_96, einsum_97, einsum_98 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_33, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_33
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_128,
+            split_129,
+        ) = einsum_97
+        del einsum_97
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_130,
+            split_131,
+        ) = einsum_98
+        del einsum_98
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_39 = paddle._C_ops.reshape(einsum_96, full_int_array_7)
+        del einsum_96
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_5 = paddle._C_ops.slice(
+            reshape_39, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_39
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(slice_5, full_int_array_9)
+        del slice_5
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_5 = paddle._C_ops.index_select(reshape_40, arange_2, 3)
+        del reshape_40
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_48 = paddle._C_ops.add(reshape_35, parameter_316)
+        del parameter_316, reshape_35
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_34 = [add_48, parameter_314]
+        del add_48, parameter_314
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_99, einsum_100, einsum_101 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_34, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_34
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_132,
+            split_133,
+        ) = einsum_100
+        del einsum_100
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_134,
+            split_135,
+        ) = einsum_101
+        del einsum_101
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_35 = [cast_5, einsum_99]
+        del einsum_99
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_102, einsum_103, einsum_104 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_35, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_35
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_136,
+            split_137,
+        ) = einsum_103
+        del einsum_103
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_138,
+            split_139,
+        ) = einsum_104
+        del einsum_104
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_49 = paddle._C_ops.add(einsum_93, index_select_5)
+        del einsum_93, index_select_5
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_50 = paddle._C_ops.add(add_49, einsum_102)
+        del add_49, einsum_102
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(add_50, full_16, float("0"), True)
+        del add_50
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_5 = paddle._C_ops.subtract(scale_9, scale_4)
+        del scale_9
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_5 = paddle._C_ops.softmax(subtract_5, 3)
+        del subtract_5
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_36 = [dropout_44, reshape_37]
+        del dropout_44, reshape_37
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_105, einsum_106, einsum_107 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_36, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_36
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_140,
+            split_141,
+        ) = einsum_106
+        del einsum_106
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_142,
+            split_143,
+        ) = einsum_107
+        del einsum_107
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_41 = paddle._C_ops.reshape(einsum_105, full_int_array_10)
+        del einsum_105
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_37 = [reshape_41, parameter_319]
+        del parameter_319, reshape_41
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_108, einsum_109, einsum_110 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_37, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_37
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_144,
+            split_145,
+        ) = einsum_109
+        del einsum_109
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_146,
+            split_147,
+        ) = einsum_110
+        del einsum_110
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_108, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_108
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_51 = paddle._C_ops.add(dropout_46, layer_norm_27)
+        del dropout_46, layer_norm_27
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_51, parameter_313, parameter_312, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_51, parameter_312, parameter_313
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_30, parameter_309, False, False)
+        del parameter_309
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_52 = paddle._C_ops.add(matmul_34, parameter_308)
+        del matmul_34, parameter_308
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_5 = paddle._C_ops.relu(add_52)
+        del add_52
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_5
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_35 = paddle._C_ops.matmul(dropout_48, parameter_307, False, False)
+        del dropout_48, parameter_307
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_53 = paddle._C_ops.add(matmul_35, parameter_306)
+        del matmul_35, parameter_306
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_53, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_53
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_54 = paddle._C_ops.add(dropout_50, layer_norm_30)
+        del dropout_50, layer_norm_30
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_54, parameter_311, parameter_310, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_54, parameter_310, parameter_311
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_36 = paddle._C_ops.matmul(layer_norm_33, parameter_305, False, False)
+        del parameter_305
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(matmul_36, full_int_array_5)
+        del matmul_36
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_37 = paddle._C_ops.matmul(layer_norm_33, parameter_304, False, False)
+        del parameter_304
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_43 = paddle._C_ops.reshape(matmul_37, full_int_array_5)
+        del matmul_37
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_38 = paddle._C_ops.matmul(layer_norm_33, parameter_303, False, False)
+        del parameter_303
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(matmul_38, full_int_array_5)
+        del matmul_38
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_39 = paddle._C_ops.matmul(dropout_2, parameter_301, False, False)
+        del parameter_301
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(matmul_39, full_int_array_6)
+        del matmul_39
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_55 = paddle._C_ops.add(reshape_42, parameter_298)
+        del parameter_298
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_38 = [add_55, reshape_43]
+        del add_55, reshape_43
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_111, einsum_112, einsum_113 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_38, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_38
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_148,
+            split_149,
+        ) = einsum_112
+        del einsum_112
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_150,
+            split_151,
+        ) = einsum_113
+        del einsum_113
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_56 = paddle._C_ops.add(reshape_42, parameter_300)
+        del parameter_300
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_39 = [add_56, reshape_45]
+        del add_56, reshape_45
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_114, einsum_115, einsum_116 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_39, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_39
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_152,
+            split_153,
+        ) = einsum_115
+        del einsum_115
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_154,
+            split_155,
+        ) = einsum_116
+        del einsum_116
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(einsum_114, full_int_array_7)
+        del einsum_114
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_6 = paddle._C_ops.slice(
+            reshape_46, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_46
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_47 = paddle._C_ops.reshape(slice_6, full_int_array_9)
+        del slice_6
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_6 = paddle._C_ops.index_select(reshape_47, arange_2, 3)
+        del reshape_47
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_57 = paddle._C_ops.add(reshape_42, parameter_299)
+        del parameter_299, reshape_42
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_40 = [add_57, parameter_297]
+        del add_57, parameter_297
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_117, einsum_118, einsum_119 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_40, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_40
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_156,
+            split_157,
+        ) = einsum_118
+        del einsum_118
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_158,
+            split_159,
+        ) = einsum_119
+        del einsum_119
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_41 = [cast_5, einsum_117]
+        del einsum_117
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_120, einsum_121, einsum_122 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_41, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_41
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_160,
+            split_161,
+        ) = einsum_121
+        del einsum_121
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_162,
+            split_163,
+        ) = einsum_122
+        del einsum_122
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_58 = paddle._C_ops.add(einsum_111, index_select_6)
+        del einsum_111, index_select_6
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_59 = paddle._C_ops.add(add_58, einsum_120)
+        del add_58, einsum_120
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(add_59, full_16, float("0"), True)
+        del add_59
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_6 = paddle._C_ops.subtract(scale_10, scale_4)
+        del scale_10
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_6 = paddle._C_ops.softmax(subtract_6, 3)
+        del subtract_6
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_42 = [dropout_52, reshape_44]
+        del dropout_52, reshape_44
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_123, einsum_124, einsum_125 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_42, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_42
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_164,
+            split_165,
+        ) = einsum_124
+        del einsum_124
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_166,
+            split_167,
+        ) = einsum_125
+        del einsum_125
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_48 = paddle._C_ops.reshape(einsum_123, full_int_array_10)
+        del einsum_123
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_43 = [reshape_48, parameter_302]
+        del parameter_302, reshape_48
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_126, einsum_127, einsum_128 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_43, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_43
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_168,
+            split_169,
+        ) = einsum_127
+        del einsum_127
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_170,
+            split_171,
+        ) = einsum_128
+        del einsum_128
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_126, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_126
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_60 = paddle._C_ops.add(dropout_54, layer_norm_33)
+        del dropout_54, layer_norm_33
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_60, parameter_296, parameter_295, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_60, parameter_295, parameter_296
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_40 = paddle._C_ops.matmul(layer_norm_36, parameter_292, False, False)
+        del parameter_292
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_61 = paddle._C_ops.add(matmul_40, parameter_291)
+        del matmul_40, parameter_291
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_6 = paddle._C_ops.relu(add_61)
+        del add_61
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_6
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_41 = paddle._C_ops.matmul(dropout_56, parameter_290, False, False)
+        del dropout_56, parameter_290
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_62 = paddle._C_ops.add(matmul_41, parameter_289)
+        del matmul_41, parameter_289
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_62, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_62
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_63 = paddle._C_ops.add(dropout_58, layer_norm_36)
+        del dropout_58, layer_norm_36
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_63, parameter_294, parameter_293, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_63, parameter_293, parameter_294
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_39, parameter_288, False, False)
+        del parameter_288
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_49 = paddle._C_ops.reshape(matmul_42, full_int_array_5)
+        del matmul_42
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_39, parameter_287, False, False)
+        del parameter_287
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(matmul_43, full_int_array_5)
+        del matmul_43
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_44 = paddle._C_ops.matmul(layer_norm_39, parameter_286, False, False)
+        del parameter_286
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_51 = paddle._C_ops.reshape(matmul_44, full_int_array_5)
+        del matmul_44
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_2, parameter_284, False, False)
+        del parameter_284
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(matmul_45, full_int_array_6)
+        del matmul_45
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_64 = paddle._C_ops.add(reshape_49, parameter_281)
+        del parameter_281
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_44 = [add_64, reshape_50]
+        del add_64, reshape_50
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_129, einsum_130, einsum_131 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_44, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_44
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_172,
+            split_173,
+        ) = einsum_130
+        del einsum_130
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_174,
+            split_175,
+        ) = einsum_131
+        del einsum_131
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_65 = paddle._C_ops.add(reshape_49, parameter_283)
+        del parameter_283
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_45 = [add_65, reshape_52]
+        del add_65, reshape_52
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_132, einsum_133, einsum_134 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_45, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_45
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_176,
+            split_177,
+        ) = einsum_133
+        del einsum_133
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_178,
+            split_179,
+        ) = einsum_134
+        del einsum_134
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_53 = paddle._C_ops.reshape(einsum_132, full_int_array_7)
+        del einsum_132
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_7 = paddle._C_ops.slice(
+            reshape_53, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_53
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(slice_7, full_int_array_9)
+        del slice_7
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_7 = paddle._C_ops.index_select(reshape_54, arange_2, 3)
+        del reshape_54
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_66 = paddle._C_ops.add(reshape_49, parameter_282)
+        del parameter_282, reshape_49
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_46 = [add_66, parameter_280]
+        del add_66, parameter_280
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_135, einsum_136, einsum_137 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_46, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_46
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_180,
+            split_181,
+        ) = einsum_136
+        del einsum_136
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_182,
+            split_183,
+        ) = einsum_137
+        del einsum_137
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_47 = [cast_5, einsum_135]
+        del einsum_135
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_138, einsum_139, einsum_140 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_47, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_47
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_184,
+            split_185,
+        ) = einsum_139
+        del einsum_139
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_186,
+            split_187,
+        ) = einsum_140
+        del einsum_140
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_67 = paddle._C_ops.add(einsum_129, index_select_7)
+        del einsum_129, index_select_7
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_68 = paddle._C_ops.add(add_67, einsum_138)
+        del add_67, einsum_138
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(add_68, full_16, float("0"), True)
+        del add_68
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_7 = paddle._C_ops.subtract(scale_11, scale_4)
+        del scale_11
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_7 = paddle._C_ops.softmax(subtract_7, 3)
+        del subtract_7
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_48 = [dropout_60, reshape_51]
+        del dropout_60, reshape_51
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_141, einsum_142, einsum_143 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_48, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_48
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_188,
+            split_189,
+        ) = einsum_142
+        del einsum_142
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_190,
+            split_191,
+        ) = einsum_143
+        del einsum_143
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_55 = paddle._C_ops.reshape(einsum_141, full_int_array_10)
+        del einsum_141
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_49 = [reshape_55, parameter_285]
+        del parameter_285, reshape_55
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_144, einsum_145, einsum_146 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_49, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_49
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_192,
+            split_193,
+        ) = einsum_145
+        del einsum_145
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_194,
+            split_195,
+        ) = einsum_146
+        del einsum_146
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_144, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_144
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_69 = paddle._C_ops.add(dropout_62, layer_norm_39)
+        del dropout_62, layer_norm_39
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_69, parameter_279, parameter_278, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_69, parameter_278, parameter_279
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_46 = paddle._C_ops.matmul(layer_norm_42, parameter_275, False, False)
+        del parameter_275
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_70 = paddle._C_ops.add(matmul_46, parameter_274)
+        del matmul_46, parameter_274
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_7 = paddle._C_ops.relu(add_70)
+        del add_70
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_7
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_47 = paddle._C_ops.matmul(dropout_64, parameter_273, False, False)
+        del dropout_64, parameter_273
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_71 = paddle._C_ops.add(matmul_47, parameter_272)
+        del matmul_47, parameter_272
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_71, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_71
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_72 = paddle._C_ops.add(dropout_66, layer_norm_42)
+        del dropout_66, layer_norm_42
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_72, parameter_277, parameter_276, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_72, parameter_276, parameter_277
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_48 = paddle._C_ops.matmul(layer_norm_45, parameter_271, False, False)
+        del parameter_271
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(matmul_48, full_int_array_5)
+        del matmul_48
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_45, parameter_270, False, False)
+        del parameter_270
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_57 = paddle._C_ops.reshape(matmul_49, full_int_array_5)
+        del matmul_49
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_45, parameter_269, False, False)
+        del parameter_269
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(matmul_50, full_int_array_5)
+        del matmul_50
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_51 = paddle._C_ops.matmul(dropout_2, parameter_267, False, False)
+        del parameter_267
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_59 = paddle._C_ops.reshape(matmul_51, full_int_array_6)
+        del matmul_51
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_73 = paddle._C_ops.add(reshape_56, parameter_264)
+        del parameter_264
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_50 = [add_73, reshape_57]
+        del add_73, reshape_57
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_147, einsum_148, einsum_149 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_50, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_50
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_196,
+            split_197,
+        ) = einsum_148
+        del einsum_148
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_198,
+            split_199,
+        ) = einsum_149
+        del einsum_149
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_74 = paddle._C_ops.add(reshape_56, parameter_266)
+        del parameter_266
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_51 = [add_74, reshape_59]
+        del add_74, reshape_59
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_150, einsum_151, einsum_152 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_51, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_51
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_200,
+            split_201,
+        ) = einsum_151
+        del einsum_151
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_202,
+            split_203,
+        ) = einsum_152
+        del einsum_152
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(einsum_150, full_int_array_7)
+        del einsum_150
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_8 = paddle._C_ops.slice(
+            reshape_60, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_60
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_61 = paddle._C_ops.reshape(slice_8, full_int_array_9)
+        del slice_8
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_8 = paddle._C_ops.index_select(reshape_61, arange_2, 3)
+        del reshape_61
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_75 = paddle._C_ops.add(reshape_56, parameter_265)
+        del parameter_265, reshape_56
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_52 = [add_75, parameter_263]
+        del add_75, parameter_263
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_153, einsum_154, einsum_155 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_52, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_52
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_204,
+            split_205,
+        ) = einsum_154
+        del einsum_154
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_206,
+            split_207,
+        ) = einsum_155
+        del einsum_155
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_53 = [cast_5, einsum_153]
+        del einsum_153
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_156, einsum_157, einsum_158 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_53, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_53
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_208,
+            split_209,
+        ) = einsum_157
+        del einsum_157
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_210,
+            split_211,
+        ) = einsum_158
+        del einsum_158
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_76 = paddle._C_ops.add(einsum_147, index_select_8)
+        del einsum_147, index_select_8
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_77 = paddle._C_ops.add(add_76, einsum_156)
+        del add_76, einsum_156
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(add_77, full_16, float("0"), True)
+        del add_77
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_8 = paddle._C_ops.subtract(scale_12, scale_4)
+        del scale_12
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_8 = paddle._C_ops.softmax(subtract_8, 3)
+        del subtract_8
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_54 = [dropout_68, reshape_58]
+        del dropout_68, reshape_58
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_159, einsum_160, einsum_161 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_54, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_54
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_212,
+            split_213,
+        ) = einsum_160
+        del einsum_160
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_214,
+            split_215,
+        ) = einsum_161
+        del einsum_161
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_62 = paddle._C_ops.reshape(einsum_159, full_int_array_10)
+        del einsum_159
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_55 = [reshape_62, parameter_268]
+        del parameter_268, reshape_62
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_162, einsum_163, einsum_164 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_55, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_55
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_216,
+            split_217,
+        ) = einsum_163
+        del einsum_163
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_218,
+            split_219,
+        ) = einsum_164
+        del einsum_164
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_162, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_162
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_78 = paddle._C_ops.add(dropout_70, layer_norm_45)
+        del dropout_70, layer_norm_45
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_78, parameter_262, parameter_261, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_78, parameter_261, parameter_262
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_52 = paddle._C_ops.matmul(layer_norm_48, parameter_258, False, False)
+        del parameter_258
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_79 = paddle._C_ops.add(matmul_52, parameter_257)
+        del matmul_52, parameter_257
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_8 = paddle._C_ops.relu(add_79)
+        del add_79
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_8
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_72, parameter_256, False, False)
+        del dropout_72, parameter_256
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_80 = paddle._C_ops.add(matmul_53, parameter_255)
+        del matmul_53, parameter_255
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_80, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_80
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_81 = paddle._C_ops.add(dropout_74, layer_norm_48)
+        del dropout_74, layer_norm_48
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_81, parameter_260, parameter_259, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_81, parameter_259, parameter_260
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_54 = paddle._C_ops.matmul(layer_norm_51, parameter_254, False, False)
+        del parameter_254
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_63 = paddle._C_ops.reshape(matmul_54, full_int_array_5)
+        del matmul_54
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_51, parameter_253, False, False)
+        del parameter_253
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(matmul_55, full_int_array_5)
+        del matmul_55
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_56 = paddle._C_ops.matmul(layer_norm_51, parameter_252, False, False)
+        del parameter_252
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_65 = paddle._C_ops.reshape(matmul_56, full_int_array_5)
+        del matmul_56
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_57 = paddle._C_ops.matmul(dropout_2, parameter_250, False, False)
+        del parameter_250
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(matmul_57, full_int_array_6)
+        del matmul_57
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_82 = paddle._C_ops.add(reshape_63, parameter_247)
+        del parameter_247
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_56 = [add_82, reshape_64]
+        del add_82, reshape_64
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_165, einsum_166, einsum_167 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_56, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_56
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_220,
+            split_221,
+        ) = einsum_166
+        del einsum_166
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_222,
+            split_223,
+        ) = einsum_167
+        del einsum_167
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_83 = paddle._C_ops.add(reshape_63, parameter_249)
+        del parameter_249
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_57 = [add_83, reshape_66]
+        del add_83, reshape_66
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_168, einsum_169, einsum_170 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_57, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_57
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_224,
+            split_225,
+        ) = einsum_169
+        del einsum_169
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_226,
+            split_227,
+        ) = einsum_170
+        del einsum_170
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_67 = paddle._C_ops.reshape(einsum_168, full_int_array_7)
+        del einsum_168
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_9 = paddle._C_ops.slice(
+            reshape_67, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_67
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(slice_9, full_int_array_9)
+        del slice_9
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_9 = paddle._C_ops.index_select(reshape_68, arange_2, 3)
+        del reshape_68
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_84 = paddle._C_ops.add(reshape_63, parameter_248)
+        del parameter_248, reshape_63
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_58 = [add_84, parameter_246]
+        del add_84, parameter_246
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_171, einsum_172, einsum_173 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_58, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_58
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_228,
+            split_229,
+        ) = einsum_172
+        del einsum_172
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_230,
+            split_231,
+        ) = einsum_173
+        del einsum_173
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_59 = [cast_5, einsum_171]
+        del einsum_171
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_174, einsum_175, einsum_176 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_59, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_59
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_232,
+            split_233,
+        ) = einsum_175
+        del einsum_175
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_234,
+            split_235,
+        ) = einsum_176
+        del einsum_176
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_85 = paddle._C_ops.add(einsum_165, index_select_9)
+        del einsum_165, index_select_9
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_86 = paddle._C_ops.add(add_85, einsum_174)
+        del add_85, einsum_174
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(add_86, full_16, float("0"), True)
+        del add_86
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_9 = paddle._C_ops.subtract(scale_13, scale_4)
+        del scale_13
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_9 = paddle._C_ops.softmax(subtract_9, 3)
+        del subtract_9
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_60 = [dropout_76, reshape_65]
+        del dropout_76, reshape_65
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_177, einsum_178, einsum_179 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_60, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_60
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_236,
+            split_237,
+        ) = einsum_178
+        del einsum_178
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_238,
+            split_239,
+        ) = einsum_179
+        del einsum_179
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_69 = paddle._C_ops.reshape(einsum_177, full_int_array_10)
+        del einsum_177
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_61 = [reshape_69, parameter_251]
+        del parameter_251, reshape_69
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_180, einsum_181, einsum_182 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_61, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_61
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_240,
+            split_241,
+        ) = einsum_181
+        del einsum_181
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_242,
+            split_243,
+        ) = einsum_182
+        del einsum_182
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_180, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_180
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_87 = paddle._C_ops.add(dropout_78, layer_norm_51)
+        del dropout_78, layer_norm_51
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_87, parameter_245, parameter_244, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_87, parameter_244, parameter_245
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_54, parameter_241, False, False)
+        del parameter_241
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_88 = paddle._C_ops.add(matmul_58, parameter_240)
+        del matmul_58, parameter_240
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_9 = paddle._C_ops.relu(add_88)
+        del add_88
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_9
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_59 = paddle._C_ops.matmul(dropout_80, parameter_239, False, False)
+        del dropout_80, parameter_239
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_89 = paddle._C_ops.add(matmul_59, parameter_238)
+        del matmul_59, parameter_238
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_89, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_89
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_90 = paddle._C_ops.add(dropout_82, layer_norm_54)
+        del dropout_82, layer_norm_54
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_90, parameter_243, parameter_242, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_90, parameter_242, parameter_243
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_60 = paddle._C_ops.matmul(layer_norm_57, parameter_237, False, False)
+        del parameter_237
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(matmul_60, full_int_array_5)
+        del matmul_60
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_61 = paddle._C_ops.matmul(layer_norm_57, parameter_236, False, False)
+        del parameter_236
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_71 = paddle._C_ops.reshape(matmul_61, full_int_array_5)
+        del matmul_61
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_62 = paddle._C_ops.matmul(layer_norm_57, parameter_235, False, False)
+        del parameter_235
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_72 = paddle._C_ops.reshape(matmul_62, full_int_array_5)
+        del matmul_62
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_63 = paddle._C_ops.matmul(dropout_2, parameter_233, False, False)
+        del parameter_233
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_73 = paddle._C_ops.reshape(matmul_63, full_int_array_6)
+        del matmul_63
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_91 = paddle._C_ops.add(reshape_70, parameter_230)
+        del parameter_230
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_62 = [add_91, reshape_71]
+        del add_91, reshape_71
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_183, einsum_184, einsum_185 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_62, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_62
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_244,
+            split_245,
+        ) = einsum_184
+        del einsum_184
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_246,
+            split_247,
+        ) = einsum_185
+        del einsum_185
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_92 = paddle._C_ops.add(reshape_70, parameter_232)
+        del parameter_232
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_63 = [add_92, reshape_73]
+        del add_92, reshape_73
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_186, einsum_187, einsum_188 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_63, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_63
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_248,
+            split_249,
+        ) = einsum_187
+        del einsum_187
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_250,
+            split_251,
+        ) = einsum_188
+        del einsum_188
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_74 = paddle._C_ops.reshape(einsum_186, full_int_array_7)
+        del einsum_186
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_10 = paddle._C_ops.slice(
+            reshape_74, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_74
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_75 = paddle._C_ops.reshape(slice_10, full_int_array_9)
+        del slice_10
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_10 = paddle._C_ops.index_select(reshape_75, arange_2, 3)
+        del reshape_75
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_93 = paddle._C_ops.add(reshape_70, parameter_231)
+        del parameter_231, reshape_70
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_64 = [add_93, parameter_229]
+        del add_93, parameter_229
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_189, einsum_190, einsum_191 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_64, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_64
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_252,
+            split_253,
+        ) = einsum_190
+        del einsum_190
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_254,
+            split_255,
+        ) = einsum_191
+        del einsum_191
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_65 = [cast_5, einsum_189]
+        del einsum_189
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_192, einsum_193, einsum_194 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_65, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_65
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_256,
+            split_257,
+        ) = einsum_193
+        del einsum_193
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_258,
+            split_259,
+        ) = einsum_194
+        del einsum_194
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_94 = paddle._C_ops.add(einsum_183, index_select_10)
+        del einsum_183, index_select_10
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_95 = paddle._C_ops.add(add_94, einsum_192)
+        del add_94, einsum_192
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(add_95, full_16, float("0"), True)
+        del add_95
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_10 = paddle._C_ops.subtract(scale_14, scale_4)
+        del scale_14
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_10 = paddle._C_ops.softmax(subtract_10, 3)
+        del subtract_10
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_66 = [dropout_84, reshape_72]
+        del dropout_84, reshape_72
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_195, einsum_196, einsum_197 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_66, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_66
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_260,
+            split_261,
+        ) = einsum_196
+        del einsum_196
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_262,
+            split_263,
+        ) = einsum_197
+        del einsum_197
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_76 = paddle._C_ops.reshape(einsum_195, full_int_array_10)
+        del einsum_195
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_67 = [reshape_76, parameter_234]
+        del parameter_234, reshape_76
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_198, einsum_199, einsum_200 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_67, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_67
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_264,
+            split_265,
+        ) = einsum_199
+        del einsum_199
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_266,
+            split_267,
+        ) = einsum_200
+        del einsum_200
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_198, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_198
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_96 = paddle._C_ops.add(dropout_86, layer_norm_57)
+        del dropout_86, layer_norm_57
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_96, parameter_228, parameter_227, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_96, parameter_227, parameter_228
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_64 = paddle._C_ops.matmul(layer_norm_60, parameter_224, False, False)
+        del parameter_224
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_97 = paddle._C_ops.add(matmul_64, parameter_223)
+        del matmul_64, parameter_223
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_10 = paddle._C_ops.relu(add_97)
+        del add_97
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_10
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_65 = paddle._C_ops.matmul(dropout_88, parameter_222, False, False)
+        del dropout_88, parameter_222
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_98 = paddle._C_ops.add(matmul_65, parameter_221)
+        del matmul_65, parameter_221
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_98, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_98
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_99 = paddle._C_ops.add(dropout_90, layer_norm_60)
+        del dropout_90, layer_norm_60
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_99, parameter_226, parameter_225, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_99, parameter_225, parameter_226
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_63, parameter_220, False, False)
+        del parameter_220
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_77 = paddle._C_ops.reshape(matmul_66, full_int_array_5)
+        del matmul_66
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_63, parameter_219, False, False)
+        del parameter_219
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_78 = paddle._C_ops.reshape(matmul_67, full_int_array_5)
+        del matmul_67
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_68 = paddle._C_ops.matmul(layer_norm_63, parameter_218, False, False)
+        del parameter_218
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_79 = paddle._C_ops.reshape(matmul_68, full_int_array_5)
+        del matmul_68
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_69 = paddle._C_ops.matmul(dropout_2, parameter_216, False, False)
+        del parameter_216
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_80 = paddle._C_ops.reshape(matmul_69, full_int_array_6)
+        del matmul_69
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_100 = paddle._C_ops.add(reshape_77, parameter_213)
+        del parameter_213
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_68 = [add_100, reshape_78]
+        del add_100, reshape_78
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_201, einsum_202, einsum_203 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_68, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_68
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_268,
+            split_269,
+        ) = einsum_202
+        del einsum_202
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_270,
+            split_271,
+        ) = einsum_203
+        del einsum_203
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_101 = paddle._C_ops.add(reshape_77, parameter_215)
+        del parameter_215
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_69 = [add_101, reshape_80]
+        del add_101, reshape_80
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_204, einsum_205, einsum_206 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_69, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_69
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_272,
+            split_273,
+        ) = einsum_205
+        del einsum_205
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_274,
+            split_275,
+        ) = einsum_206
+        del einsum_206
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_81 = paddle._C_ops.reshape(einsum_204, full_int_array_7)
+        del einsum_204
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_11 = paddle._C_ops.slice(
+            reshape_81, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_81
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_82 = paddle._C_ops.reshape(slice_11, full_int_array_9)
+        del slice_11
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_11 = paddle._C_ops.index_select(reshape_82, arange_2, 3)
+        del reshape_82
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_102 = paddle._C_ops.add(reshape_77, parameter_214)
+        del parameter_214, reshape_77
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_70 = [add_102, parameter_212]
+        del add_102, parameter_212
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_207, einsum_208, einsum_209 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_70, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_70
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_276,
+            split_277,
+        ) = einsum_208
+        del einsum_208
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_278,
+            split_279,
+        ) = einsum_209
+        del einsum_209
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_71 = [cast_5, einsum_207]
+        del einsum_207
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_210, einsum_211, einsum_212 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_71, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_71
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_280,
+            split_281,
+        ) = einsum_211
+        del einsum_211
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_282,
+            split_283,
+        ) = einsum_212
+        del einsum_212
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_103 = paddle._C_ops.add(einsum_201, index_select_11)
+        del einsum_201, index_select_11
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_104 = paddle._C_ops.add(add_103, einsum_210)
+        del add_103, einsum_210
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(add_104, full_16, float("0"), True)
+        del add_104
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_11 = paddle._C_ops.subtract(scale_15, scale_4)
+        del scale_15
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_11 = paddle._C_ops.softmax(subtract_11, 3)
+        del subtract_11
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_72 = [dropout_92, reshape_79]
+        del dropout_92, reshape_79
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_213, einsum_214, einsum_215 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_72, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_72
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_284,
+            split_285,
+        ) = einsum_214
+        del einsum_214
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_286,
+            split_287,
+        ) = einsum_215
+        del einsum_215
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_83 = paddle._C_ops.reshape(einsum_213, full_int_array_10)
+        del einsum_213
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_73 = [reshape_83, parameter_217]
+        del parameter_217, reshape_83
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_216, einsum_217, einsum_218 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_73, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_73
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_288,
+            split_289,
+        ) = einsum_217
+        del einsum_217
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_290,
+            split_291,
+        ) = einsum_218
+        del einsum_218
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_216, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_216
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_105 = paddle._C_ops.add(dropout_94, layer_norm_63)
+        del dropout_94, layer_norm_63
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_105, parameter_211, parameter_210, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_105, parameter_210, parameter_211
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_70 = paddle._C_ops.matmul(layer_norm_66, parameter_207, False, False)
+        del parameter_207
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_106 = paddle._C_ops.add(matmul_70, parameter_206)
+        del matmul_70, parameter_206
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_11 = paddle._C_ops.relu(add_106)
+        del add_106
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_11
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_71 = paddle._C_ops.matmul(dropout_96, parameter_205, False, False)
+        del dropout_96, parameter_205
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_107 = paddle._C_ops.add(matmul_71, parameter_204)
+        del matmul_71, parameter_204
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_107, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_107
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_108 = paddle._C_ops.add(dropout_98, layer_norm_66)
+        del dropout_98, layer_norm_66
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_108, parameter_209, parameter_208, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_108, parameter_208, parameter_209
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_72 = paddle._C_ops.matmul(layer_norm_69, parameter_203, False, False)
+        del parameter_203
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_84 = paddle._C_ops.reshape(matmul_72, full_int_array_5)
+        del matmul_72
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_73 = paddle._C_ops.matmul(layer_norm_69, parameter_202, False, False)
+        del parameter_202
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_85 = paddle._C_ops.reshape(matmul_73, full_int_array_5)
+        del matmul_73
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_69, parameter_201, False, False)
+        del parameter_201
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_86 = paddle._C_ops.reshape(matmul_74, full_int_array_5)
+        del matmul_74
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_75 = paddle._C_ops.matmul(dropout_2, parameter_199, False, False)
+        del parameter_199
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_87 = paddle._C_ops.reshape(matmul_75, full_int_array_6)
+        del matmul_75
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_109 = paddle._C_ops.add(reshape_84, parameter_196)
+        del parameter_196
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_74 = [add_109, reshape_85]
+        del add_109, reshape_85
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_219, einsum_220, einsum_221 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_74, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_74
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_292,
+            split_293,
+        ) = einsum_220
+        del einsum_220
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_294,
+            split_295,
+        ) = einsum_221
+        del einsum_221
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_110 = paddle._C_ops.add(reshape_84, parameter_198)
+        del parameter_198
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_75 = [add_110, reshape_87]
+        del add_110, reshape_87
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_222, einsum_223, einsum_224 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_75, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_75
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_296,
+            split_297,
+        ) = einsum_223
+        del einsum_223
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_298,
+            split_299,
+        ) = einsum_224
+        del einsum_224
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_88 = paddle._C_ops.reshape(einsum_222, full_int_array_7)
+        del einsum_222
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_12 = paddle._C_ops.slice(
+            reshape_88, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_88
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_89 = paddle._C_ops.reshape(slice_12, full_int_array_9)
+        del slice_12
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_12 = paddle._C_ops.index_select(reshape_89, arange_2, 3)
+        del reshape_89
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_111 = paddle._C_ops.add(reshape_84, parameter_197)
+        del parameter_197, reshape_84
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_76 = [add_111, parameter_195]
+        del add_111, parameter_195
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_225, einsum_226, einsum_227 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_76, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_76
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_300,
+            split_301,
+        ) = einsum_226
+        del einsum_226
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_302,
+            split_303,
+        ) = einsum_227
+        del einsum_227
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_77 = [cast_5, einsum_225]
+        del einsum_225
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_228, einsum_229, einsum_230 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_77, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_77
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_304,
+            split_305,
+        ) = einsum_229
+        del einsum_229
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_306,
+            split_307,
+        ) = einsum_230
+        del einsum_230
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_112 = paddle._C_ops.add(einsum_219, index_select_12)
+        del einsum_219, index_select_12
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_113 = paddle._C_ops.add(add_112, einsum_228)
+        del add_112, einsum_228
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_16 = paddle._C_ops.scale(add_113, full_16, float("0"), True)
+        del add_113
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_12 = paddle._C_ops.subtract(scale_16, scale_4)
+        del scale_16
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_12 = paddle._C_ops.softmax(subtract_12, 3)
+        del subtract_12
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_12, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_12
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_78 = [dropout_100, reshape_86]
+        del dropout_100, reshape_86
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_231, einsum_232, einsum_233 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_78, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_78
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_308,
+            split_309,
+        ) = einsum_232
+        del einsum_232
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_310,
+            split_311,
+        ) = einsum_233
+        del einsum_233
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_90 = paddle._C_ops.reshape(einsum_231, full_int_array_10)
+        del einsum_231
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_79 = [reshape_90, parameter_200]
+        del parameter_200, reshape_90
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_234, einsum_235, einsum_236 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_79, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_79
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_312,
+            split_313,
+        ) = einsum_235
+        del einsum_235
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_314,
+            split_315,
+        ) = einsum_236
+        del einsum_236
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_102, dropout_103 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_234, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_234
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_114 = paddle._C_ops.add(dropout_102, layer_norm_69)
+        del dropout_102, layer_norm_69
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_72, layer_norm_73, layer_norm_74 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_114, parameter_194, parameter_193, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_114, parameter_193, parameter_194
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_76 = paddle._C_ops.matmul(layer_norm_72, parameter_190, False, False)
+        del parameter_190
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_115 = paddle._C_ops.add(matmul_76, parameter_189)
+        del matmul_76, parameter_189
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_12 = paddle._C_ops.relu(add_115)
+        del add_115
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_104, dropout_105 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_12, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_12
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_77 = paddle._C_ops.matmul(dropout_104, parameter_188, False, False)
+        del dropout_104, parameter_188
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_116 = paddle._C_ops.add(matmul_77, parameter_187)
+        del matmul_77, parameter_187
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_106, dropout_107 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_116, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_116
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_117 = paddle._C_ops.add(dropout_106, layer_norm_72)
+        del dropout_106, layer_norm_72
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_75, layer_norm_76, layer_norm_77 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_117, parameter_192, parameter_191, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_117, parameter_191, parameter_192
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_78 = paddle._C_ops.matmul(layer_norm_75, parameter_186, False, False)
+        del parameter_186
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_91 = paddle._C_ops.reshape(matmul_78, full_int_array_5)
+        del matmul_78
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_79 = paddle._C_ops.matmul(layer_norm_75, parameter_185, False, False)
+        del parameter_185
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_92 = paddle._C_ops.reshape(matmul_79, full_int_array_5)
+        del matmul_79
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_80 = paddle._C_ops.matmul(layer_norm_75, parameter_184, False, False)
+        del parameter_184
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_93 = paddle._C_ops.reshape(matmul_80, full_int_array_5)
+        del matmul_80
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_81 = paddle._C_ops.matmul(dropout_2, parameter_182, False, False)
+        del parameter_182
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_94 = paddle._C_ops.reshape(matmul_81, full_int_array_6)
+        del matmul_81
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_118 = paddle._C_ops.add(reshape_91, parameter_179)
+        del parameter_179
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_80 = [add_118, reshape_92]
+        del add_118, reshape_92
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_237, einsum_238, einsum_239 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_80, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_80
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_316,
+            split_317,
+        ) = einsum_238
+        del einsum_238
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_318,
+            split_319,
+        ) = einsum_239
+        del einsum_239
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_119 = paddle._C_ops.add(reshape_91, parameter_181)
+        del parameter_181
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_81 = [add_119, reshape_94]
+        del add_119, reshape_94
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_240, einsum_241, einsum_242 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_81, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_81
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_320,
+            split_321,
+        ) = einsum_241
+        del einsum_241
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_322,
+            split_323,
+        ) = einsum_242
+        del einsum_242
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_95 = paddle._C_ops.reshape(einsum_240, full_int_array_7)
+        del einsum_240
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_13 = paddle._C_ops.slice(
+            reshape_95, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_95
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_96 = paddle._C_ops.reshape(slice_13, full_int_array_9)
+        del slice_13
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_13 = paddle._C_ops.index_select(reshape_96, arange_2, 3)
+        del reshape_96
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_120 = paddle._C_ops.add(reshape_91, parameter_180)
+        del parameter_180, reshape_91
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_82 = [add_120, parameter_178]
+        del add_120, parameter_178
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_243, einsum_244, einsum_245 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_82, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_82
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_324,
+            split_325,
+        ) = einsum_244
+        del einsum_244
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_326,
+            split_327,
+        ) = einsum_245
+        del einsum_245
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_83 = [cast_5, einsum_243]
+        del einsum_243
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_246, einsum_247, einsum_248 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_83, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_83
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_328,
+            split_329,
+        ) = einsum_247
+        del einsum_247
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_330,
+            split_331,
+        ) = einsum_248
+        del einsum_248
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_121 = paddle._C_ops.add(einsum_237, index_select_13)
+        del einsum_237, index_select_13
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_122 = paddle._C_ops.add(add_121, einsum_246)
+        del add_121, einsum_246
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_17 = paddle._C_ops.scale(add_122, full_16, float("0"), True)
+        del add_122
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_13 = paddle._C_ops.subtract(scale_17, scale_4)
+        del scale_17
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_13 = paddle._C_ops.softmax(subtract_13, 3)
+        del subtract_13
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_108, dropout_109 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_13, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_13
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_84 = [dropout_108, reshape_93]
+        del dropout_108, reshape_93
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_249, einsum_250, einsum_251 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_84, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_84
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_332,
+            split_333,
+        ) = einsum_250
+        del einsum_250
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_334,
+            split_335,
+        ) = einsum_251
+        del einsum_251
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_97 = paddle._C_ops.reshape(einsum_249, full_int_array_10)
+        del einsum_249
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_85 = [reshape_97, parameter_183]
+        del parameter_183, reshape_97
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_252, einsum_253, einsum_254 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_85, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_85
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_336,
+            split_337,
+        ) = einsum_253
+        del einsum_253
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_338,
+            split_339,
+        ) = einsum_254
+        del einsum_254
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_110, dropout_111 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_252, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_252
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_123 = paddle._C_ops.add(dropout_110, layer_norm_75)
+        del dropout_110, layer_norm_75
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_78, layer_norm_79, layer_norm_80 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_123, parameter_177, parameter_176, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_123, parameter_176, parameter_177
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_78, parameter_173, False, False)
+        del parameter_173
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_124 = paddle._C_ops.add(matmul_82, parameter_172)
+        del matmul_82, parameter_172
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_13 = paddle._C_ops.relu(add_124)
+        del add_124
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_112, dropout_113 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_13, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_13
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_83 = paddle._C_ops.matmul(dropout_112, parameter_171, False, False)
+        del dropout_112, parameter_171
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_125 = paddle._C_ops.add(matmul_83, parameter_170)
+        del matmul_83, parameter_170
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_114, dropout_115 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_125, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_125
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_126 = paddle._C_ops.add(dropout_114, layer_norm_78)
+        del dropout_114, layer_norm_78
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_81, layer_norm_82, layer_norm_83 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_126, parameter_175, parameter_174, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_126, parameter_174, parameter_175
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_84 = paddle._C_ops.matmul(layer_norm_81, parameter_169, False, False)
+        del parameter_169
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_98 = paddle._C_ops.reshape(matmul_84, full_int_array_5)
+        del matmul_84
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_85 = paddle._C_ops.matmul(layer_norm_81, parameter_168, False, False)
+        del parameter_168
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_99 = paddle._C_ops.reshape(matmul_85, full_int_array_5)
+        del matmul_85
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_86 = paddle._C_ops.matmul(layer_norm_81, parameter_167, False, False)
+        del parameter_167
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_100 = paddle._C_ops.reshape(matmul_86, full_int_array_5)
+        del matmul_86
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_87 = paddle._C_ops.matmul(dropout_2, parameter_165, False, False)
+        del parameter_165
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_101 = paddle._C_ops.reshape(matmul_87, full_int_array_6)
+        del matmul_87
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_127 = paddle._C_ops.add(reshape_98, parameter_162)
+        del parameter_162
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_86 = [add_127, reshape_99]
+        del add_127, reshape_99
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_255, einsum_256, einsum_257 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_86, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_86
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_340,
+            split_341,
+        ) = einsum_256
+        del einsum_256
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_342,
+            split_343,
+        ) = einsum_257
+        del einsum_257
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_128 = paddle._C_ops.add(reshape_98, parameter_164)
+        del parameter_164
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_87 = [add_128, reshape_101]
+        del add_128, reshape_101
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_258, einsum_259, einsum_260 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_87, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_87
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_344,
+            split_345,
+        ) = einsum_259
+        del einsum_259
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_346,
+            split_347,
+        ) = einsum_260
+        del einsum_260
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_102 = paddle._C_ops.reshape(einsum_258, full_int_array_7)
+        del einsum_258
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_14 = paddle._C_ops.slice(
+            reshape_102, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_102
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_103 = paddle._C_ops.reshape(slice_14, full_int_array_9)
+        del slice_14
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_14 = paddle._C_ops.index_select(reshape_103, arange_2, 3)
+        del reshape_103
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_129 = paddle._C_ops.add(reshape_98, parameter_163)
+        del parameter_163, reshape_98
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_88 = [add_129, parameter_161]
+        del add_129, parameter_161
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_261, einsum_262, einsum_263 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_88, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_88
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_348,
+            split_349,
+        ) = einsum_262
+        del einsum_262
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_350,
+            split_351,
+        ) = einsum_263
+        del einsum_263
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_89 = [cast_5, einsum_261]
+        del einsum_261
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_264, einsum_265, einsum_266 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_89, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_89
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_352,
+            split_353,
+        ) = einsum_265
+        del einsum_265
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_354,
+            split_355,
+        ) = einsum_266
+        del einsum_266
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_130 = paddle._C_ops.add(einsum_255, index_select_14)
+        del einsum_255, index_select_14
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_131 = paddle._C_ops.add(add_130, einsum_264)
+        del add_130, einsum_264
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_18 = paddle._C_ops.scale(add_131, full_16, float("0"), True)
+        del add_131
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_14 = paddle._C_ops.subtract(scale_18, scale_4)
+        del scale_18
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_14 = paddle._C_ops.softmax(subtract_14, 3)
+        del subtract_14
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_116, dropout_117 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_14, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_14
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_90 = [dropout_116, reshape_100]
+        del dropout_116, reshape_100
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_267, einsum_268, einsum_269 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_90, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_90
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_356,
+            split_357,
+        ) = einsum_268
+        del einsum_268
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_358,
+            split_359,
+        ) = einsum_269
+        del einsum_269
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_104 = paddle._C_ops.reshape(einsum_267, full_int_array_10)
+        del einsum_267
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_91 = [reshape_104, parameter_166]
+        del parameter_166, reshape_104
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_270, einsum_271, einsum_272 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_91, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_91
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_360,
+            split_361,
+        ) = einsum_271
+        del einsum_271
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_362,
+            split_363,
+        ) = einsum_272
+        del einsum_272
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_118, dropout_119 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_270, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_270
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_132 = paddle._C_ops.add(dropout_118, layer_norm_81)
+        del dropout_118, layer_norm_81
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_84, layer_norm_85, layer_norm_86 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_132, parameter_160, parameter_159, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_132, parameter_159, parameter_160
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_88 = paddle._C_ops.matmul(layer_norm_84, parameter_156, False, False)
+        del parameter_156
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_133 = paddle._C_ops.add(matmul_88, parameter_155)
+        del matmul_88, parameter_155
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_14 = paddle._C_ops.relu(add_133)
+        del add_133
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_120, dropout_121 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_14, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_14
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_89 = paddle._C_ops.matmul(dropout_120, parameter_154, False, False)
+        del dropout_120, parameter_154
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_134 = paddle._C_ops.add(matmul_89, parameter_153)
+        del matmul_89, parameter_153
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_122, dropout_123 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_134, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_134
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_135 = paddle._C_ops.add(dropout_122, layer_norm_84)
+        del dropout_122, layer_norm_84
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_87, layer_norm_88, layer_norm_89 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_135, parameter_158, parameter_157, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_135, parameter_157, parameter_158
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_87, parameter_152, False, False)
+        del parameter_152
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_105 = paddle._C_ops.reshape(matmul_90, full_int_array_5)
+        del matmul_90
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_87, parameter_151, False, False)
+        del parameter_151
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_106 = paddle._C_ops.reshape(matmul_91, full_int_array_5)
+        del matmul_91
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_92 = paddle._C_ops.matmul(layer_norm_87, parameter_150, False, False)
+        del parameter_150
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_107 = paddle._C_ops.reshape(matmul_92, full_int_array_5)
+        del matmul_92
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_93 = paddle._C_ops.matmul(dropout_2, parameter_148, False, False)
+        del parameter_148
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_108 = paddle._C_ops.reshape(matmul_93, full_int_array_6)
+        del matmul_93
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_136 = paddle._C_ops.add(reshape_105, parameter_145)
+        del parameter_145
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_92 = [add_136, reshape_106]
+        del add_136, reshape_106
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_273, einsum_274, einsum_275 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_92, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_92
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_364,
+            split_365,
+        ) = einsum_274
+        del einsum_274
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_366,
+            split_367,
+        ) = einsum_275
+        del einsum_275
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_137 = paddle._C_ops.add(reshape_105, parameter_147)
+        del parameter_147
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_93 = [add_137, reshape_108]
+        del add_137, reshape_108
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_276, einsum_277, einsum_278 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_93, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_93
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_368,
+            split_369,
+        ) = einsum_277
+        del einsum_277
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_370,
+            split_371,
+        ) = einsum_278
+        del einsum_278
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_109 = paddle._C_ops.reshape(einsum_276, full_int_array_7)
+        del einsum_276
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_15 = paddle._C_ops.slice(
+            reshape_109, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_109
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_110 = paddle._C_ops.reshape(slice_15, full_int_array_9)
+        del slice_15
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_15 = paddle._C_ops.index_select(reshape_110, arange_2, 3)
+        del reshape_110
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_138 = paddle._C_ops.add(reshape_105, parameter_146)
+        del parameter_146, reshape_105
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_94 = [add_138, parameter_144]
+        del add_138, parameter_144
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_279, einsum_280, einsum_281 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_94, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_94
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_372,
+            split_373,
+        ) = einsum_280
+        del einsum_280
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_374,
+            split_375,
+        ) = einsum_281
+        del einsum_281
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_95 = [cast_5, einsum_279]
+        del einsum_279
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_282, einsum_283, einsum_284 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_95, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_95
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_376,
+            split_377,
+        ) = einsum_283
+        del einsum_283
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_378,
+            split_379,
+        ) = einsum_284
+        del einsum_284
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_139 = paddle._C_ops.add(einsum_273, index_select_15)
+        del einsum_273, index_select_15
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_140 = paddle._C_ops.add(add_139, einsum_282)
+        del add_139, einsum_282
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_19 = paddle._C_ops.scale(add_140, full_16, float("0"), True)
+        del add_140
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_15 = paddle._C_ops.subtract(scale_19, scale_4)
+        del scale_19
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_15 = paddle._C_ops.softmax(subtract_15, 3)
+        del subtract_15
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_124, dropout_125 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_15, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_15
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_96 = [dropout_124, reshape_107]
+        del dropout_124, reshape_107
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_285, einsum_286, einsum_287 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_96, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_96
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_380,
+            split_381,
+        ) = einsum_286
+        del einsum_286
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_382,
+            split_383,
+        ) = einsum_287
+        del einsum_287
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_111 = paddle._C_ops.reshape(einsum_285, full_int_array_10)
+        del einsum_285
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_97 = [reshape_111, parameter_149]
+        del parameter_149, reshape_111
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_288, einsum_289, einsum_290 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_97, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_97
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_384,
+            split_385,
+        ) = einsum_289
+        del einsum_289
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_386,
+            split_387,
+        ) = einsum_290
+        del einsum_290
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_126, dropout_127 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_288, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_288
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_141 = paddle._C_ops.add(dropout_126, layer_norm_87)
+        del dropout_126, layer_norm_87
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_90, layer_norm_91, layer_norm_92 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_141, parameter_143, parameter_142, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_141, parameter_142, parameter_143
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_94 = paddle._C_ops.matmul(layer_norm_90, parameter_139, False, False)
+        del parameter_139
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_142 = paddle._C_ops.add(matmul_94, parameter_138)
+        del matmul_94, parameter_138
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_15 = paddle._C_ops.relu(add_142)
+        del add_142
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_128, dropout_129 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_15, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_15
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_95 = paddle._C_ops.matmul(dropout_128, parameter_137, False, False)
+        del dropout_128, parameter_137
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_143 = paddle._C_ops.add(matmul_95, parameter_136)
+        del matmul_95, parameter_136
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_130, dropout_131 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_143, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_143
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_144 = paddle._C_ops.add(dropout_130, layer_norm_90)
+        del dropout_130, layer_norm_90
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_93, layer_norm_94, layer_norm_95 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_144, parameter_141, parameter_140, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_144, parameter_140, parameter_141
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_96 = paddle._C_ops.matmul(layer_norm_93, parameter_135, False, False)
+        del parameter_135
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_112 = paddle._C_ops.reshape(matmul_96, full_int_array_5)
+        del matmul_96
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_97 = paddle._C_ops.matmul(layer_norm_93, parameter_134, False, False)
+        del parameter_134
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_113 = paddle._C_ops.reshape(matmul_97, full_int_array_5)
+        del matmul_97
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_98 = paddle._C_ops.matmul(layer_norm_93, parameter_133, False, False)
+        del parameter_133
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_114 = paddle._C_ops.reshape(matmul_98, full_int_array_5)
+        del matmul_98
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_99 = paddle._C_ops.matmul(dropout_2, parameter_131, False, False)
+        del parameter_131
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_115 = paddle._C_ops.reshape(matmul_99, full_int_array_6)
+        del matmul_99
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_145 = paddle._C_ops.add(reshape_112, parameter_128)
+        del parameter_128
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_98 = [add_145, reshape_113]
+        del add_145, reshape_113
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_291, einsum_292, einsum_293 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_98, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_98
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_388,
+            split_389,
+        ) = einsum_292
+        del einsum_292
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_390,
+            split_391,
+        ) = einsum_293
+        del einsum_293
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_146 = paddle._C_ops.add(reshape_112, parameter_130)
+        del parameter_130
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_99 = [add_146, reshape_115]
+        del add_146, reshape_115
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_294, einsum_295, einsum_296 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_99, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_99
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_392,
+            split_393,
+        ) = einsum_295
+        del einsum_295
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_394,
+            split_395,
+        ) = einsum_296
+        del einsum_296
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_116 = paddle._C_ops.reshape(einsum_294, full_int_array_7)
+        del einsum_294
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_16 = paddle._C_ops.slice(
+            reshape_116, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_116
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_117 = paddle._C_ops.reshape(slice_16, full_int_array_9)
+        del slice_16
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_16 = paddle._C_ops.index_select(reshape_117, arange_2, 3)
+        del reshape_117
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_147 = paddle._C_ops.add(reshape_112, parameter_129)
+        del parameter_129, reshape_112
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_100 = [add_147, parameter_127]
+        del add_147, parameter_127
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_297, einsum_298, einsum_299 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_100, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_100
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_396,
+            split_397,
+        ) = einsum_298
+        del einsum_298
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_398,
+            split_399,
+        ) = einsum_299
+        del einsum_299
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_101 = [cast_5, einsum_297]
+        del einsum_297
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_300, einsum_301, einsum_302 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_101, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_101
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_400,
+            split_401,
+        ) = einsum_301
+        del einsum_301
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_402,
+            split_403,
+        ) = einsum_302
+        del einsum_302
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_148 = paddle._C_ops.add(einsum_291, index_select_16)
+        del einsum_291, index_select_16
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_149 = paddle._C_ops.add(add_148, einsum_300)
+        del add_148, einsum_300
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_20 = paddle._C_ops.scale(add_149, full_16, float("0"), True)
+        del add_149
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_16 = paddle._C_ops.subtract(scale_20, scale_4)
+        del scale_20
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_16 = paddle._C_ops.softmax(subtract_16, 3)
+        del subtract_16
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_132, dropout_133 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_16, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_16
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_102 = [dropout_132, reshape_114]
+        del dropout_132, reshape_114
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_303, einsum_304, einsum_305 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_102, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_102
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_404,
+            split_405,
+        ) = einsum_304
+        del einsum_304
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_406,
+            split_407,
+        ) = einsum_305
+        del einsum_305
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_118 = paddle._C_ops.reshape(einsum_303, full_int_array_10)
+        del einsum_303
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_103 = [reshape_118, parameter_132]
+        del parameter_132, reshape_118
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_306, einsum_307, einsum_308 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_103, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_103
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_408,
+            split_409,
+        ) = einsum_307
+        del einsum_307
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_410,
+            split_411,
+        ) = einsum_308
+        del einsum_308
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_134, dropout_135 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_306, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_306
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_150 = paddle._C_ops.add(dropout_134, layer_norm_93)
+        del dropout_134, layer_norm_93
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_96, layer_norm_97, layer_norm_98 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_150, parameter_126, parameter_125, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_150, parameter_125, parameter_126
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_100 = paddle._C_ops.matmul(layer_norm_96, parameter_122, False, False)
+        del parameter_122
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_151 = paddle._C_ops.add(matmul_100, parameter_121)
+        del matmul_100, parameter_121
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_16 = paddle._C_ops.relu(add_151)
+        del add_151
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_136, dropout_137 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_16, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_16
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_101 = paddle._C_ops.matmul(dropout_136, parameter_120, False, False)
+        del dropout_136, parameter_120
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_152 = paddle._C_ops.add(matmul_101, parameter_119)
+        del matmul_101, parameter_119
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_138, dropout_139 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_152, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_152
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_153 = paddle._C_ops.add(dropout_138, layer_norm_96)
+        del dropout_138, layer_norm_96
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_99, layer_norm_100, layer_norm_101 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_153, parameter_124, parameter_123, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_153, parameter_123, parameter_124
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_102 = paddle._C_ops.matmul(layer_norm_99, parameter_118, False, False)
+        del parameter_118
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_119 = paddle._C_ops.reshape(matmul_102, full_int_array_5)
+        del matmul_102
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_103 = paddle._C_ops.matmul(layer_norm_99, parameter_117, False, False)
+        del parameter_117
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_120 = paddle._C_ops.reshape(matmul_103, full_int_array_5)
+        del matmul_103
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_104 = paddle._C_ops.matmul(layer_norm_99, parameter_116, False, False)
+        del parameter_116
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_121 = paddle._C_ops.reshape(matmul_104, full_int_array_5)
+        del matmul_104
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_105 = paddle._C_ops.matmul(dropout_2, parameter_114, False, False)
+        del parameter_114
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_122 = paddle._C_ops.reshape(matmul_105, full_int_array_6)
+        del matmul_105
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_154 = paddle._C_ops.add(reshape_119, parameter_111)
+        del parameter_111
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_104 = [add_154, reshape_120]
+        del add_154, reshape_120
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_309, einsum_310, einsum_311 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_104, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_104
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_412,
+            split_413,
+        ) = einsum_310
+        del einsum_310
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_414,
+            split_415,
+        ) = einsum_311
+        del einsum_311
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_155 = paddle._C_ops.add(reshape_119, parameter_113)
+        del parameter_113
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_105 = [add_155, reshape_122]
+        del add_155, reshape_122
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_312, einsum_313, einsum_314 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_105, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_105
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_416,
+            split_417,
+        ) = einsum_313
+        del einsum_313
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_418,
+            split_419,
+        ) = einsum_314
+        del einsum_314
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_123 = paddle._C_ops.reshape(einsum_312, full_int_array_7)
+        del einsum_312
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_17 = paddle._C_ops.slice(
+            reshape_123, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_123
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_124 = paddle._C_ops.reshape(slice_17, full_int_array_9)
+        del slice_17
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_17 = paddle._C_ops.index_select(reshape_124, arange_2, 3)
+        del reshape_124
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_156 = paddle._C_ops.add(reshape_119, parameter_112)
+        del parameter_112, reshape_119
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_106 = [add_156, parameter_110]
+        del add_156, parameter_110
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_315, einsum_316, einsum_317 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_106, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_106
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_420,
+            split_421,
+        ) = einsum_316
+        del einsum_316
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_422,
+            split_423,
+        ) = einsum_317
+        del einsum_317
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_107 = [cast_5, einsum_315]
+        del einsum_315
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_318, einsum_319, einsum_320 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_107, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_107
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_424,
+            split_425,
+        ) = einsum_319
+        del einsum_319
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_426,
+            split_427,
+        ) = einsum_320
+        del einsum_320
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_157 = paddle._C_ops.add(einsum_309, index_select_17)
+        del einsum_309, index_select_17
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_158 = paddle._C_ops.add(add_157, einsum_318)
+        del add_157, einsum_318
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_21 = paddle._C_ops.scale(add_158, full_16, float("0"), True)
+        del add_158
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_17 = paddle._C_ops.subtract(scale_21, scale_4)
+        del scale_21
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_17 = paddle._C_ops.softmax(subtract_17, 3)
+        del subtract_17
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_140, dropout_141 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_17
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_108 = [dropout_140, reshape_121]
+        del dropout_140, reshape_121
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_321, einsum_322, einsum_323 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_108, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_108
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_428,
+            split_429,
+        ) = einsum_322
+        del einsum_322
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_430,
+            split_431,
+        ) = einsum_323
+        del einsum_323
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_125 = paddle._C_ops.reshape(einsum_321, full_int_array_10)
+        del einsum_321
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_109 = [reshape_125, parameter_115]
+        del parameter_115, reshape_125
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_324, einsum_325, einsum_326 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_109, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_109
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_432,
+            split_433,
+        ) = einsum_325
+        del einsum_325
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_434,
+            split_435,
+        ) = einsum_326
+        del einsum_326
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_142, dropout_143 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_324, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_324
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_159 = paddle._C_ops.add(dropout_142, layer_norm_99)
+        del dropout_142, layer_norm_99
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_102, layer_norm_103, layer_norm_104 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_159, parameter_109, parameter_108, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_159, parameter_108, parameter_109
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_106 = paddle._C_ops.matmul(layer_norm_102, parameter_105, False, False)
+        del parameter_105
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_160 = paddle._C_ops.add(matmul_106, parameter_104)
+        del matmul_106, parameter_104
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_17 = paddle._C_ops.relu(add_160)
+        del add_160
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_144, dropout_145 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_17
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_107 = paddle._C_ops.matmul(dropout_144, parameter_103, False, False)
+        del dropout_144, parameter_103
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_161 = paddle._C_ops.add(matmul_107, parameter_102)
+        del matmul_107, parameter_102
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_146, dropout_147 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_161, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_161
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_162 = paddle._C_ops.add(dropout_146, layer_norm_102)
+        del dropout_146, layer_norm_102
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_105, layer_norm_106, layer_norm_107 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_162, parameter_107, parameter_106, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_162, parameter_106, parameter_107
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_108 = paddle._C_ops.matmul(layer_norm_105, parameter_101, False, False)
+        del parameter_101
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_126 = paddle._C_ops.reshape(matmul_108, full_int_array_5)
+        del matmul_108
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_109 = paddle._C_ops.matmul(layer_norm_105, parameter_100, False, False)
+        del parameter_100
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_127 = paddle._C_ops.reshape(matmul_109, full_int_array_5)
+        del matmul_109
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_110 = paddle._C_ops.matmul(layer_norm_105, parameter_99, False, False)
+        del parameter_99
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_128 = paddle._C_ops.reshape(matmul_110, full_int_array_5)
+        del matmul_110
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_111 = paddle._C_ops.matmul(dropout_2, parameter_97, False, False)
+        del parameter_97
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_129 = paddle._C_ops.reshape(matmul_111, full_int_array_6)
+        del matmul_111
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_163 = paddle._C_ops.add(reshape_126, parameter_94)
+        del parameter_94
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_110 = [add_163, reshape_127]
+        del add_163, reshape_127
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_327, einsum_328, einsum_329 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_110, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_110
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_436,
+            split_437,
+        ) = einsum_328
+        del einsum_328
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_438,
+            split_439,
+        ) = einsum_329
+        del einsum_329
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_164 = paddle._C_ops.add(reshape_126, parameter_96)
+        del parameter_96
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_111 = [add_164, reshape_129]
+        del add_164, reshape_129
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_330, einsum_331, einsum_332 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_111, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_111
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_440,
+            split_441,
+        ) = einsum_331
+        del einsum_331
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_442,
+            split_443,
+        ) = einsum_332
+        del einsum_332
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_130 = paddle._C_ops.reshape(einsum_330, full_int_array_7)
+        del einsum_330
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_18 = paddle._C_ops.slice(
+            reshape_130, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_130
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_131 = paddle._C_ops.reshape(slice_18, full_int_array_9)
+        del slice_18
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_18 = paddle._C_ops.index_select(reshape_131, arange_2, 3)
+        del reshape_131
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_165 = paddle._C_ops.add(reshape_126, parameter_95)
+        del parameter_95, reshape_126
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_112 = [add_165, parameter_93]
+        del add_165, parameter_93
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_333, einsum_334, einsum_335 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_112, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_112
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_444,
+            split_445,
+        ) = einsum_334
+        del einsum_334
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_446,
+            split_447,
+        ) = einsum_335
+        del einsum_335
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_113 = [cast_5, einsum_333]
+        del einsum_333
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_336, einsum_337, einsum_338 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_113, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_113
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_448,
+            split_449,
+        ) = einsum_337
+        del einsum_337
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_450,
+            split_451,
+        ) = einsum_338
+        del einsum_338
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_166 = paddle._C_ops.add(einsum_327, index_select_18)
+        del einsum_327, index_select_18
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_167 = paddle._C_ops.add(add_166, einsum_336)
+        del add_166, einsum_336
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_22 = paddle._C_ops.scale(add_167, full_16, float("0"), True)
+        del add_167
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_18 = paddle._C_ops.subtract(scale_22, scale_4)
+        del scale_22
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_18 = paddle._C_ops.softmax(subtract_18, 3)
+        del subtract_18
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_148, dropout_149 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_18
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_114 = [dropout_148, reshape_128]
+        del dropout_148, reshape_128
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_339, einsum_340, einsum_341 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_114, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_114
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_452,
+            split_453,
+        ) = einsum_340
+        del einsum_340
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_454,
+            split_455,
+        ) = einsum_341
+        del einsum_341
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_132 = paddle._C_ops.reshape(einsum_339, full_int_array_10)
+        del einsum_339
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_115 = [reshape_132, parameter_98]
+        del parameter_98, reshape_132
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_342, einsum_343, einsum_344 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_115, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_115
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_456,
+            split_457,
+        ) = einsum_343
+        del einsum_343
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_458,
+            split_459,
+        ) = einsum_344
+        del einsum_344
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_150, dropout_151 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_342, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_342
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_168 = paddle._C_ops.add(dropout_150, layer_norm_105)
+        del dropout_150, layer_norm_105
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_108, layer_norm_109, layer_norm_110 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_168, parameter_92, parameter_91, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_168, parameter_91, parameter_92
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_112 = paddle._C_ops.matmul(layer_norm_108, parameter_88, False, False)
+        del parameter_88
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_169 = paddle._C_ops.add(matmul_112, parameter_87)
+        del matmul_112, parameter_87
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_18 = paddle._C_ops.relu(add_169)
+        del add_169
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_152, dropout_153 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_18
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_113 = paddle._C_ops.matmul(dropout_152, parameter_86, False, False)
+        del dropout_152, parameter_86
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_170 = paddle._C_ops.add(matmul_113, parameter_85)
+        del matmul_113, parameter_85
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_154, dropout_155 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_170, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_170
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_171 = paddle._C_ops.add(dropout_154, layer_norm_108)
+        del dropout_154, layer_norm_108
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_111, layer_norm_112, layer_norm_113 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_171, parameter_90, parameter_89, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_171, parameter_89, parameter_90
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_114 = paddle._C_ops.matmul(layer_norm_111, parameter_84, False, False)
+        del parameter_84
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_133 = paddle._C_ops.reshape(matmul_114, full_int_array_5)
+        del matmul_114
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_115 = paddle._C_ops.matmul(layer_norm_111, parameter_83, False, False)
+        del parameter_83
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_134 = paddle._C_ops.reshape(matmul_115, full_int_array_5)
+        del matmul_115
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_116 = paddle._C_ops.matmul(layer_norm_111, parameter_82, False, False)
+        del parameter_82
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_135 = paddle._C_ops.reshape(matmul_116, full_int_array_5)
+        del matmul_116
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_117 = paddle._C_ops.matmul(dropout_2, parameter_80, False, False)
+        del parameter_80
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_136 = paddle._C_ops.reshape(matmul_117, full_int_array_6)
+        del matmul_117
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_172 = paddle._C_ops.add(reshape_133, parameter_77)
+        del parameter_77
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_116 = [add_172, reshape_134]
+        del add_172, reshape_134
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_345, einsum_346, einsum_347 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_116, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_116
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_460,
+            split_461,
+        ) = einsum_346
+        del einsum_346
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_462,
+            split_463,
+        ) = einsum_347
+        del einsum_347
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_173 = paddle._C_ops.add(reshape_133, parameter_79)
+        del parameter_79
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_117 = [add_173, reshape_136]
+        del add_173, reshape_136
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_348, einsum_349, einsum_350 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_117, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_117
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_464,
+            split_465,
+        ) = einsum_349
+        del einsum_349
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_466,
+            split_467,
+        ) = einsum_350
+        del einsum_350
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_137 = paddle._C_ops.reshape(einsum_348, full_int_array_7)
+        del einsum_348
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_19 = paddle._C_ops.slice(
+            reshape_137, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_137
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_138 = paddle._C_ops.reshape(slice_19, full_int_array_9)
+        del slice_19
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_19 = paddle._C_ops.index_select(reshape_138, arange_2, 3)
+        del reshape_138
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_174 = paddle._C_ops.add(reshape_133, parameter_78)
+        del parameter_78, reshape_133
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_118 = [add_174, parameter_76]
+        del add_174, parameter_76
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_351, einsum_352, einsum_353 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_118, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_118
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_468,
+            split_469,
+        ) = einsum_352
+        del einsum_352
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_470,
+            split_471,
+        ) = einsum_353
+        del einsum_353
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_119 = [cast_5, einsum_351]
+        del einsum_351
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_354, einsum_355, einsum_356 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_119, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_119
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_472,
+            split_473,
+        ) = einsum_355
+        del einsum_355
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_474,
+            split_475,
+        ) = einsum_356
+        del einsum_356
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_175 = paddle._C_ops.add(einsum_345, index_select_19)
+        del einsum_345, index_select_19
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_176 = paddle._C_ops.add(add_175, einsum_354)
+        del add_175, einsum_354
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_23 = paddle._C_ops.scale(add_176, full_16, float("0"), True)
+        del add_176
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_19 = paddle._C_ops.subtract(scale_23, scale_4)
+        del scale_23
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_19 = paddle._C_ops.softmax(subtract_19, 3)
+        del subtract_19
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_156, dropout_157 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_19, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_19
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_120 = [dropout_156, reshape_135]
+        del dropout_156, reshape_135
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_357, einsum_358, einsum_359 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_120, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_120
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_476,
+            split_477,
+        ) = einsum_358
+        del einsum_358
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_478,
+            split_479,
+        ) = einsum_359
+        del einsum_359
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_139 = paddle._C_ops.reshape(einsum_357, full_int_array_10)
+        del einsum_357
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_121 = [reshape_139, parameter_81]
+        del parameter_81, reshape_139
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_360, einsum_361, einsum_362 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_121, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_121
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_480,
+            split_481,
+        ) = einsum_361
+        del einsum_361
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_482,
+            split_483,
+        ) = einsum_362
+        del einsum_362
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_158, dropout_159 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_360, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_360
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_177 = paddle._C_ops.add(dropout_158, layer_norm_111)
+        del dropout_158, layer_norm_111
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_114, layer_norm_115, layer_norm_116 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_177, parameter_75, parameter_74, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_177, parameter_74, parameter_75
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_118 = paddle._C_ops.matmul(layer_norm_114, parameter_71, False, False)
+        del parameter_71
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_178 = paddle._C_ops.add(matmul_118, parameter_70)
+        del matmul_118, parameter_70
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_19 = paddle._C_ops.relu(add_178)
+        del add_178
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_160, dropout_161 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_19, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_19
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_119 = paddle._C_ops.matmul(dropout_160, parameter_69, False, False)
+        del dropout_160, parameter_69
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_179 = paddle._C_ops.add(matmul_119, parameter_68)
+        del matmul_119, parameter_68
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_162, dropout_163 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_179, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_179
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_180 = paddle._C_ops.add(dropout_162, layer_norm_114)
+        del dropout_162, layer_norm_114
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_117, layer_norm_118, layer_norm_119 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_180, parameter_73, parameter_72, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_180, parameter_72, parameter_73
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_120 = paddle._C_ops.matmul(layer_norm_117, parameter_67, False, False)
+        del parameter_67
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_140 = paddle._C_ops.reshape(matmul_120, full_int_array_5)
+        del matmul_120
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_121 = paddle._C_ops.matmul(layer_norm_117, parameter_66, False, False)
+        del parameter_66
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_141 = paddle._C_ops.reshape(matmul_121, full_int_array_5)
+        del matmul_121
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_122 = paddle._C_ops.matmul(layer_norm_117, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_142 = paddle._C_ops.reshape(matmul_122, full_int_array_5)
+        del matmul_122
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_123 = paddle._C_ops.matmul(dropout_2, parameter_63, False, False)
+        del parameter_63
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_143 = paddle._C_ops.reshape(matmul_123, full_int_array_6)
+        del matmul_123
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_181 = paddle._C_ops.add(reshape_140, parameter_60)
+        del parameter_60
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_122 = [add_181, reshape_141]
+        del add_181, reshape_141
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_363, einsum_364, einsum_365 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_122, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_122
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_484,
+            split_485,
+        ) = einsum_364
+        del einsum_364
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_486,
+            split_487,
+        ) = einsum_365
+        del einsum_365
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_182 = paddle._C_ops.add(reshape_140, parameter_62)
+        del parameter_62
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_123 = [add_182, reshape_143]
+        del add_182, reshape_143
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_366, einsum_367, einsum_368 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_123, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_123
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_488,
+            split_489,
+        ) = einsum_367
+        del einsum_367
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_490,
+            split_491,
+        ) = einsum_368
+        del einsum_368
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_144 = paddle._C_ops.reshape(einsum_366, full_int_array_7)
+        del einsum_366
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_20 = paddle._C_ops.slice(
+            reshape_144, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_144
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_145 = paddle._C_ops.reshape(slice_20, full_int_array_9)
+        del slice_20
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_20 = paddle._C_ops.index_select(reshape_145, arange_2, 3)
+        del reshape_145
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_183 = paddle._C_ops.add(reshape_140, parameter_61)
+        del parameter_61, reshape_140
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_124 = [add_183, parameter_59]
+        del add_183, parameter_59
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_369, einsum_370, einsum_371 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_124, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_124
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_492,
+            split_493,
+        ) = einsum_370
+        del einsum_370
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_494,
+            split_495,
+        ) = einsum_371
+        del einsum_371
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_125 = [cast_5, einsum_369]
+        del einsum_369
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_372, einsum_373, einsum_374 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_125, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_125
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_496,
+            split_497,
+        ) = einsum_373
+        del einsum_373
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_498,
+            split_499,
+        ) = einsum_374
+        del einsum_374
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_184 = paddle._C_ops.add(einsum_363, index_select_20)
+        del einsum_363, index_select_20
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_185 = paddle._C_ops.add(add_184, einsum_372)
+        del add_184, einsum_372
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_24 = paddle._C_ops.scale(add_185, full_16, float("0"), True)
+        del add_185
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_20 = paddle._C_ops.subtract(scale_24, scale_4)
+        del scale_24
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_20 = paddle._C_ops.softmax(subtract_20, 3)
+        del subtract_20
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_164, dropout_165 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_20, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_20
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_126 = [dropout_164, reshape_142]
+        del dropout_164, reshape_142
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_375, einsum_376, einsum_377 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_126, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_126
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_500,
+            split_501,
+        ) = einsum_376
+        del einsum_376
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_502,
+            split_503,
+        ) = einsum_377
+        del einsum_377
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_146 = paddle._C_ops.reshape(einsum_375, full_int_array_10)
+        del einsum_375
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_127 = [reshape_146, parameter_64]
+        del parameter_64, reshape_146
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_378, einsum_379, einsum_380 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_127, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_127
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_504,
+            split_505,
+        ) = einsum_379
+        del einsum_379
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_506,
+            split_507,
+        ) = einsum_380
+        del einsum_380
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_166, dropout_167 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_378, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_378
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_186 = paddle._C_ops.add(dropout_166, layer_norm_117)
+        del dropout_166, layer_norm_117
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_120, layer_norm_121, layer_norm_122 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_186, parameter_58, parameter_57, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_186, parameter_57, parameter_58
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_124 = paddle._C_ops.matmul(layer_norm_120, parameter_54, False, False)
+        del parameter_54
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_187 = paddle._C_ops.add(matmul_124, parameter_53)
+        del matmul_124, parameter_53
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_20 = paddle._C_ops.relu(add_187)
+        del add_187
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_168, dropout_169 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_20, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_20
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_125 = paddle._C_ops.matmul(dropout_168, parameter_52, False, False)
+        del dropout_168, parameter_52
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_188 = paddle._C_ops.add(matmul_125, parameter_51)
+        del matmul_125, parameter_51
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_170, dropout_171 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_188, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_188
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_189 = paddle._C_ops.add(dropout_170, layer_norm_120)
+        del dropout_170, layer_norm_120
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_123, layer_norm_124, layer_norm_125 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_189, parameter_56, parameter_55, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_189, parameter_55, parameter_56
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_126 = paddle._C_ops.matmul(layer_norm_123, parameter_50, False, False)
+        del parameter_50
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_147 = paddle._C_ops.reshape(matmul_126, full_int_array_5)
+        del matmul_126
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_127 = paddle._C_ops.matmul(layer_norm_123, parameter_49, False, False)
+        del parameter_49
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_148 = paddle._C_ops.reshape(matmul_127, full_int_array_5)
+        del matmul_127
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_128 = paddle._C_ops.matmul(layer_norm_123, parameter_48, False, False)
+        del parameter_48
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_149 = paddle._C_ops.reshape(matmul_128, full_int_array_5)
+        del matmul_128
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_129 = paddle._C_ops.matmul(dropout_2, parameter_46, False, False)
+        del parameter_46
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_150 = paddle._C_ops.reshape(matmul_129, full_int_array_6)
+        del matmul_129
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_190 = paddle._C_ops.add(reshape_147, parameter_43)
+        del parameter_43
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_128 = [add_190, reshape_148]
+        del add_190, reshape_148
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_381, einsum_382, einsum_383 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_128, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_128
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_508,
+            split_509,
+        ) = einsum_382
+        del einsum_382
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_510,
+            split_511,
+        ) = einsum_383
+        del einsum_383
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_191 = paddle._C_ops.add(reshape_147, parameter_45)
+        del parameter_45
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_129 = [add_191, reshape_150]
+        del add_191, reshape_150
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_384, einsum_385, einsum_386 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_129, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_129
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_512,
+            split_513,
+        ) = einsum_385
+        del einsum_385
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_514,
+            split_515,
+        ) = einsum_386
+        del einsum_386
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_151 = paddle._C_ops.reshape(einsum_384, full_int_array_7)
+        del einsum_384
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_21 = paddle._C_ops.slice(
+            reshape_151, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_151
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_152 = paddle._C_ops.reshape(slice_21, full_int_array_9)
+        del slice_21
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_21 = paddle._C_ops.index_select(reshape_152, arange_2, 3)
+        del reshape_152
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_192 = paddle._C_ops.add(reshape_147, parameter_44)
+        del parameter_44, reshape_147
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_130 = [add_192, parameter_42]
+        del add_192, parameter_42
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_387, einsum_388, einsum_389 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_130, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_130
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_516,
+            split_517,
+        ) = einsum_388
+        del einsum_388
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_518,
+            split_519,
+        ) = einsum_389
+        del einsum_389
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_131 = [cast_5, einsum_387]
+        del einsum_387
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_390, einsum_391, einsum_392 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_131, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_131
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_520,
+            split_521,
+        ) = einsum_391
+        del einsum_391
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_522,
+            split_523,
+        ) = einsum_392
+        del einsum_392
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_193 = paddle._C_ops.add(einsum_381, index_select_21)
+        del einsum_381, index_select_21
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_194 = paddle._C_ops.add(add_193, einsum_390)
+        del add_193, einsum_390
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_25 = paddle._C_ops.scale(add_194, full_16, float("0"), True)
+        del add_194
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_21 = paddle._C_ops.subtract(scale_25, scale_4)
+        del scale_25
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_21 = paddle._C_ops.softmax(subtract_21, 3)
+        del subtract_21
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_172, dropout_173 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_21, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_21
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_132 = [dropout_172, reshape_149]
+        del dropout_172, reshape_149
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_393, einsum_394, einsum_395 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_132, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_132
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_524,
+            split_525,
+        ) = einsum_394
+        del einsum_394
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_526,
+            split_527,
+        ) = einsum_395
+        del einsum_395
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_153 = paddle._C_ops.reshape(einsum_393, full_int_array_10)
+        del einsum_393
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_133 = [reshape_153, parameter_47]
+        del parameter_47, reshape_153
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_396, einsum_397, einsum_398 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_133, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_133
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_528,
+            split_529,
+        ) = einsum_397
+        del einsum_397
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_530,
+            split_531,
+        ) = einsum_398
+        del einsum_398
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_174, dropout_175 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_396, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_396
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_195 = paddle._C_ops.add(dropout_174, layer_norm_123)
+        del dropout_174, layer_norm_123
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_126, layer_norm_127, layer_norm_128 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_195, parameter_41, parameter_40, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_195, parameter_40, parameter_41
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_130 = paddle._C_ops.matmul(layer_norm_126, parameter_37, False, False)
+        del parameter_37
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_196 = paddle._C_ops.add(matmul_130, parameter_36)
+        del matmul_130, parameter_36
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_21 = paddle._C_ops.relu(add_196)
+        del add_196
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_176, dropout_177 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_21, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_21
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_131 = paddle._C_ops.matmul(dropout_176, parameter_35, False, False)
+        del dropout_176, parameter_35
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_197 = paddle._C_ops.add(matmul_131, parameter_34)
+        del matmul_131, parameter_34
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_178, dropout_179 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_197, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_197
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_198 = paddle._C_ops.add(dropout_178, layer_norm_126)
+        del dropout_178, layer_norm_126
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_129, layer_norm_130, layer_norm_131 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_198, parameter_39, parameter_38, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_198, parameter_38, parameter_39
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_132 = paddle._C_ops.matmul(layer_norm_129, parameter_33, False, False)
+        del parameter_33
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_154 = paddle._C_ops.reshape(matmul_132, full_int_array_5)
+        del matmul_132
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_133 = paddle._C_ops.matmul(layer_norm_129, parameter_32, False, False)
+        del parameter_32
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_155 = paddle._C_ops.reshape(matmul_133, full_int_array_5)
+        del matmul_133
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_134 = paddle._C_ops.matmul(layer_norm_129, parameter_31, False, False)
+        del parameter_31
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_156 = paddle._C_ops.reshape(matmul_134, full_int_array_5)
+        del matmul_134
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_135 = paddle._C_ops.matmul(dropout_2, parameter_29, False, False)
+        del parameter_29
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_157 = paddle._C_ops.reshape(matmul_135, full_int_array_6)
+        del matmul_135
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_199 = paddle._C_ops.add(reshape_154, parameter_26)
+        del parameter_26
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_134 = [add_199, reshape_155]
+        del add_199, reshape_155
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_399, einsum_400, einsum_401 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_134, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_134
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_532,
+            split_533,
+        ) = einsum_400
+        del einsum_400
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_534,
+            split_535,
+        ) = einsum_401
+        del einsum_401
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_200 = paddle._C_ops.add(reshape_154, parameter_28)
+        del parameter_28
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_135 = [add_200, reshape_157]
+        del add_200, reshape_157
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_402, einsum_403, einsum_404 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_135, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_135
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_536,
+            split_537,
+        ) = einsum_403
+        del einsum_403
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_538,
+            split_539,
+        ) = einsum_404
+        del einsum_404
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_158 = paddle._C_ops.reshape(einsum_402, full_int_array_7)
+        del einsum_402
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_22 = paddle._C_ops.slice(
+            reshape_158, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_158
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_159 = paddle._C_ops.reshape(slice_22, full_int_array_9)
+        del slice_22
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_22 = paddle._C_ops.index_select(reshape_159, arange_2, 3)
+        del reshape_159
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_201 = paddle._C_ops.add(reshape_154, parameter_27)
+        del parameter_27, reshape_154
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_136 = [add_201, parameter_25]
+        del add_201, parameter_25
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_405, einsum_406, einsum_407 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_136, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_136
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_540,
+            split_541,
+        ) = einsum_406
+        del einsum_406
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_542,
+            split_543,
+        ) = einsum_407
+        del einsum_407
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_137 = [cast_5, einsum_405]
+        del einsum_405
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_408, einsum_409, einsum_410 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_137, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_137
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_544,
+            split_545,
+        ) = einsum_409
+        del einsum_409
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_546,
+            split_547,
+        ) = einsum_410
+        del einsum_410
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_202 = paddle._C_ops.add(einsum_399, index_select_22)
+        del einsum_399, index_select_22
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_203 = paddle._C_ops.add(add_202, einsum_408)
+        del add_202, einsum_408
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_26 = paddle._C_ops.scale(add_203, full_16, float("0"), True)
+        del add_203
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_22 = paddle._C_ops.subtract(scale_26, scale_4)
+        del scale_26
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_22 = paddle._C_ops.softmax(subtract_22, 3)
+        del subtract_22
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_180, dropout_181 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_22, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_22
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_138 = [dropout_180, reshape_156]
+        del dropout_180, reshape_156
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_411, einsum_412, einsum_413 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_138, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_138
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_548,
+            split_549,
+        ) = einsum_412
+        del einsum_412
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_550,
+            split_551,
+        ) = einsum_413
+        del einsum_413
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_160 = paddle._C_ops.reshape(einsum_411, full_int_array_10)
+        del einsum_411
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_139 = [reshape_160, parameter_30]
+        del parameter_30, reshape_160
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_414, einsum_415, einsum_416 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_139, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_139
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_552,
+            split_553,
+        ) = einsum_415
+        del einsum_415
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_554,
+            split_555,
+        ) = einsum_416
+        del einsum_416
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_182, dropout_183 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_414, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_414
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_204 = paddle._C_ops.add(dropout_182, layer_norm_129)
+        del dropout_182, layer_norm_129
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_132, layer_norm_133, layer_norm_134 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_204, parameter_24, parameter_23, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_204, parameter_23, parameter_24
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_136 = paddle._C_ops.matmul(layer_norm_132, parameter_20, False, False)
+        del parameter_20
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_205 = paddle._C_ops.add(matmul_136, parameter_19)
+        del matmul_136, parameter_19
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_22 = paddle._C_ops.relu(add_205)
+        del add_205
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_184, dropout_185 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_22, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_22
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_137 = paddle._C_ops.matmul(dropout_184, parameter_18, False, False)
+        del dropout_184, parameter_18
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_206 = paddle._C_ops.add(matmul_137, parameter_17)
+        del matmul_137, parameter_17
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_186, dropout_187 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_206, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_206
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_207 = paddle._C_ops.add(dropout_186, layer_norm_132)
+        del dropout_186, layer_norm_132
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_135, layer_norm_136, layer_norm_137 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_207, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_207, parameter_21, parameter_22
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_138 = paddle._C_ops.matmul(layer_norm_135, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_161 = paddle._C_ops.reshape(matmul_138, full_int_array_5)
+        del matmul_138
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_139 = paddle._C_ops.matmul(layer_norm_135, parameter_15, False, False)
+        del parameter_15
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_162 = paddle._C_ops.reshape(matmul_139, full_int_array_5)
+        del matmul_139
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x1024xf32, 1024x1024xf32)
+        matmul_140 = paddle._C_ops.matmul(layer_norm_135, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.reshape: (9x1x16x64xf32) <- (9x1x1024xf32, 4xi64)
+        reshape_163 = paddle._C_ops.reshape(matmul_140, full_int_array_5)
+        del full_int_array_5, matmul_140
+
+        # pd_op.matmul: (18x1x1024xf32) <- (18x1x1024xf32, 1024x1024xf32)
+        matmul_141 = paddle._C_ops.matmul(dropout_2, parameter_12, False, False)
+        del dropout_2, parameter_12
+
+        # pd_op.reshape: (18x1x16x64xf32) <- (18x1x1024xf32, 4xi64)
+        reshape_164 = paddle._C_ops.reshape(matmul_141, full_int_array_6)
+        del full_int_array_6, matmul_141
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_208 = paddle._C_ops.add(reshape_161, parameter_9)
+        del parameter_9
+
+        # builtin.combine: ([9x1x16x64xf32, 9x1x16x64xf32]) <- (9x1x16x64xf32, 9x1x16x64xf32)
+        combine_140 = [add_208, reshape_162]
+        del add_208, reshape_162
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x1x16x64xf32, 9x1x16x64xf32]) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        einsum_417, einsum_418, einsum_419 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_140, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_140
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_556,
+            split_557,
+        ) = einsum_418
+        del einsum_418
+
+        # builtin.split: (9x1x16x64xf32, 9x1x16x64xf32) <- ([9x1x16x64xf32, 9x1x16x64xf32])
+        (
+            split_558,
+            split_559,
+        ) = einsum_419
+        del einsum_419
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_209 = paddle._C_ops.add(reshape_161, parameter_11)
+        del parameter_11
+
+        # builtin.combine: ([9x1x16x64xf32, 18x1x16x64xf32]) <- (9x1x16x64xf32, 18x1x16x64xf32)
+        combine_141 = [add_209, reshape_164]
+        del add_209, reshape_164
+
+        # pd_op.einsum: (1x16x9x18xf32, [0xf32, 0xf32], [9x1x16x64xf32, 18x1x16x64xf32]) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        einsum_420, einsum_421, einsum_422 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_141, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_141
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_560,
+            split_561,
+        ) = einsum_421
+        del einsum_421
+
+        # builtin.split: (9x1x16x64xf32, 18x1x16x64xf32) <- ([9x1x16x64xf32, 18x1x16x64xf32])
+        (
+            split_562,
+            split_563,
+        ) = einsum_422
+        del einsum_422
+
+        # pd_op.reshape: (1x16x18x9xf32) <- (1x16x9x18xf32, 4xi64)
+        reshape_165 = paddle._C_ops.reshape(einsum_420, full_int_array_7)
+        del einsum_420, full_int_array_7
+
+        # pd_op.slice: (1x16x17x9xf32) <- (1x16x18x9xf32, 1xi64, 1xi64)
+        slice_23 = paddle._C_ops.slice(
+            reshape_165, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del full_int_array_3, full_int_array_8, reshape_165
+
+        # pd_op.reshape: (1x16x9x17xf32) <- (1x16x17x9xf32, 4xi64)
+        reshape_166 = paddle._C_ops.reshape(slice_23, full_int_array_9)
+        del full_int_array_9, slice_23
+
+        # pd_op.index_select: (1x16x9x9xf32) <- (1x16x9x17xf32, 9xi64)
+        index_select_23 = paddle._C_ops.index_select(reshape_166, arange_2, 3)
+        del arange_2, reshape_166
+
+        # pd_op.add: (9x1x16x64xf32) <- (9x1x16x64xf32, 16x64xf32)
+        add_210 = paddle._C_ops.add(reshape_161, parameter_10)
+        del parameter_10, reshape_161
+
+        # builtin.combine: ([9x1x16x64xf32, 2x16x64xf32]) <- (9x1x16x64xf32, 2x16x64xf32)
+        combine_142 = [add_210, parameter_8]
+        del add_210, parameter_8
+
+        # pd_op.einsum: (9x1x16x2xf32, [0xf32, 0xf32], [9x1x16x64xf32, 2x16x64xf32]) <- ([9x1x16x64xf32, 2x16x64xf32])
+        einsum_423, einsum_424, einsum_425 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_142, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_142
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_564,
+            split_565,
+        ) = einsum_424
+        del einsum_424
+
+        # builtin.split: (9x1x16x64xf32, 2x16x64xf32) <- ([9x1x16x64xf32, 2x16x64xf32])
+        (
+            split_566,
+            split_567,
+        ) = einsum_425
+        del einsum_425
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x16x2xf32]) <- (9x9x1x2xf32, 9x1x16x2xf32)
+        combine_143 = [cast_5, einsum_423]
+        del cast_5, einsum_423
+
+        # pd_op.einsum: (1x16x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x16x2xf32]) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        einsum_426, einsum_427, einsum_428 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_143, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_143
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_568,
+            split_569,
+        ) = einsum_427
+        del einsum_427
+
+        # builtin.split: (9x9x1x2xf32, 9x1x16x2xf32) <- ([9x9x1x2xf32, 9x1x16x2xf32])
+        (
+            split_570,
+            split_571,
+        ) = einsum_428
+        del einsum_428
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_211 = paddle._C_ops.add(einsum_417, index_select_23)
+        del einsum_417, index_select_23
+
+        # pd_op.add: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x16x9x9xf32)
+        add_212 = paddle._C_ops.add(add_211, einsum_426)
+        del add_211, einsum_426
+
+        # pd_op.scale: (1x16x9x9xf32) <- (1x16x9x9xf32, 1xf32)
+        scale_27 = paddle._C_ops.scale(add_212, full_16, float("0"), True)
+        del add_212, full_16
+
+        # pd_op.subtract: (1x16x9x9xf32) <- (1x16x9x9xf32, 1x1x9x9xf32)
+        subtract_23 = paddle._C_ops.subtract(scale_27, scale_4)
+        del scale_27, scale_4
+
+        # pd_op.softmax: (1x16x9x9xf32) <- (1x16x9x9xf32)
+        softmax_23 = paddle._C_ops.softmax(subtract_23, 3)
+        del subtract_23
+
+        # pd_op.dropout: (1x16x9x9xf32, 1x16x9x9xui8) <- (1x16x9x9xf32, None, 1xf32)
+        dropout_188, dropout_189 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_23, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_23
+
+        # builtin.combine: ([1x16x9x9xf32, 9x1x16x64xf32]) <- (1x16x9x9xf32, 9x1x16x64xf32)
+        combine_144 = [dropout_188, reshape_163]
+        del dropout_188, reshape_163
+
+        # pd_op.einsum: (9x1x16x64xf32, [0xf32, 0xf32], [1x16x9x9xf32, 9x1x16x64xf32]) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        einsum_429, einsum_430, einsum_431 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_144, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_144
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_572,
+            split_573,
+        ) = einsum_430
+        del einsum_430
+
+        # builtin.split: (1x16x9x9xf32, 9x1x16x64xf32) <- ([1x16x9x9xf32, 9x1x16x64xf32])
+        (
+            split_574,
+            split_575,
+        ) = einsum_431
+        del einsum_431
+
+        # pd_op.reshape: (9x1x1024xf32) <- (9x1x16x64xf32, 3xi64)
+        reshape_167 = paddle._C_ops.reshape(einsum_429, full_int_array_10)
+        del einsum_429, full_int_array_10
+
+        # builtin.combine: ([9x1x1024xf32, 1024x1024xf32]) <- (9x1x1024xf32, 1024x1024xf32)
+        combine_145 = [reshape_167, parameter_13]
+        del parameter_13, reshape_167
+
+        # pd_op.einsum: (9x1x1024xf32, [0xf32, 0xf32], [9x1x1024xf32, 1024x1024xf32]) <- ([9x1x1024xf32, 1024x1024xf32])
+        einsum_432, einsum_433, einsum_434 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_145, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_145
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_576,
+            split_577,
+        ) = einsum_433
+        del einsum_433
+
+        # builtin.split: (9x1x1024xf32, 1024x1024xf32) <- ([9x1x1024xf32, 1024x1024xf32])
+        (
+            split_578,
+            split_579,
+        ) = einsum_434
+        del einsum_434
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_190, dropout_191 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_432, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_432
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_213 = paddle._C_ops.add(dropout_190, layer_norm_135)
+        del dropout_190, layer_norm_135
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_138, layer_norm_139, layer_norm_140 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_213, parameter_7, parameter_6, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_213, parameter_6, parameter_7
+
+        # pd_op.matmul: (9x1x4096xf32) <- (9x1x1024xf32, 1024x4096xf32)
+        matmul_142 = paddle._C_ops.matmul(layer_norm_138, parameter_3, False, False)
+        del parameter_3
+
+        # pd_op.add: (9x1x4096xf32) <- (9x1x4096xf32, 4096xf32)
+        add_214 = paddle._C_ops.add(matmul_142, parameter_2)
+        del matmul_142, parameter_2
+
+        # pd_op.relu: (9x1x4096xf32) <- (9x1x4096xf32)
+        relu_23 = paddle._C_ops.relu(add_214)
+        del add_214
+
+        # pd_op.dropout: (9x1x4096xf32, 9x1x4096xui8) <- (9x1x4096xf32, None, 1xf32)
+        dropout_192, dropout_193 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_23, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_23
+
+        # pd_op.matmul: (9x1x1024xf32) <- (9x1x4096xf32, 4096x1024xf32)
+        matmul_143 = paddle._C_ops.matmul(dropout_192, parameter_1, False, False)
+        del dropout_192, parameter_1
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 1024xf32)
+        add_215 = paddle._C_ops.add(matmul_143, parameter_0)
+        del matmul_143, parameter_0
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_194, dropout_195 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_215, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_215
+
+        # pd_op.add: (9x1x1024xf32) <- (9x1x1024xf32, 9x1x1024xf32)
+        add_216 = paddle._C_ops.add(dropout_194, layer_norm_138)
+        del dropout_194, layer_norm_138
+
+        # pd_op.layer_norm: (9x1x1024xf32, 9x1xf32, 9x1xf32) <- (9x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_141, layer_norm_142, layer_norm_143 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_216, parameter_5, parameter_4, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_216, parameter_4, parameter_5
+
+        # pd_op.dropout: (9x1x1024xf32, 9x1x1024xui8) <- (9x1x1024xf32, None, 1xf32)
+        dropout_196, dropout_197 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_141, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del full_3, layer_norm_141
+
+        # pd_op.transpose: (1x9x1024xf32) <- (9x1x1024xf32)
+        transpose_0 = paddle._C_ops.transpose(dropout_196, [1, 0, 2])
+        del dropout_196
+
+        return transpose_0
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-large/weight_meta.py b/paddle_samples/PaddleNLP/chinese-xlnet-large/weight_meta.py
new file mode 100644
index 000000000..f950aa405
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-large/weight_meta.py
@@ -0,0 +1,4076 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.101174")
+    max_val = float("0.106974")
+    mean = float("1.12949e-05")
+    std = float("0.0200075")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.103823")
+    max_val = float("0.101782")
+    mean = float("7.26515e-07")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.080153")
+    max_val = float("0.0671401")
+    mean = float("0.000422448")
+    std = float("0.0202072")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0689389")
+    max_val = float("0.0645356")
+    mean = float("-3.42223e-05")
+    std = float("0.0190464")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0680583")
+    max_val = float("0.058991")
+    mean = float("-7.84936e-05")
+    std = float("0.0205614")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0644402")
+    max_val = float("0.0624891")
+    mean = float("0.00137222")
+    std = float("0.0198877")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105641")
+    max_val = float("0.0969925")
+    mean = float("-2.13787e-05")
+    std = float("0.0200024")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0982633")
+    max_val = float("0.101413")
+    mean = float("2.36475e-05")
+    std = float("0.0199959")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0963071")
+    max_val = float("0.093978")
+    mean = float("-5.18626e-06")
+    std = float("0.0199941")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0981089")
+    max_val = float("0.111486")
+    mean = float("1.5163e-05")
+    std = float("0.0199782")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0959459")
+    max_val = float("0.0927912")
+    mean = float("-2.88949e-05")
+    std = float("0.0199985")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.11361")
+    max_val = float("0.0986661")
+    mean = float("-1.20929e-05")
+    std = float("0.0199967")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.108304")
+    max_val = float("0.101341")
+    mean = float("1.0301e-06")
+    std = float("0.0199869")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0776118")
+    max_val = float("0.0769515")
+    mean = float("-0.000727962")
+    std = float("0.0201071")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0595223")
+    max_val = float("0.0820465")
+    mean = float("-0.000412415")
+    std = float("0.0201139")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0582567")
+    max_val = float("0.0791273")
+    mean = float("0.000193121")
+    std = float("0.0197251")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0675272")
+    max_val = float("0.100214")
+    mean = float("-0.000132156")
+    std = float("0.0208549")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100283")
+    max_val = float("0.0990686")
+    mean = float("-9.22493e-06")
+    std = float("0.0200041")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0979155")
+    max_val = float("0.0984891")
+    mean = float("-2.74728e-05")
+    std = float("0.0200198")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.115893")
+    max_val = float("0.0921145")
+    mean = float("4.66574e-05")
+    std = float("0.0200015")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0988392")
+    max_val = float("0.107493")
+    mean = float("1.13895e-05")
+    std = float("0.0199926")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0931193")
+    max_val = float("0.107737")
+    mean = float("6.50127e-06")
+    std = float("0.0200053")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.107361")
+    max_val = float("0.0978937")
+    mean = float("-6.68323e-06")
+    std = float("0.0200042")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101607")
+    max_val = float("0.104253")
+    mean = float("1.84875e-05")
+    std = float("0.0199944")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0688502")
+    max_val = float("0.0717973")
+    mean = float("0.00071679")
+    std = float("0.020282")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.065053")
+    max_val = float("0.0546639")
+    mean = float("-0.000881583")
+    std = float("0.0198136")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0616152")
+    max_val = float("0.0633376")
+    mean = float("-0.00082395")
+    std = float("0.0201433")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0703036")
+    max_val = float("0.071403")
+    mean = float("0.000445171")
+    std = float("0.0197137")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0943561")
+    max_val = float("0.0883682")
+    mean = float("7.88088e-06")
+    std = float("0.0200132")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0939037")
+    max_val = float("0.101689")
+    mean = float("-4.57994e-05")
+    std = float("0.020009")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0882293")
+    max_val = float("0.0969764")
+    mean = float("2.04783e-05")
+    std = float("0.0199901")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.090025")
+    max_val = float("0.0979138")
+    mean = float("-5.89511e-06")
+    std = float("0.0199945")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0971927")
+    max_val = float("0.0930698")
+    mean = float("3.42978e-05")
+    std = float("0.0200034")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.102314")
+    max_val = float("0.0957205")
+    mean = float("-5.60234e-06")
+    std = float("0.019992")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0976568")
+    max_val = float("0.110507")
+    mean = float("-5.50881e-06")
+    std = float("0.0200016")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0741549")
+    max_val = float("0.0757544")
+    mean = float("0.000369111")
+    std = float("0.0199624")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0553387")
+    max_val = float("0.0627761")
+    mean = float("0.000173923")
+    std = float("0.0192062")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0625853")
+    max_val = float("0.0559364")
+    mean = float("-0.000558363")
+    std = float("0.0205408")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.08983")
+    max_val = float("0.0593039")
+    mean = float("0.000151348")
+    std = float("0.0206735")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0981591")
+    max_val = float("0.0960741")
+    mean = float("1.92988e-05")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0997372")
+    max_val = float("0.098958")
+    mean = float("-3.23046e-06")
+    std = float("0.0199816")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.096217")
+    max_val = float("0.095661")
+    mean = float("-9.95077e-06")
+    std = float("0.020003")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0955656")
+    max_val = float("0.0941511")
+    mean = float("-4.31246e-06")
+    std = float("0.0199914")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0992673")
+    max_val = float("0.105218")
+    mean = float("2.06525e-05")
+    std = float("0.0199946")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.111812")
+    max_val = float("0.107004")
+    mean = float("5.71117e-06")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100086")
+    max_val = float("0.106293")
+    mean = float("-1.2655e-05")
+    std = float("0.019992")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0634019")
+    max_val = float("0.0621452")
+    mean = float("-0.00104323")
+    std = float("0.0198099")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0703846")
+    max_val = float("0.0622295")
+    mean = float("0.000349012")
+    std = float("0.0197498")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0663927")
+    max_val = float("0.0833278")
+    mean = float("0.000419491")
+    std = float("0.020438")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0657833")
+    max_val = float("0.0647611")
+    mean = float("0.00073824")
+    std = float("0.0198901")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0983278")
+    max_val = float("0.09088")
+    mean = float("2.25245e-05")
+    std = float("0.0199907")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0916077")
+    max_val = float("0.101837")
+    mean = float("3.14194e-06")
+    std = float("0.0199902")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.092645")
+    max_val = float("0.0972162")
+    mean = float("-2.07655e-05")
+    std = float("0.01999")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0946761")
+    max_val = float("0.0993856")
+    mean = float("7.70147e-06")
+    std = float("0.0199826")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.119291")
+    max_val = float("0.0945843")
+    mean = float("-1.80735e-05")
+    std = float("0.0199915")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.106226")
+    max_val = float("0.106861")
+    mean = float("1.53389e-06")
+    std = float("0.019991")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.107367")
+    max_val = float("0.0993204")
+    mean = float("6.34937e-06")
+    std = float("0.0199885")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0635183")
+    max_val = float("0.0880163")
+    mean = float("-0.00041083")
+    std = float("0.020506")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0758214")
+    max_val = float("0.0642966")
+    mean = float("-0.000908967")
+    std = float("0.0199706")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0587357")
+    max_val = float("0.0604111")
+    mean = float("-0.000315282")
+    std = float("0.0196432")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0623426")
+    max_val = float("0.0619337")
+    mean = float("0.000404857")
+    std = float("0.0207849")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0939795")
+    max_val = float("0.101385")
+    mean = float("2.0498e-05")
+    std = float("0.0199942")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0954476")
+    max_val = float("0.101808")
+    mean = float("8.66002e-07")
+    std = float("0.0199923")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0988117")
+    max_val = float("0.0962046")
+    mean = float("2.66444e-05")
+    std = float("0.0199712")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.104092")
+    max_val = float("0.093028")
+    mean = float("-3.59512e-06")
+    std = float("0.0200182")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0977985")
+    max_val = float("0.0991767")
+    mean = float("5.80583e-05")
+    std = float("0.0199986")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.100591")
+    max_val = float("0.102676")
+    mean = float("1.4664e-05")
+    std = float("0.0199999")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10186")
+    max_val = float("0.103011")
+    mean = float("7.18337e-06")
+    std = float("0.0199978")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0722179")
+    max_val = float("0.0697168")
+    mean = float("0.000629881")
+    std = float("0.020497")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0683354")
+    max_val = float("0.0530142")
+    mean = float("-0.00041802")
+    std = float("0.0197935")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0682838")
+    max_val = float("0.0628097")
+    mean = float("-0.000535469")
+    std = float("0.0203411")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.056469")
+    max_val = float("0.0633971")
+    mean = float("0.00113328")
+    std = float("0.019812")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.103305")
+    max_val = float("0.107857")
+    mean = float("5.03488e-06")
+    std = float("0.0199934")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105087")
+    max_val = float("0.0940546")
+    mean = float("-1.19034e-05")
+    std = float("0.0200155")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0942608")
+    max_val = float("0.0985209")
+    mean = float("-1.18328e-05")
+    std = float("0.0200002")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101857")
+    max_val = float("0.103268")
+    mean = float("-2.53371e-05")
+    std = float("0.020014")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100453")
+    max_val = float("0.101783")
+    mean = float("8.46206e-06")
+    std = float("0.0200377")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.110061")
+    max_val = float("0.0977406")
+    mean = float("-1.44333e-05")
+    std = float("0.0200069")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10051")
+    max_val = float("0.112146")
+    mean = float("-8.05737e-06")
+    std = float("0.0199925")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0686442")
+    max_val = float("0.0714007")
+    mean = float("-0.000159975")
+    std = float("0.0201678")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0595842")
+    max_val = float("0.0719294")
+    mean = float("-3.13658e-05")
+    std = float("0.0204201")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0647841")
+    max_val = float("0.0633613")
+    mean = float("-0.00077124")
+    std = float("0.0195923")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0657016")
+    max_val = float("0.0680139")
+    mean = float("-0.00124655")
+    std = float("0.0194623")
+    data = None
+
+
+class Program_weight_tensor_parameter_131:
+    name = "parameter_131"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0941171")
+    max_val = float("0.0950286")
+    mean = float("6.25059e-06")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_132:
+    name = "parameter_132"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0945435")
+    max_val = float("0.100857")
+    mean = float("3.24381e-05")
+    std = float("0.0200183")
+    data = None
+
+
+class Program_weight_tensor_parameter_133:
+    name = "parameter_133"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.108057")
+    max_val = float("0.0941034")
+    mean = float("-1.74459e-05")
+    std = float("0.019992")
+    data = None
+
+
+class Program_weight_tensor_parameter_134:
+    name = "parameter_134"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0978291")
+    max_val = float("0.0925467")
+    mean = float("1.02683e-06")
+    std = float("0.0200146")
+    data = None
+
+
+class Program_weight_tensor_parameter_135:
+    name = "parameter_135"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.109219")
+    max_val = float("0.0987054")
+    mean = float("1.04907e-05")
+    std = float("0.0200115")
+    data = None
+
+
+class Program_weight_tensor_parameter_136:
+    name = "parameter_136"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_137:
+    name = "parameter_137"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.103292")
+    max_val = float("0.10374")
+    mean = float("-1.9846e-06")
+    std = float("0.0199935")
+    data = None
+
+
+class Program_weight_tensor_parameter_138:
+    name = "parameter_138"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_139:
+    name = "parameter_139"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100205")
+    max_val = float("0.0964732")
+    mean = float("4.08738e-06")
+    std = float("0.0200045")
+    data = None
+
+
+class Program_weight_tensor_parameter_140:
+    name = "parameter_140"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_141:
+    name = "parameter_141"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_142:
+    name = "parameter_142"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_143:
+    name = "parameter_143"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_144:
+    name = "parameter_144"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.066533")
+    max_val = float("0.0640852")
+    mean = float("0.000620742")
+    std = float("0.0199716")
+    data = None
+
+
+class Program_weight_tensor_parameter_145:
+    name = "parameter_145"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0606203")
+    max_val = float("0.0627061")
+    mean = float("4.99137e-06")
+    std = float("0.0194108")
+    data = None
+
+
+class Program_weight_tensor_parameter_146:
+    name = "parameter_146"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0717645")
+    max_val = float("0.0675073")
+    mean = float("0.000530434")
+    std = float("0.0199398")
+    data = None
+
+
+class Program_weight_tensor_parameter_147:
+    name = "parameter_147"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0561925")
+    max_val = float("0.0620645")
+    mean = float("0.000263358")
+    std = float("0.0195281")
+    data = None
+
+
+class Program_weight_tensor_parameter_148:
+    name = "parameter_148"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0920362")
+    max_val = float("0.0918777")
+    mean = float("-1.84304e-05")
+    std = float("0.0199942")
+    data = None
+
+
+class Program_weight_tensor_parameter_149:
+    name = "parameter_149"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0990218")
+    max_val = float("0.096936")
+    mean = float("-2.33644e-05")
+    std = float("0.0199892")
+    data = None
+
+
+class Program_weight_tensor_parameter_150:
+    name = "parameter_150"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100763")
+    max_val = float("0.0914228")
+    mean = float("-4.19569e-05")
+    std = float("0.0199779")
+    data = None
+
+
+class Program_weight_tensor_parameter_151:
+    name = "parameter_151"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0997828")
+    max_val = float("0.0945503")
+    mean = float("-1.85671e-05")
+    std = float("0.0200027")
+    data = None
+
+
+class Program_weight_tensor_parameter_152:
+    name = "parameter_152"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0970994")
+    max_val = float("0.103876")
+    mean = float("9.65016e-06")
+    std = float("0.0199919")
+    data = None
+
+
+class Program_weight_tensor_parameter_153:
+    name = "parameter_153"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_154:
+    name = "parameter_154"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.108847")
+    max_val = float("0.104224")
+    mean = float("1.70981e-07")
+    std = float("0.0199995")
+    data = None
+
+
+class Program_weight_tensor_parameter_155:
+    name = "parameter_155"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_156:
+    name = "parameter_156"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0984719")
+    max_val = float("0.100283")
+    mean = float("-2.14459e-06")
+    std = float("0.0200008")
+    data = None
+
+
+class Program_weight_tensor_parameter_157:
+    name = "parameter_157"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_158:
+    name = "parameter_158"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_159:
+    name = "parameter_159"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_160:
+    name = "parameter_160"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_161:
+    name = "parameter_161"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0652733")
+    max_val = float("0.0623973")
+    mean = float("-0.000111954")
+    std = float("0.0196823")
+    data = None
+
+
+class Program_weight_tensor_parameter_162:
+    name = "parameter_162"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0612179")
+    max_val = float("0.0602291")
+    mean = float("0.000256394")
+    std = float("0.0194512")
+    data = None
+
+
+class Program_weight_tensor_parameter_163:
+    name = "parameter_163"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0608343")
+    max_val = float("0.0584399")
+    mean = float("0.000367647")
+    std = float("0.019854")
+    data = None
+
+
+class Program_weight_tensor_parameter_164:
+    name = "parameter_164"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.063469")
+    max_val = float("0.0856444")
+    mean = float("-0.000209721")
+    std = float("0.0194563")
+    data = None
+
+
+class Program_weight_tensor_parameter_165:
+    name = "parameter_165"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0932418")
+    max_val = float("0.0971317")
+    mean = float("8.3827e-06")
+    std = float("0.019981")
+    data = None
+
+
+class Program_weight_tensor_parameter_166:
+    name = "parameter_166"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0943668")
+    max_val = float("0.0910483")
+    mean = float("7.40488e-06")
+    std = float("0.0200086")
+    data = None
+
+
+class Program_weight_tensor_parameter_167:
+    name = "parameter_167"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0931213")
+    max_val = float("0.102227")
+    mean = float("4.29648e-05")
+    std = float("0.0199886")
+    data = None
+
+
+class Program_weight_tensor_parameter_168:
+    name = "parameter_168"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0963163")
+    max_val = float("0.101721")
+    mean = float("1.81921e-05")
+    std = float("0.0199958")
+    data = None
+
+
+class Program_weight_tensor_parameter_169:
+    name = "parameter_169"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105238")
+    max_val = float("0.102362")
+    mean = float("-2.42609e-05")
+    std = float("0.0200077")
+    data = None
+
+
+class Program_weight_tensor_parameter_170:
+    name = "parameter_170"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_171:
+    name = "parameter_171"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0950606")
+    max_val = float("0.104676")
+    mean = float("1.38038e-05")
+    std = float("0.0200084")
+    data = None
+
+
+class Program_weight_tensor_parameter_172:
+    name = "parameter_172"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_173:
+    name = "parameter_173"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101412")
+    max_val = float("0.10013")
+    mean = float("1.1315e-05")
+    std = float("0.0199956")
+    data = None
+
+
+class Program_weight_tensor_parameter_174:
+    name = "parameter_174"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_175:
+    name = "parameter_175"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_176:
+    name = "parameter_176"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_177:
+    name = "parameter_177"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_178:
+    name = "parameter_178"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0655597")
+    max_val = float("0.0763105")
+    mean = float("-0.000411208")
+    std = float("0.0201434")
+    data = None
+
+
+class Program_weight_tensor_parameter_179:
+    name = "parameter_179"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0668925")
+    max_val = float("0.0576623")
+    mean = float("0.00108288")
+    std = float("0.0203384")
+    data = None
+
+
+class Program_weight_tensor_parameter_180:
+    name = "parameter_180"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0646394")
+    max_val = float("0.0680152")
+    mean = float("-0.000135809")
+    std = float("0.0195784")
+    data = None
+
+
+class Program_weight_tensor_parameter_181:
+    name = "parameter_181"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.060051")
+    max_val = float("0.0630734")
+    mean = float("-0.000379699")
+    std = float("0.0203422")
+    data = None
+
+
+class Program_weight_tensor_parameter_182:
+    name = "parameter_182"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0969839")
+    max_val = float("0.101515")
+    mean = float("-2.37039e-05")
+    std = float("0.0200014")
+    data = None
+
+
+class Program_weight_tensor_parameter_183:
+    name = "parameter_183"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0968036")
+    max_val = float("0.0968174")
+    mean = float("1.96444e-05")
+    std = float("0.0200059")
+    data = None
+
+
+class Program_weight_tensor_parameter_184:
+    name = "parameter_184"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0865728")
+    max_val = float("0.0994847")
+    mean = float("-3.51517e-06")
+    std = float("0.0200172")
+    data = None
+
+
+class Program_weight_tensor_parameter_185:
+    name = "parameter_185"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0948586")
+    max_val = float("0.0898406")
+    mean = float("9.69405e-06")
+    std = float("0.0199866")
+    data = None
+
+
+class Program_weight_tensor_parameter_186:
+    name = "parameter_186"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0948052")
+    max_val = float("0.0915345")
+    mean = float("-2.06129e-05")
+    std = float("0.0199876")
+    data = None
+
+
+class Program_weight_tensor_parameter_187:
+    name = "parameter_187"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_188:
+    name = "parameter_188"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.105008")
+    max_val = float("0.113057")
+    mean = float("6.88203e-06")
+    std = float("0.0200095")
+    data = None
+
+
+class Program_weight_tensor_parameter_189:
+    name = "parameter_189"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_190:
+    name = "parameter_190"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100402")
+    max_val = float("0.104401")
+    mean = float("1.41856e-05")
+    std = float("0.0199831")
+    data = None
+
+
+class Program_weight_tensor_parameter_191:
+    name = "parameter_191"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_192:
+    name = "parameter_192"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_193:
+    name = "parameter_193"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_194:
+    name = "parameter_194"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_195:
+    name = "parameter_195"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0805676")
+    max_val = float("0.0700786")
+    mean = float("3.40461e-05")
+    std = float("0.0200066")
+    data = None
+
+
+class Program_weight_tensor_parameter_196:
+    name = "parameter_196"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0611997")
+    max_val = float("0.0788406")
+    mean = float("0.000472469")
+    std = float("0.0202516")
+    data = None
+
+
+class Program_weight_tensor_parameter_197:
+    name = "parameter_197"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0586935")
+    max_val = float("0.0629197")
+    mean = float("-0.000473277")
+    std = float("0.0198758")
+    data = None
+
+
+class Program_weight_tensor_parameter_198:
+    name = "parameter_198"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0574957")
+    max_val = float("0.0719211")
+    mean = float("-0.00096253")
+    std = float("0.0200222")
+    data = None
+
+
+class Program_weight_tensor_parameter_199:
+    name = "parameter_199"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0975594")
+    max_val = float("0.0934616")
+    mean = float("-7.13832e-06")
+    std = float("0.0200197")
+    data = None
+
+
+class Program_weight_tensor_parameter_200:
+    name = "parameter_200"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0962643")
+    max_val = float("0.0999671")
+    mean = float("1.92496e-06")
+    std = float("0.019963")
+    data = None
+
+
+class Program_weight_tensor_parameter_201:
+    name = "parameter_201"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105258")
+    max_val = float("0.0996388")
+    mean = float("1.98332e-05")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_202:
+    name = "parameter_202"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0928025")
+    max_val = float("0.0988722")
+    mean = float("-1.5117e-05")
+    std = float("0.0200138")
+    data = None
+
+
+class Program_weight_tensor_parameter_203:
+    name = "parameter_203"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.095285")
+    max_val = float("0.0982607")
+    mean = float("7.84191e-06")
+    std = float("0.0199938")
+    data = None
+
+
+class Program_weight_tensor_parameter_204:
+    name = "parameter_204"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_205:
+    name = "parameter_205"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.107757")
+    max_val = float("0.0972812")
+    mean = float("6.76515e-06")
+    std = float("0.0200009")
+    data = None
+
+
+class Program_weight_tensor_parameter_206:
+    name = "parameter_206"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_207:
+    name = "parameter_207"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0997285")
+    max_val = float("0.100684")
+    mean = float("8.63008e-06")
+    std = float("0.0199863")
+    data = None
+
+
+class Program_weight_tensor_parameter_208:
+    name = "parameter_208"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_209:
+    name = "parameter_209"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_210:
+    name = "parameter_210"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_211:
+    name = "parameter_211"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_212:
+    name = "parameter_212"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0701117")
+    max_val = float("0.0675819")
+    mean = float("-0.00082489")
+    std = float("0.0199821")
+    data = None
+
+
+class Program_weight_tensor_parameter_213:
+    name = "parameter_213"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0623671")
+    max_val = float("0.0615726")
+    mean = float("-0.000293522")
+    std = float("0.0202768")
+    data = None
+
+
+class Program_weight_tensor_parameter_214:
+    name = "parameter_214"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0646346")
+    max_val = float("0.0690476")
+    mean = float("0.000435714")
+    std = float("0.0194082")
+    data = None
+
+
+class Program_weight_tensor_parameter_215:
+    name = "parameter_215"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0644714")
+    max_val = float("0.055561")
+    mean = float("0.00138591")
+    std = float("0.0195516")
+    data = None
+
+
+class Program_weight_tensor_parameter_216:
+    name = "parameter_216"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0918271")
+    max_val = float("0.0972187")
+    mean = float("1.31732e-05")
+    std = float("0.0199885")
+    data = None
+
+
+class Program_weight_tensor_parameter_217:
+    name = "parameter_217"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0997392")
+    max_val = float("0.100097")
+    mean = float("3.5355e-06")
+    std = float("0.0199998")
+    data = None
+
+
+class Program_weight_tensor_parameter_218:
+    name = "parameter_218"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0972613")
+    max_val = float("0.0896363")
+    mean = float("8.42154e-06")
+    std = float("0.0199989")
+    data = None
+
+
+class Program_weight_tensor_parameter_219:
+    name = "parameter_219"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0981378")
+    max_val = float("0.0933598")
+    mean = float("1.3638e-05")
+    std = float("0.0200094")
+    data = None
+
+
+class Program_weight_tensor_parameter_220:
+    name = "parameter_220"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.106812")
+    max_val = float("0.101117")
+    mean = float("6.56158e-07")
+    std = float("0.0199876")
+    data = None
+
+
+class Program_weight_tensor_parameter_221:
+    name = "parameter_221"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_222:
+    name = "parameter_222"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.100439")
+    max_val = float("0.102285")
+    mean = float("-9.53002e-06")
+    std = float("0.0199964")
+    data = None
+
+
+class Program_weight_tensor_parameter_223:
+    name = "parameter_223"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_224:
+    name = "parameter_224"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0995681")
+    max_val = float("0.0990056")
+    mean = float("8.84426e-06")
+    std = float("0.0199936")
+    data = None
+
+
+class Program_weight_tensor_parameter_225:
+    name = "parameter_225"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_226:
+    name = "parameter_226"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_227:
+    name = "parameter_227"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_228:
+    name = "parameter_228"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_229:
+    name = "parameter_229"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0806161")
+    max_val = float("0.0728875")
+    mean = float("0.000353456")
+    std = float("0.0196011")
+    data = None
+
+
+class Program_weight_tensor_parameter_230:
+    name = "parameter_230"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0537205")
+    max_val = float("0.0627947")
+    mean = float("0.00138492")
+    std = float("0.0194261")
+    data = None
+
+
+class Program_weight_tensor_parameter_231:
+    name = "parameter_231"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0614613")
+    max_val = float("0.0610024")
+    mean = float("-0.00019822")
+    std = float("0.0195517")
+    data = None
+
+
+class Program_weight_tensor_parameter_232:
+    name = "parameter_232"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0737003")
+    max_val = float("0.0481376")
+    mean = float("-0.000334779")
+    std = float("0.0194257")
+    data = None
+
+
+class Program_weight_tensor_parameter_233:
+    name = "parameter_233"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.092247")
+    max_val = float("0.106602")
+    mean = float("2.78315e-05")
+    std = float("0.0200159")
+    data = None
+
+
+class Program_weight_tensor_parameter_234:
+    name = "parameter_234"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0976784")
+    max_val = float("0.0968902")
+    mean = float("-1.74879e-05")
+    std = float("0.0199951")
+    data = None
+
+
+class Program_weight_tensor_parameter_235:
+    name = "parameter_235"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.097143")
+    max_val = float("0.0984268")
+    mean = float("-1.14251e-05")
+    std = float("0.0199745")
+    data = None
+
+
+class Program_weight_tensor_parameter_236:
+    name = "parameter_236"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0973396")
+    max_val = float("0.0968062")
+    mean = float("8.64476e-06")
+    std = float("0.0199767")
+    data = None
+
+
+class Program_weight_tensor_parameter_237:
+    name = "parameter_237"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0898708")
+    max_val = float("0.0938607")
+    mean = float("1.28792e-05")
+    std = float("0.0200167")
+    data = None
+
+
+class Program_weight_tensor_parameter_238:
+    name = "parameter_238"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_239:
+    name = "parameter_239"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.103183")
+    max_val = float("0.10181")
+    mean = float("1.76587e-05")
+    std = float("0.0199945")
+    data = None
+
+
+class Program_weight_tensor_parameter_240:
+    name = "parameter_240"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_241:
+    name = "parameter_241"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101759")
+    max_val = float("0.108292")
+    mean = float("3.31314e-06")
+    std = float("0.0200034")
+    data = None
+
+
+class Program_weight_tensor_parameter_242:
+    name = "parameter_242"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_243:
+    name = "parameter_243"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_244:
+    name = "parameter_244"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_245:
+    name = "parameter_245"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_246:
+    name = "parameter_246"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0649272")
+    max_val = float("0.0724965")
+    mean = float("1.94794e-05")
+    std = float("0.0201683")
+    data = None
+
+
+class Program_weight_tensor_parameter_247:
+    name = "parameter_247"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0579609")
+    max_val = float("0.063785")
+    mean = float("-0.000174216")
+    std = float("0.0196671")
+    data = None
+
+
+class Program_weight_tensor_parameter_248:
+    name = "parameter_248"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0673813")
+    max_val = float("0.0673014")
+    mean = float("0.000625371")
+    std = float("0.0199423")
+    data = None
+
+
+class Program_weight_tensor_parameter_249:
+    name = "parameter_249"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0588756")
+    max_val = float("0.0810359")
+    mean = float("0.000141649")
+    std = float("0.020254")
+    data = None
+
+
+class Program_weight_tensor_parameter_250:
+    name = "parameter_250"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.094093")
+    max_val = float("0.0971453")
+    mean = float("2.29591e-05")
+    std = float("0.0200024")
+    data = None
+
+
+class Program_weight_tensor_parameter_251:
+    name = "parameter_251"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0951525")
+    max_val = float("0.0955532")
+    mean = float("-3.29141e-05")
+    std = float("0.0199776")
+    data = None
+
+
+class Program_weight_tensor_parameter_252:
+    name = "parameter_252"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.091507")
+    max_val = float("0.105083")
+    mean = float("1.98842e-05")
+    std = float("0.0199812")
+    data = None
+
+
+class Program_weight_tensor_parameter_253:
+    name = "parameter_253"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.107993")
+    max_val = float("0.102463")
+    mean = float("-1.49592e-05")
+    std = float("0.0199948")
+    data = None
+
+
+class Program_weight_tensor_parameter_254:
+    name = "parameter_254"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0939214")
+    max_val = float("0.0950489")
+    mean = float("-2.89977e-05")
+    std = float("0.0199896")
+    data = None
+
+
+class Program_weight_tensor_parameter_255:
+    name = "parameter_255"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_256:
+    name = "parameter_256"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0974402")
+    max_val = float("0.109486")
+    mean = float("-1.05386e-05")
+    std = float("0.0200016")
+    data = None
+
+
+class Program_weight_tensor_parameter_257:
+    name = "parameter_257"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_258:
+    name = "parameter_258"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.098575")
+    max_val = float("0.0998085")
+    mean = float("-1.17447e-05")
+    std = float("0.0200097")
+    data = None
+
+
+class Program_weight_tensor_parameter_259:
+    name = "parameter_259"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_260:
+    name = "parameter_260"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_261:
+    name = "parameter_261"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_262:
+    name = "parameter_262"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_263:
+    name = "parameter_263"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.069583")
+    max_val = float("0.090873")
+    mean = float("8.99848e-05")
+    std = float("0.0197018")
+    data = None
+
+
+class Program_weight_tensor_parameter_264:
+    name = "parameter_264"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0557726")
+    max_val = float("0.0740368")
+    mean = float("-0.000676958")
+    std = float("0.0203581")
+    data = None
+
+
+class Program_weight_tensor_parameter_265:
+    name = "parameter_265"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0786428")
+    max_val = float("0.0575711")
+    mean = float("0.00127308")
+    std = float("0.0200049")
+    data = None
+
+
+class Program_weight_tensor_parameter_266:
+    name = "parameter_266"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0625172")
+    max_val = float("0.064942")
+    mean = float("0.000881327")
+    std = float("0.0211175")
+    data = None
+
+
+class Program_weight_tensor_parameter_267:
+    name = "parameter_267"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.103271")
+    max_val = float("0.094095")
+    mean = float("4.64615e-08")
+    std = float("0.0199997")
+    data = None
+
+
+class Program_weight_tensor_parameter_268:
+    name = "parameter_268"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0935565")
+    max_val = float("0.09608")
+    mean = float("1.38109e-05")
+    std = float("0.0200004")
+    data = None
+
+
+class Program_weight_tensor_parameter_269:
+    name = "parameter_269"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0937777")
+    max_val = float("0.0868086")
+    mean = float("3.14887e-05")
+    std = float("0.0200033")
+    data = None
+
+
+class Program_weight_tensor_parameter_270:
+    name = "parameter_270"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0982899")
+    max_val = float("0.105437")
+    mean = float("-8.34959e-06")
+    std = float("0.0199841")
+    data = None
+
+
+class Program_weight_tensor_parameter_271:
+    name = "parameter_271"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.109972")
+    max_val = float("0.104686")
+    mean = float("2.04251e-05")
+    std = float("0.0200012")
+    data = None
+
+
+class Program_weight_tensor_parameter_272:
+    name = "parameter_272"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_273:
+    name = "parameter_273"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.101295")
+    max_val = float("0.104349")
+    mean = float("-6.44501e-06")
+    std = float("0.0200025")
+    data = None
+
+
+class Program_weight_tensor_parameter_274:
+    name = "parameter_274"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_275:
+    name = "parameter_275"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10127")
+    max_val = float("0.0981984")
+    mean = float("4.20567e-06")
+    std = float("0.0199984")
+    data = None
+
+
+class Program_weight_tensor_parameter_276:
+    name = "parameter_276"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_277:
+    name = "parameter_277"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_278:
+    name = "parameter_278"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_279:
+    name = "parameter_279"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_280:
+    name = "parameter_280"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0679478")
+    max_val = float("0.0749753")
+    mean = float("-6.18988e-05")
+    std = float("0.0200871")
+    data = None
+
+
+class Program_weight_tensor_parameter_281:
+    name = "parameter_281"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0752752")
+    max_val = float("0.0699409")
+    mean = float("8.90834e-05")
+    std = float("0.0204845")
+    data = None
+
+
+class Program_weight_tensor_parameter_282:
+    name = "parameter_282"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0564257")
+    max_val = float("0.0582638")
+    mean = float("-0.000267411")
+    std = float("0.0197657")
+    data = None
+
+
+class Program_weight_tensor_parameter_283:
+    name = "parameter_283"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.070838")
+    max_val = float("0.0572247")
+    mean = float("-0.000798589")
+    std = float("0.019529")
+    data = None
+
+
+class Program_weight_tensor_parameter_284:
+    name = "parameter_284"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0957708")
+    max_val = float("0.0956196")
+    mean = float("3.22566e-05")
+    std = float("0.0200046")
+    data = None
+
+
+class Program_weight_tensor_parameter_285:
+    name = "parameter_285"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.103727")
+    max_val = float("0.108267")
+    mean = float("-2.74841e-05")
+    std = float("0.0199752")
+    data = None
+
+
+class Program_weight_tensor_parameter_286:
+    name = "parameter_286"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0984173")
+    max_val = float("0.0913852")
+    mean = float("-1.74837e-05")
+    std = float("0.0199919")
+    data = None
+
+
+class Program_weight_tensor_parameter_287:
+    name = "parameter_287"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0978536")
+    max_val = float("0.104957")
+    mean = float("-5.7202e-06")
+    std = float("0.0199945")
+    data = None
+
+
+class Program_weight_tensor_parameter_288:
+    name = "parameter_288"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.103587")
+    max_val = float("0.0976505")
+    mean = float("-2.64606e-05")
+    std = float("0.0199942")
+    data = None
+
+
+class Program_weight_tensor_parameter_289:
+    name = "parameter_289"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_290:
+    name = "parameter_290"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.103851")
+    max_val = float("0.101262")
+    mean = float("9.33794e-06")
+    std = float("0.0199928")
+    data = None
+
+
+class Program_weight_tensor_parameter_291:
+    name = "parameter_291"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_292:
+    name = "parameter_292"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100662")
+    max_val = float("0.102835")
+    mean = float("-2.55434e-06")
+    std = float("0.0200103")
+    data = None
+
+
+class Program_weight_tensor_parameter_293:
+    name = "parameter_293"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_294:
+    name = "parameter_294"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_295:
+    name = "parameter_295"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_296:
+    name = "parameter_296"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_297:
+    name = "parameter_297"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0626017")
+    max_val = float("0.0656066")
+    mean = float("-0.000111057")
+    std = float("0.0202468")
+    data = None
+
+
+class Program_weight_tensor_parameter_298:
+    name = "parameter_298"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0619167")
+    max_val = float("0.0700004")
+    mean = float("0.00032209")
+    std = float("0.0201107")
+    data = None
+
+
+class Program_weight_tensor_parameter_299:
+    name = "parameter_299"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0703513")
+    max_val = float("0.0520745")
+    mean = float("0.000125638")
+    std = float("0.0192279")
+    data = None
+
+
+class Program_weight_tensor_parameter_300:
+    name = "parameter_300"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0523908")
+    max_val = float("0.0653329")
+    mean = float("-0.000826464")
+    std = float("0.020002")
+    data = None
+
+
+class Program_weight_tensor_parameter_301:
+    name = "parameter_301"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.10032")
+    max_val = float("0.107837")
+    mean = float("-7.52083e-06")
+    std = float("0.0199915")
+    data = None
+
+
+class Program_weight_tensor_parameter_302:
+    name = "parameter_302"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0963977")
+    max_val = float("0.101089")
+    mean = float("3.80581e-05")
+    std = float("0.0199917")
+    data = None
+
+
+class Program_weight_tensor_parameter_303:
+    name = "parameter_303"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0909416")
+    max_val = float("0.0948842")
+    mean = float("-3.12662e-06")
+    std = float("0.0200055")
+    data = None
+
+
+class Program_weight_tensor_parameter_304:
+    name = "parameter_304"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101874")
+    max_val = float("0.0946097")
+    mean = float("5.74289e-07")
+    std = float("0.0199916")
+    data = None
+
+
+class Program_weight_tensor_parameter_305:
+    name = "parameter_305"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.107225")
+    max_val = float("0.0995108")
+    mean = float("-4.92357e-06")
+    std = float("0.0200176")
+    data = None
+
+
+class Program_weight_tensor_parameter_306:
+    name = "parameter_306"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_307:
+    name = "parameter_307"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.104269")
+    max_val = float("0.104589")
+    mean = float("-1.60604e-06")
+    std = float("0.0200035")
+    data = None
+
+
+class Program_weight_tensor_parameter_308:
+    name = "parameter_308"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_309:
+    name = "parameter_309"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.102395")
+    max_val = float("0.0991339")
+    mean = float("1.0274e-05")
+    std = float("0.019996")
+    data = None
+
+
+class Program_weight_tensor_parameter_310:
+    name = "parameter_310"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_311:
+    name = "parameter_311"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_312:
+    name = "parameter_312"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_313:
+    name = "parameter_313"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_314:
+    name = "parameter_314"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0841563")
+    max_val = float("0.0637723")
+    mean = float("-0.000201798")
+    std = float("0.0196366")
+    data = None
+
+
+class Program_weight_tensor_parameter_315:
+    name = "parameter_315"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0652055")
+    max_val = float("0.0702022")
+    mean = float("9.55746e-05")
+    std = float("0.0203474")
+    data = None
+
+
+class Program_weight_tensor_parameter_316:
+    name = "parameter_316"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0613293")
+    max_val = float("0.0549883")
+    mean = float("-0.000485551")
+    std = float("0.0195201")
+    data = None
+
+
+class Program_weight_tensor_parameter_317:
+    name = "parameter_317"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0746907")
+    max_val = float("0.0600717")
+    mean = float("-0.000675786")
+    std = float("0.0194105")
+    data = None
+
+
+class Program_weight_tensor_parameter_318:
+    name = "parameter_318"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.107453")
+    max_val = float("0.0994477")
+    mean = float("1.77619e-05")
+    std = float("0.0199824")
+    data = None
+
+
+class Program_weight_tensor_parameter_319:
+    name = "parameter_319"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0966336")
+    max_val = float("0.0984281")
+    mean = float("2.17781e-05")
+    std = float("0.0200194")
+    data = None
+
+
+class Program_weight_tensor_parameter_320:
+    name = "parameter_320"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0942078")
+    max_val = float("0.106695")
+    mean = float("1.58374e-05")
+    std = float("0.0199883")
+    data = None
+
+
+class Program_weight_tensor_parameter_321:
+    name = "parameter_321"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0992307")
+    max_val = float("0.0907452")
+    mean = float("1.90126e-05")
+    std = float("0.0199944")
+    data = None
+
+
+class Program_weight_tensor_parameter_322:
+    name = "parameter_322"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0919617")
+    max_val = float("0.0934963")
+    mean = float("-2.61964e-05")
+    std = float("0.0199878")
+    data = None
+
+
+class Program_weight_tensor_parameter_323:
+    name = "parameter_323"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_324:
+    name = "parameter_324"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.100454")
+    max_val = float("0.100013")
+    mean = float("2.46501e-05")
+    std = float("0.0200119")
+    data = None
+
+
+class Program_weight_tensor_parameter_325:
+    name = "parameter_325"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_326:
+    name = "parameter_326"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.104897")
+    max_val = float("0.0974905")
+    mean = float("1.47232e-05")
+    std = float("0.0199932")
+    data = None
+
+
+class Program_weight_tensor_parameter_327:
+    name = "parameter_327"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_328:
+    name = "parameter_328"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_329:
+    name = "parameter_329"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_330:
+    name = "parameter_330"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_331:
+    name = "parameter_331"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0641577")
+    max_val = float("0.0666822")
+    mean = float("0.000249277")
+    std = float("0.0200067")
+    data = None
+
+
+class Program_weight_tensor_parameter_332:
+    name = "parameter_332"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0581891")
+    max_val = float("0.0640808")
+    mean = float("0.0010717")
+    std = float("0.0200259")
+    data = None
+
+
+class Program_weight_tensor_parameter_333:
+    name = "parameter_333"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0621781")
+    max_val = float("0.0541678")
+    mean = float("-0.00076242")
+    std = float("0.0205017")
+    data = None
+
+
+class Program_weight_tensor_parameter_334:
+    name = "parameter_334"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0653996")
+    max_val = float("0.0684167")
+    mean = float("0.000209998")
+    std = float("0.0206692")
+    data = None
+
+
+class Program_weight_tensor_parameter_335:
+    name = "parameter_335"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.093042")
+    max_val = float("0.0933048")
+    mean = float("-3.51466e-05")
+    std = float("0.0200255")
+    data = None
+
+
+class Program_weight_tensor_parameter_336:
+    name = "parameter_336"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0970212")
+    max_val = float("0.0938276")
+    mean = float("-1.91991e-06")
+    std = float("0.0200234")
+    data = None
+
+
+class Program_weight_tensor_parameter_337:
+    name = "parameter_337"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0990197")
+    max_val = float("0.0960994")
+    mean = float("9.8341e-06")
+    std = float("0.0200041")
+    data = None
+
+
+class Program_weight_tensor_parameter_338:
+    name = "parameter_338"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0958936")
+    max_val = float("0.101472")
+    mean = float("8.20897e-06")
+    std = float("0.02")
+    data = None
+
+
+class Program_weight_tensor_parameter_339:
+    name = "parameter_339"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0994704")
+    max_val = float("0.0964828")
+    mean = float("1.65094e-05")
+    std = float("0.0199897")
+    data = None
+
+
+class Program_weight_tensor_parameter_340:
+    name = "parameter_340"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_341:
+    name = "parameter_341"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0985067")
+    max_val = float("0.100013")
+    mean = float("-1.95348e-05")
+    std = float("0.0200054")
+    data = None
+
+
+class Program_weight_tensor_parameter_342:
+    name = "parameter_342"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_343:
+    name = "parameter_343"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101093")
+    max_val = float("0.0976068")
+    mean = float("1.17624e-05")
+    std = float("0.0200044")
+    data = None
+
+
+class Program_weight_tensor_parameter_344:
+    name = "parameter_344"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_345:
+    name = "parameter_345"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_346:
+    name = "parameter_346"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_347:
+    name = "parameter_347"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_348:
+    name = "parameter_348"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0643575")
+    max_val = float("0.0683558")
+    mean = float("0.000492696")
+    std = float("0.0192451")
+    data = None
+
+
+class Program_weight_tensor_parameter_349:
+    name = "parameter_349"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0688406")
+    max_val = float("0.0745166")
+    mean = float("-0.000331381")
+    std = float("0.0204476")
+    data = None
+
+
+class Program_weight_tensor_parameter_350:
+    name = "parameter_350"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0738689")
+    max_val = float("0.0695295")
+    mean = float("0.000565032")
+    std = float("0.0202335")
+    data = None
+
+
+class Program_weight_tensor_parameter_351:
+    name = "parameter_351"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.056985")
+    max_val = float("0.0685464")
+    mean = float("3.19221e-05")
+    std = float("0.019697")
+    data = None
+
+
+class Program_weight_tensor_parameter_352:
+    name = "parameter_352"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100409")
+    max_val = float("0.107729")
+    mean = float("5.13221e-07")
+    std = float("0.0199895")
+    data = None
+
+
+class Program_weight_tensor_parameter_353:
+    name = "parameter_353"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.097502")
+    max_val = float("0.095367")
+    mean = float("-2.76522e-05")
+    std = float("0.0199902")
+    data = None
+
+
+class Program_weight_tensor_parameter_354:
+    name = "parameter_354"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.098712")
+    max_val = float("0.10371")
+    mean = float("6.20669e-06")
+    std = float("0.0199942")
+    data = None
+
+
+class Program_weight_tensor_parameter_355:
+    name = "parameter_355"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0926043")
+    max_val = float("0.111207")
+    mean = float("-3.25062e-06")
+    std = float("0.0200042")
+    data = None
+
+
+class Program_weight_tensor_parameter_356:
+    name = "parameter_356"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0980042")
+    max_val = float("0.0994727")
+    mean = float("3.53943e-05")
+    std = float("0.0199748")
+    data = None
+
+
+class Program_weight_tensor_parameter_357:
+    name = "parameter_357"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_358:
+    name = "parameter_358"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.101384")
+    max_val = float("0.109823")
+    mean = float("9.79956e-06")
+    std = float("0.0199974")
+    data = None
+
+
+class Program_weight_tensor_parameter_359:
+    name = "parameter_359"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_360:
+    name = "parameter_360"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100813")
+    max_val = float("0.104755")
+    mean = float("-1.55498e-05")
+    std = float("0.0200078")
+    data = None
+
+
+class Program_weight_tensor_parameter_361:
+    name = "parameter_361"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_362:
+    name = "parameter_362"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_363:
+    name = "parameter_363"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_364:
+    name = "parameter_364"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_365:
+    name = "parameter_365"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0636676")
+    max_val = float("0.0673576")
+    mean = float("-0.00101459")
+    std = float("0.0198469")
+    data = None
+
+
+class Program_weight_tensor_parameter_366:
+    name = "parameter_366"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0685063")
+    max_val = float("0.0597098")
+    mean = float("-0.000893922")
+    std = float("0.0194709")
+    data = None
+
+
+class Program_weight_tensor_parameter_367:
+    name = "parameter_367"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.06185")
+    max_val = float("0.0633766")
+    mean = float("-0.00025569")
+    std = float("0.0199222")
+    data = None
+
+
+class Program_weight_tensor_parameter_368:
+    name = "parameter_368"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.053385")
+    max_val = float("0.0562548")
+    mean = float("4.75082e-06")
+    std = float("0.0200367")
+    data = None
+
+
+class Program_weight_tensor_parameter_369:
+    name = "parameter_369"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.1079")
+    max_val = float("0.096237")
+    mean = float("-1.25352e-05")
+    std = float("0.019986")
+    data = None
+
+
+class Program_weight_tensor_parameter_370:
+    name = "parameter_370"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.102431")
+    max_val = float("0.0992015")
+    mean = float("-1.14069e-05")
+    std = float("0.0200011")
+    data = None
+
+
+class Program_weight_tensor_parameter_371:
+    name = "parameter_371"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0940297")
+    max_val = float("0.102051")
+    mean = float("1.7207e-05")
+    std = float("0.0199849")
+    data = None
+
+
+class Program_weight_tensor_parameter_372:
+    name = "parameter_372"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0985293")
+    max_val = float("0.0991618")
+    mean = float("-1.43206e-05")
+    std = float("0.0200159")
+    data = None
+
+
+class Program_weight_tensor_parameter_373:
+    name = "parameter_373"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105187")
+    max_val = float("0.105412")
+    mean = float("3.0328e-05")
+    std = float("0.0200177")
+    data = None
+
+
+class Program_weight_tensor_parameter_374:
+    name = "parameter_374"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_375:
+    name = "parameter_375"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0987471")
+    max_val = float("0.105192")
+    mean = float("-9.56708e-06")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_376:
+    name = "parameter_376"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_377:
+    name = "parameter_377"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0997025")
+    max_val = float("0.102423")
+    mean = float("4.15081e-06")
+    std = float("0.0199907")
+    data = None
+
+
+class Program_weight_tensor_parameter_378:
+    name = "parameter_378"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_379:
+    name = "parameter_379"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_380:
+    name = "parameter_380"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_381:
+    name = "parameter_381"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_382:
+    name = "parameter_382"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0670102")
+    max_val = float("0.0670993")
+    mean = float("0.000295633")
+    std = float("0.0200305")
+    data = None
+
+
+class Program_weight_tensor_parameter_383:
+    name = "parameter_383"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0734343")
+    max_val = float("0.0606798")
+    mean = float("-0.00036096")
+    std = float("0.0196409")
+    data = None
+
+
+class Program_weight_tensor_parameter_384:
+    name = "parameter_384"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0699884")
+    max_val = float("0.0709267")
+    mean = float("-0.000168359")
+    std = float("0.0203052")
+    data = None
+
+
+class Program_weight_tensor_parameter_385:
+    name = "parameter_385"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.058247")
+    max_val = float("0.0571072")
+    mean = float("-0.00104375")
+    std = float("0.0190464")
+    data = None
+
+
+class Program_weight_tensor_parameter_386:
+    name = "parameter_386"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.103492")
+    max_val = float("0.0961583")
+    mean = float("-3.09677e-05")
+    std = float("0.0199968")
+    data = None
+
+
+class Program_weight_tensor_parameter_387:
+    name = "parameter_387"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0982702")
+    max_val = float("0.0894224")
+    mean = float("-2.02479e-05")
+    std = float("0.0199937")
+    data = None
+
+
+class Program_weight_tensor_parameter_388:
+    name = "parameter_388"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.097582")
+    max_val = float("0.0910485")
+    mean = float("1.53476e-05")
+    std = float("0.0199956")
+    data = None
+
+
+class Program_weight_tensor_parameter_389:
+    name = "parameter_389"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0965115")
+    max_val = float("0.0953453")
+    mean = float("1.63286e-05")
+    std = float("0.0200075")
+    data = None
+
+
+class Program_weight_tensor_parameter_390:
+    name = "parameter_390"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0955548")
+    max_val = float("0.104097")
+    mean = float("5.95724e-06")
+    std = float("0.0200001")
+    data = None
+
+
+class Program_weight_tensor_parameter_391:
+    name = "parameter_391"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_392:
+    name = "parameter_392"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.106777")
+    max_val = float("0.103619")
+    mean = float("-1.28013e-05")
+    std = float("0.0199958")
+    data = None
+
+
+class Program_weight_tensor_parameter_393:
+    name = "parameter_393"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_394:
+    name = "parameter_394"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10106")
+    max_val = float("0.100392")
+    mean = float("2.31494e-06")
+    std = float("0.0199927")
+    data = None
+
+
+class Program_weight_tensor_parameter_395:
+    name = "parameter_395"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_396:
+    name = "parameter_396"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_397:
+    name = "parameter_397"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_398:
+    name = "parameter_398"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_399:
+    name = "parameter_399"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0549828")
+    max_val = float("0.0664872")
+    mean = float("0.000389889")
+    std = float("0.0199559")
+    data = None
+
+
+class Program_weight_tensor_parameter_400:
+    name = "parameter_400"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0626439")
+    max_val = float("0.0606803")
+    mean = float("0.000118815")
+    std = float("0.0203933")
+    data = None
+
+
+class Program_weight_tensor_parameter_401:
+    name = "parameter_401"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0707977")
+    max_val = float("0.067955")
+    mean = float("0.000513946")
+    std = float("0.0196487")
+    data = None
+
+
+class Program_weight_tensor_parameter_402:
+    name = "parameter_402"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0592983")
+    max_val = float("0.0920773")
+    mean = float("0.000159731")
+    std = float("0.0207634")
+    data = None
+
+
+class Program_weight_tensor_parameter_403:
+    name = "parameter_403"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0923136")
+    max_val = float("0.0980971")
+    mean = float("-2.65779e-06")
+    std = float("0.0199978")
+    data = None
+
+
+class Program_weight_tensor_parameter_404:
+    name = "parameter_404"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0896717")
+    max_val = float("0.0923605")
+    mean = float("-1.91466e-05")
+    std = float("0.0200097")
+    data = None
+
+
+class Program_weight_tensor_parameter_405:
+    name = "parameter_405"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0999169")
+    max_val = float("0.0928902")
+    mean = float("-2.14941e-05")
+    std = float("0.0200104")
+    data = None
+
+
+class Program_weight_tensor_parameter_406:
+    name = "parameter_406"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0945058")
+    max_val = float("0.0918114")
+    mean = float("-1.16767e-05")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_407:
+    name = "parameter_407"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101291")
+    max_val = float("0.0917568")
+    mean = float("4.35013e-05")
+    std = float("0.0199835")
+    data = None
+
+
+class Program_weight_tensor_parameter_408:
+    name = "parameter_408"
+    shape = [32000, 1024]
+    dtype = "float32"
+    min_val = float("-0.104754")
+    max_val = float("0.116291")
+    mean = float("-1.36643e-06")
+    std = float("0.0199993")
+    data = None
+
+
+class Program_weight_tensor_parameter_409:
+    name = "parameter_409"
+    shape = [1, 1, 1024]
+    dtype = "float32"
+    min_val = float("-0.053712")
+    max_val = float("0.0665643")
+    mean = float("0.000561582")
+    std = float("0.0194981")
+    data = None
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-mid/graph_net.json b/paddle_samples/PaddleNLP/chinese-xlnet-mid/graph_net.json
new file mode 100644
index 000000000..637c415b1
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-mid/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "chinese-xlnet-mid",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-mid/input_meta.py b/paddle_samples/PaddleNLP/chinese-xlnet-mid/input_meta.py
new file mode 100644
index 000000000..9ea1655e0
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-mid/input_meta.py
@@ -0,0 +1,19 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [19, 11684, 121, 15954, 2090, 21957, 1039, 4, 3]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 2]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 9]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1]
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-mid/model.py b/paddle_samples/PaddleNLP/chinese-xlnet-mid/model.py
new file mode 100644
index 000000000..1f9e0cdb8
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-mid/model.py
@@ -0,0 +1,8389 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        parameter_131,
+        parameter_132,
+        parameter_133,
+        parameter_134,
+        parameter_135,
+        parameter_136,
+        parameter_137,
+        parameter_138,
+        parameter_139,
+        parameter_140,
+        parameter_141,
+        parameter_142,
+        parameter_143,
+        parameter_144,
+        parameter_145,
+        parameter_146,
+        parameter_147,
+        parameter_148,
+        parameter_149,
+        parameter_150,
+        parameter_151,
+        parameter_152,
+        parameter_153,
+        parameter_154,
+        parameter_155,
+        parameter_156,
+        parameter_157,
+        parameter_158,
+        parameter_159,
+        parameter_160,
+        parameter_161,
+        parameter_162,
+        parameter_163,
+        parameter_164,
+        parameter_165,
+        parameter_166,
+        parameter_167,
+        parameter_168,
+        parameter_169,
+        parameter_170,
+        parameter_171,
+        parameter_172,
+        parameter_173,
+        parameter_174,
+        parameter_175,
+        parameter_176,
+        parameter_177,
+        parameter_178,
+        parameter_179,
+        parameter_180,
+        parameter_181,
+        parameter_182,
+        parameter_183,
+        parameter_184,
+        parameter_185,
+        parameter_186,
+        parameter_187,
+        parameter_188,
+        parameter_189,
+        parameter_190,
+        parameter_191,
+        parameter_192,
+        parameter_193,
+        parameter_194,
+        parameter_195,
+        parameter_196,
+        parameter_197,
+        parameter_198,
+        parameter_199,
+        parameter_200,
+        parameter_201,
+        parameter_202,
+        parameter_203,
+        parameter_204,
+        parameter_205,
+        parameter_206,
+        parameter_207,
+        parameter_208,
+        parameter_209,
+        parameter_210,
+        parameter_211,
+        parameter_212,
+        parameter_213,
+        parameter_214,
+        parameter_215,
+        parameter_216,
+        parameter_217,
+        parameter_218,
+        parameter_219,
+        parameter_220,
+        parameter_221,
+        parameter_222,
+        parameter_223,
+        parameter_224,
+        parameter_225,
+        parameter_226,
+        parameter_227,
+        parameter_228,
+        parameter_229,
+        parameter_230,
+        parameter_231,
+        parameter_232,
+        parameter_233,
+        parameter_234,
+        parameter_235,
+        parameter_236,
+        parameter_237,
+        parameter_238,
+        parameter_239,
+        parameter_240,
+        parameter_241,
+        parameter_242,
+        parameter_243,
+        parameter_244,
+        parameter_245,
+        parameter_246,
+        parameter_247,
+        parameter_248,
+        parameter_249,
+        parameter_250,
+        parameter_251,
+        parameter_252,
+        parameter_253,
+        parameter_254,
+        parameter_255,
+        parameter_256,
+        parameter_257,
+        parameter_258,
+        parameter_259,
+        parameter_260,
+        parameter_261,
+        parameter_262,
+        parameter_263,
+        parameter_264,
+        parameter_265,
+        parameter_266,
+        parameter_267,
+        parameter_268,
+        parameter_269,
+        parameter_270,
+        parameter_271,
+        parameter_272,
+        parameter_273,
+        parameter_274,
+        parameter_275,
+        parameter_276,
+        parameter_277,
+        parameter_278,
+        parameter_279,
+        parameter_280,
+        parameter_281,
+        parameter_282,
+        parameter_283,
+        parameter_284,
+        parameter_285,
+        parameter_286,
+        parameter_287,
+        parameter_288,
+        parameter_289,
+        parameter_290,
+        parameter_291,
+        parameter_292,
+        parameter_293,
+        parameter_294,
+        parameter_295,
+        parameter_296,
+        parameter_297,
+        parameter_298,
+        parameter_299,
+        parameter_300,
+        parameter_301,
+        parameter_302,
+        parameter_303,
+        parameter_304,
+        parameter_305,
+        parameter_306,
+        parameter_307,
+        parameter_308,
+        parameter_309,
+        parameter_310,
+        parameter_311,
+        parameter_312,
+        parameter_313,
+        parameter_314,
+        parameter_315,
+        parameter_316,
+        parameter_317,
+        parameter_318,
+        parameter_319,
+        parameter_320,
+        parameter_321,
+        parameter_322,
+        parameter_323,
+        parameter_324,
+        parameter_325,
+        parameter_326,
+        parameter_327,
+        parameter_328,
+        parameter_329,
+        parameter_330,
+        parameter_331,
+        parameter_332,
+        parameter_333,
+        parameter_334,
+        parameter_335,
+        parameter_336,
+        parameter_337,
+        parameter_338,
+        parameter_339,
+        parameter_340,
+        parameter_341,
+        parameter_342,
+        parameter_343,
+        parameter_344,
+        parameter_345,
+        parameter_346,
+        parameter_347,
+        parameter_348,
+        parameter_349,
+        parameter_350,
+        parameter_351,
+        parameter_352,
+        parameter_353,
+        parameter_354,
+        parameter_355,
+        parameter_356,
+        parameter_357,
+        parameter_358,
+        parameter_359,
+        parameter_360,
+        parameter_361,
+        parameter_362,
+        parameter_363,
+        parameter_364,
+        parameter_365,
+        parameter_366,
+        parameter_367,
+        parameter_368,
+        parameter_369,
+        parameter_370,
+        parameter_371,
+        parameter_372,
+        parameter_373,
+        parameter_374,
+        parameter_375,
+        parameter_376,
+        parameter_377,
+        parameter_378,
+        parameter_379,
+        parameter_380,
+        parameter_381,
+        parameter_382,
+        parameter_383,
+        parameter_384,
+        parameter_385,
+        parameter_386,
+        parameter_387,
+        parameter_388,
+        parameter_389,
+        parameter_390,
+        parameter_391,
+        parameter_392,
+        parameter_393,
+        parameter_394,
+        parameter_395,
+        parameter_396,
+        parameter_397,
+        parameter_398,
+        parameter_399,
+        parameter_400,
+        parameter_401,
+        parameter_402,
+        parameter_403,
+        parameter_404,
+        parameter_405,
+        parameter_406,
+        parameter_407,
+        parameter_408,
+        parameter_409,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_1 = paddle._C_ops.transpose(data_0, [1, 0])
+        del data_0
+
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_2 = paddle._C_ops.transpose(data_1, [1, 0])
+        del data_1
+
+        # pd_op.transpose: (9x1xi64) <- (1x9xi64)
+        transpose_3 = paddle._C_ops.transpose(data_2, [1, 0])
+        del data_2
+
+        # pd_op.cast: (9x1xf32) <- (9x1xi64)
+        cast_0 = paddle._C_ops.cast(transpose_3, paddle.float32)
+        del transpose_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (9x1xf32) <- (9x1xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [0]
+
+        # pd_op.unsqueeze: (1x9x1xf32) <- (9x1xf32, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(scale_0, full_int_array_0)
+        del scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.unsqueeze: (1x9x1x1xf32) <- (1x9x1xf32, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.full: (xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [], float("0"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.greater_than: (1x9x1x1xb) <- (1x9x1x1xf32, xf32)
+        greater_than_0 = paddle._C_ops.greater_than(unsqueeze_1, full_1)
+        del unsqueeze_1
+
+        # pd_op.cast: (1x9x1x1xf32) <- (1x9x1x1xb)
+        cast_1 = paddle._C_ops.cast(greater_than_0, paddle.float32)
+        del greater_than_0
+
+        # pd_op.full: (9xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [9], float("1"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.diag: (9x9xf32) <- (9xf32)
+        diag_0 = paddle._C_ops.diag(full_2, 0, float("0"))
+        del full_2
+
+        # pd_op.scale: (9x9xf32) <- (9x9xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(diag_0, full_0, float("0"), True)
+        del diag_0, full_0
+
+        # pd_op.cast: (9x9xf32) <- (9x9xf32)
+        cast_2 = paddle._C_ops.cast(scale_1, paddle.float32)
+        del scale_1
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_2 = [2, 3]
+
+        # pd_op.unsqueeze: (9x9x1x1xf32) <- (9x9xf32, 2xi64)
+        unsqueeze_2 = paddle._C_ops.unsqueeze(cast_2, full_int_array_2)
+        del cast_2, full_int_array_2
+
+        # pd_op.add: (9x9x1x1xf32) <- (1x9x1x1xf32, 9x9x1x1xf32)
+        add_0 = paddle._C_ops.add(cast_1, unsqueeze_2)
+        del cast_1, unsqueeze_2
+
+        # pd_op.greater_than: (9x9x1x1xb) <- (9x9x1x1xf32, xf32)
+        greater_than_1 = paddle._C_ops.greater_than(add_0, full_1)
+        del add_0, full_1
+
+        # pd_op.cast: (9x9x1x1xf32) <- (9x9x1x1xb)
+        cast_3 = paddle._C_ops.cast(greater_than_1, paddle.float32)
+        del greater_than_1
+
+        # pd_op.embedding: (9x1x768xf32) <- (9x1xi64, 32000x768xf32)
+        embedding_0 = paddle._C_ops.embedding(transpose_1, parameter_408, -1, False)
+        del parameter_408, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [1]
+
+        # pd_op.unsqueeze: (9x1x1xi64) <- (9x1xi64, 1xi64)
+        unsqueeze_3 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_3)
+
+        # pd_op.unsqueeze: (1x9x1xi64) <- (9x1xi64, 1xi64)
+        unsqueeze_4 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_0)
+        del full_int_array_0, transpose_2
+
+        # pd_op.not_equal: (9x9x1xb) <- (9x1x1xi64, 1x9x1xi64)
+        not_equal_0 = paddle._C_ops.not_equal(unsqueeze_3, unsqueeze_4)
+        del unsqueeze_3, unsqueeze_4
+
+        # pd_op.cast: (9x9x1xi64) <- (9x9x1xb)
+        cast_4 = paddle._C_ops.cast(not_equal_0, paddle.int64)
+        del not_equal_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("2"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.one_hot: (9x9x1x2xf32) <- (9x9x1xi64, 1xi32)
+        one_hot_0 = paddle._C_ops.one_hot(
+            cast_4 % paddle.cast(full_4, cast_4.dtype), full_4
+        )
+        del cast_4, full_4
+
+        # pd_op.cast: (9x9x1x2xf32) <- (9x9x1x2xf32)
+        cast_5 = paddle._C_ops.cast(one_hot_0, paddle.float32)
+        del one_hot_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("768"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_7 = paddle._C_ops.full(
+            [1], float("2"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (384xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_5, full_6, full_7, dtype="float32")
+        del full_6, full_7
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("0.00130208"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (384xf32) <- (384xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(arange_0, full_8, float("0"), True)
+        del arange_0, full_8
+
+        # pd_op.full: (384xf32) <- ()
+        full_9 = paddle._C_ops.full(
+            [384],
+            float("10000"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.elementwise_pow: (384xf32) <- (384xf32, 384xf32)
+        elementwise_pow_0 = paddle._C_ops.elementwise_pow(full_9, scale_2)
+        del full_9, scale_2
+
+        # pd_op.full: (384xf32) <- ()
+        full_10 = paddle._C_ops.full(
+            [384],
+            float("1"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.divide: (384xf32) <- (384xf32, 384xf32)
+        divide_0 = paddle._C_ops.divide(full_10, elementwise_pow_0)
+        del elementwise_pow_0, full_10
+
+        # pd_op.full: (1xf64) <- ()
+        full_11 = paddle._C_ops.full(
+            [1], float("9"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_12 = paddle._C_ops.full(
+            [1], float("-9"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_13 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (18xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_1 = paddle.arange(full_11, full_12, full_13, dtype="float32")
+        del full_12, full_13
+
+        # builtin.combine: ([18xf32, 384xf32]) <- (18xf32, 384xf32)
+        combine_0 = [arange_1, divide_0]
+        del arange_1, divide_0
+
+        # pd_op.einsum: (18x384xf32, [0xf32, 0xf32], [18xf32, 384xf32]) <- ([18xf32, 384xf32])
+        einsum_0, einsum_1, einsum_2 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_0, "i,d->id"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_0
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_0,
+            split_1,
+        ) = einsum_1
+        del einsum_1
+
+        # builtin.split: (18xf32, 384xf32) <- ([18xf32, 384xf32])
+        (
+            split_2,
+            split_3,
+        ) = einsum_2
+        del einsum_2
+
+        # pd_op.sin: (18x384xf32) <- (18x384xf32)
+        sin_0 = paddle._C_ops.sin(einsum_0)
+
+        # pd_op.cos: (18x384xf32) <- (18x384xf32)
+        cos_0 = paddle._C_ops.cos(einsum_0)
+        del einsum_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_14 = paddle._C_ops.full(
+            [1], float("-1"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # builtin.combine: ([18x384xf32, 18x384xf32]) <- (18x384xf32, 18x384xf32)
+        combine_1 = [sin_0, cos_0]
+        del cos_0, sin_0
+
+        # pd_op.concat: (18x768xf32) <- ([18x384xf32, 18x384xf32], 1xi32)
+        concat_0 = paddle._C_ops.concat(combine_1, full_14)
+        del combine_1, full_14
+
+        # pd_op.unsqueeze: (18x1x768xf32) <- (18x768xf32, 1xi64)
+        unsqueeze_5 = paddle._C_ops.unsqueeze(concat_0, full_int_array_3)
+        del concat_0
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_4 = [-1, 1, -1]
+
+        # pd_op.expand: (18x1x768xf32) <- (18x1x768xf32, 3xi64)
+        expand_0 = paddle._C_ops.expand(unsqueeze_5, full_int_array_4)
+        del full_int_array_4, unsqueeze_5
+
+        # pd_op.dropout: (18x1x768xf32, 18x1x768xui8) <- (18x1x768xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                expand_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del expand_0
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_407, False, False)
+        del parameter_407
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_5 = [9, 1, 12, 64]
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(matmul_0, full_int_array_5)
+        del matmul_0
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(dropout_0, parameter_406, False, False)
+        del parameter_406
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(matmul_1, full_int_array_5)
+        del matmul_1
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(dropout_0, parameter_405, False, False)
+        del parameter_405
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(matmul_2, full_int_array_5)
+        del matmul_2
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_3 = paddle._C_ops.matmul(dropout_2, parameter_403, False, False)
+        del parameter_403
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_6 = [18, -1, 12, 64]
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_3 = paddle._C_ops.reshape(matmul_3, full_int_array_6)
+        del matmul_3
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_1 = paddle._C_ops.add(reshape_0, parameter_400)
+        del parameter_400
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_2 = [add_1, reshape_1]
+        del add_1, reshape_1
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_3, einsum_4, einsum_5 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_2, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_2
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_4,
+            split_5,
+        ) = einsum_4
+        del einsum_4
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_6,
+            split_7,
+        ) = einsum_5
+        del einsum_5
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_2 = paddle._C_ops.add(reshape_0, parameter_402)
+        del parameter_402
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_3 = [add_2, reshape_3]
+        del add_2, reshape_3
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_6, einsum_7, einsum_8 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_3, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_3
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_8,
+            split_9,
+        ) = einsum_7
+        del einsum_7
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_10,
+            split_11,
+        ) = einsum_8
+        del einsum_8
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_7 = [1, 12, 18, 9]
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(einsum_6, full_int_array_7)
+        del einsum_6
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_8 = [2147483647]
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            reshape_4, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_4
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_9 = [1, 12, 9, 17]
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(slice_0, full_int_array_9)
+        del slice_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_15 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (9xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_2 = paddle.arange(full_5, full_11, full_15, dtype="int64")
+        del full_11, full_15, full_5
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_0 = paddle._C_ops.index_select(reshape_5, arange_2, 3)
+        del reshape_5
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_3 = paddle._C_ops.add(reshape_0, parameter_401)
+        del parameter_401, reshape_0
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_4 = [add_3, parameter_399]
+        del add_3, parameter_399
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_9, einsum_10, einsum_11 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_4, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_4
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_12,
+            split_13,
+        ) = einsum_10
+        del einsum_10
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_14,
+            split_15,
+        ) = einsum_11
+        del einsum_11
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_5 = [cast_5, einsum_9]
+        del einsum_9
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_12, einsum_13, einsum_14 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_5, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_5
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_16,
+            split_17,
+        ) = einsum_13
+        del einsum_13
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_18,
+            split_19,
+        ) = einsum_14
+        del einsum_14
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_4 = paddle._C_ops.add(einsum_3, index_select_0)
+        del einsum_3, index_select_0
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_5 = paddle._C_ops.add(add_4, einsum_12)
+        del add_4, einsum_12
+
+        # pd_op.full: (1xf32) <- ()
+        full_16 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(add_5, full_16, float("0"), True)
+        del add_5
+
+        # pd_op.transpose: (1x1x9x9xf32) <- (9x9x1x1xf32)
+        transpose_4 = paddle._C_ops.transpose(cast_3, [2, 3, 0, 1])
+        del cast_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_17 = paddle._C_ops.full(
+            [1], float("1e+30"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x9x9xf32) <- (1x1x9x9xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(transpose_4, full_17, float("0"), True)
+        del full_17, transpose_4
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_0 = paddle._C_ops.subtract(scale_3, scale_4)
+        del scale_3
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_0 = paddle._C_ops.softmax(subtract_0, 3)
+        del subtract_0
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_6 = [dropout_4, reshape_2]
+        del dropout_4, reshape_2
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_15, einsum_16, einsum_17 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_6, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_6
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_20,
+            split_21,
+        ) = einsum_16
+        del einsum_16
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_22,
+            split_23,
+        ) = einsum_17
+        del einsum_17
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_10 = [9, 1, 768]
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_6 = paddle._C_ops.reshape(einsum_15, full_int_array_10)
+        del einsum_15
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_7 = [reshape_6, parameter_404]
+        del parameter_404, reshape_6
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_18, einsum_19, einsum_20 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_7, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_7
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_24,
+            split_25,
+        ) = einsum_19
+        del einsum_19
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_26,
+            split_27,
+        ) = einsum_20
+        del einsum_20
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_18
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_6 = paddle._C_ops.add(dropout_6, dropout_0)
+        del dropout_0, dropout_6
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_6, parameter_398, parameter_397, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_6, parameter_397, parameter_398
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_4 = paddle._C_ops.matmul(layer_norm_0, parameter_394, False, False)
+        del parameter_394
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_7 = paddle._C_ops.add(matmul_4, parameter_393)
+        del matmul_4, parameter_393
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_0 = paddle._C_ops.relu(add_7)
+        del add_7
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_0
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_8, parameter_392, False, False)
+        del dropout_8, parameter_392
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_8 = paddle._C_ops.add(matmul_5, parameter_391)
+        del matmul_5, parameter_391
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_8
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_9 = paddle._C_ops.add(dropout_10, layer_norm_0)
+        del dropout_10, layer_norm_0
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_9, parameter_396, parameter_395, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_9, parameter_395, parameter_396
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_6 = paddle._C_ops.matmul(layer_norm_3, parameter_390, False, False)
+        del parameter_390
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_7 = paddle._C_ops.reshape(matmul_6, full_int_array_5)
+        del matmul_6
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_389, False, False)
+        del parameter_389
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(matmul_7, full_int_array_5)
+        del matmul_7
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_8 = paddle._C_ops.matmul(layer_norm_3, parameter_388, False, False)
+        del parameter_388
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(matmul_8, full_int_array_5)
+        del matmul_8
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(dropout_2, parameter_386, False, False)
+        del parameter_386
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(matmul_9, full_int_array_6)
+        del matmul_9
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_10 = paddle._C_ops.add(reshape_7, parameter_383)
+        del parameter_383
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_8 = [add_10, reshape_8]
+        del add_10, reshape_8
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_21, einsum_22, einsum_23 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_8, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_8
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_28,
+            split_29,
+        ) = einsum_22
+        del einsum_22
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_30,
+            split_31,
+        ) = einsum_23
+        del einsum_23
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_11 = paddle._C_ops.add(reshape_7, parameter_385)
+        del parameter_385
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_9 = [add_11, reshape_10]
+        del add_11, reshape_10
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_24, einsum_25, einsum_26 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_9, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_9
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_32,
+            split_33,
+        ) = einsum_25
+        del einsum_25
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_34,
+            split_35,
+        ) = einsum_26
+        del einsum_26
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_11 = paddle._C_ops.reshape(einsum_24, full_int_array_7)
+        del einsum_24
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            reshape_11, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_11
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(slice_1, full_int_array_9)
+        del slice_1
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_1 = paddle._C_ops.index_select(reshape_12, arange_2, 3)
+        del reshape_12
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_12 = paddle._C_ops.add(reshape_7, parameter_384)
+        del parameter_384, reshape_7
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_10 = [add_12, parameter_382]
+        del add_12, parameter_382
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_27, einsum_28, einsum_29 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_10, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_10
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_36,
+            split_37,
+        ) = einsum_28
+        del einsum_28
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_38,
+            split_39,
+        ) = einsum_29
+        del einsum_29
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_11 = [cast_5, einsum_27]
+        del einsum_27
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_30, einsum_31, einsum_32 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_11, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_11
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_40,
+            split_41,
+        ) = einsum_31
+        del einsum_31
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_42,
+            split_43,
+        ) = einsum_32
+        del einsum_32
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_13 = paddle._C_ops.add(einsum_21, index_select_1)
+        del einsum_21, index_select_1
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_14 = paddle._C_ops.add(add_13, einsum_30)
+        del add_13, einsum_30
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(add_14, full_16, float("0"), True)
+        del add_14
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_1 = paddle._C_ops.subtract(scale_5, scale_4)
+        del scale_5
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_1 = paddle._C_ops.softmax(subtract_1, 3)
+        del subtract_1
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_12 = [dropout_12, reshape_9]
+        del dropout_12, reshape_9
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_33, einsum_34, einsum_35 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_12, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_12
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_44,
+            split_45,
+        ) = einsum_34
+        del einsum_34
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_46,
+            split_47,
+        ) = einsum_35
+        del einsum_35
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_13 = paddle._C_ops.reshape(einsum_33, full_int_array_10)
+        del einsum_33
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_13 = [reshape_13, parameter_387]
+        del parameter_387, reshape_13
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_36, einsum_37, einsum_38 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_13, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_13
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_48,
+            split_49,
+        ) = einsum_37
+        del einsum_37
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_50,
+            split_51,
+        ) = einsum_38
+        del einsum_38
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_36, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_36
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_15 = paddle._C_ops.add(dropout_14, layer_norm_3)
+        del dropout_14, layer_norm_3
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_15, parameter_381, parameter_380, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_15, parameter_380, parameter_381
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_377, False, False)
+        del parameter_377
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_16 = paddle._C_ops.add(matmul_10, parameter_376)
+        del matmul_10, parameter_376
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_1 = paddle._C_ops.relu(add_16)
+        del add_16
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_1
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_11 = paddle._C_ops.matmul(dropout_16, parameter_375, False, False)
+        del dropout_16, parameter_375
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_17 = paddle._C_ops.add(matmul_11, parameter_374)
+        del matmul_11, parameter_374
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_17
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_18 = paddle._C_ops.add(dropout_18, layer_norm_6)
+        del dropout_18, layer_norm_6
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_379, parameter_378, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18, parameter_378, parameter_379
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_12 = paddle._C_ops.matmul(layer_norm_9, parameter_373, False, False)
+        del parameter_373
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(matmul_12, full_int_array_5)
+        del matmul_12
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_13 = paddle._C_ops.matmul(layer_norm_9, parameter_372, False, False)
+        del parameter_372
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_15 = paddle._C_ops.reshape(matmul_13, full_int_array_5)
+        del matmul_13
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_14 = paddle._C_ops.matmul(layer_norm_9, parameter_371, False, False)
+        del parameter_371
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(matmul_14, full_int_array_5)
+        del matmul_14
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_15 = paddle._C_ops.matmul(dropout_2, parameter_369, False, False)
+        del parameter_369
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(matmul_15, full_int_array_6)
+        del matmul_15
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_19 = paddle._C_ops.add(reshape_14, parameter_366)
+        del parameter_366
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_14 = [add_19, reshape_15]
+        del add_19, reshape_15
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_39, einsum_40, einsum_41 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_14, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_14
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_52,
+            split_53,
+        ) = einsum_40
+        del einsum_40
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_54,
+            split_55,
+        ) = einsum_41
+        del einsum_41
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_20 = paddle._C_ops.add(reshape_14, parameter_368)
+        del parameter_368
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_15 = [add_20, reshape_17]
+        del add_20, reshape_17
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_42, einsum_43, einsum_44 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_15, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_15
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_56,
+            split_57,
+        ) = einsum_43
+        del einsum_43
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_58,
+            split_59,
+        ) = einsum_44
+        del einsum_44
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(einsum_42, full_int_array_7)
+        del einsum_42
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_2 = paddle._C_ops.slice(
+            reshape_18, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_18
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_19 = paddle._C_ops.reshape(slice_2, full_int_array_9)
+        del slice_2
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_2 = paddle._C_ops.index_select(reshape_19, arange_2, 3)
+        del reshape_19
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_21 = paddle._C_ops.add(reshape_14, parameter_367)
+        del parameter_367, reshape_14
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_16 = [add_21, parameter_365]
+        del add_21, parameter_365
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_45, einsum_46, einsum_47 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_16, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_16
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_60,
+            split_61,
+        ) = einsum_46
+        del einsum_46
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_62,
+            split_63,
+        ) = einsum_47
+        del einsum_47
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_17 = [cast_5, einsum_45]
+        del einsum_45
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_48, einsum_49, einsum_50 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_17, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_17
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_64,
+            split_65,
+        ) = einsum_49
+        del einsum_49
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_66,
+            split_67,
+        ) = einsum_50
+        del einsum_50
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_22 = paddle._C_ops.add(einsum_39, index_select_2)
+        del einsum_39, index_select_2
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_23 = paddle._C_ops.add(add_22, einsum_48)
+        del add_22, einsum_48
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(add_23, full_16, float("0"), True)
+        del add_23
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_2 = paddle._C_ops.subtract(scale_6, scale_4)
+        del scale_6
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_2 = paddle._C_ops.softmax(subtract_2, 3)
+        del subtract_2
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_18 = [dropout_20, reshape_16]
+        del dropout_20, reshape_16
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_51, einsum_52, einsum_53 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_18, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_18
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_68,
+            split_69,
+        ) = einsum_52
+        del einsum_52
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_70,
+            split_71,
+        ) = einsum_53
+        del einsum_53
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_20 = paddle._C_ops.reshape(einsum_51, full_int_array_10)
+        del einsum_51
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_19 = [reshape_20, parameter_370]
+        del parameter_370, reshape_20
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_54, einsum_55, einsum_56 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_19, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_19
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_72,
+            split_73,
+        ) = einsum_55
+        del einsum_55
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_74,
+            split_75,
+        ) = einsum_56
+        del einsum_56
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_54, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_54
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_24 = paddle._C_ops.add(dropout_22, layer_norm_9)
+        del dropout_22, layer_norm_9
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_24, parameter_364, parameter_363, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_24, parameter_363, parameter_364
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_16 = paddle._C_ops.matmul(layer_norm_12, parameter_360, False, False)
+        del parameter_360
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_25 = paddle._C_ops.add(matmul_16, parameter_359)
+        del matmul_16, parameter_359
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_2 = paddle._C_ops.relu(add_25)
+        del add_25
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_2
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_17 = paddle._C_ops.matmul(dropout_24, parameter_358, False, False)
+        del dropout_24, parameter_358
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_26 = paddle._C_ops.add(matmul_17, parameter_357)
+        del matmul_17, parameter_357
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_26, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_26
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_27 = paddle._C_ops.add(dropout_26, layer_norm_12)
+        del dropout_26, layer_norm_12
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_27, parameter_362, parameter_361, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_27, parameter_361, parameter_362
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_15, parameter_356, False, False)
+        del parameter_356
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(matmul_18, full_int_array_5)
+        del matmul_18
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_15, parameter_355, False, False)
+        del parameter_355
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(matmul_19, full_int_array_5)
+        del matmul_19
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_20 = paddle._C_ops.matmul(layer_norm_15, parameter_354, False, False)
+        del parameter_354
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_23 = paddle._C_ops.reshape(matmul_20, full_int_array_5)
+        del matmul_20
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_2, parameter_352, False, False)
+        del parameter_352
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(matmul_21, full_int_array_6)
+        del matmul_21
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_28 = paddle._C_ops.add(reshape_21, parameter_349)
+        del parameter_349
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_20 = [add_28, reshape_22]
+        del add_28, reshape_22
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_57, einsum_58, einsum_59 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_20, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_20
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_76,
+            split_77,
+        ) = einsum_58
+        del einsum_58
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_78,
+            split_79,
+        ) = einsum_59
+        del einsum_59
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_29 = paddle._C_ops.add(reshape_21, parameter_351)
+        del parameter_351
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_21 = [add_29, reshape_24]
+        del add_29, reshape_24
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_60, einsum_61, einsum_62 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_21, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_21
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_80,
+            split_81,
+        ) = einsum_61
+        del einsum_61
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_82,
+            split_83,
+        ) = einsum_62
+        del einsum_62
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(einsum_60, full_int_array_7)
+        del einsum_60
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_3 = paddle._C_ops.slice(
+            reshape_25, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_25
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(slice_3, full_int_array_9)
+        del slice_3
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_3 = paddle._C_ops.index_select(reshape_26, arange_2, 3)
+        del reshape_26
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_30 = paddle._C_ops.add(reshape_21, parameter_350)
+        del parameter_350, reshape_21
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_22 = [add_30, parameter_348]
+        del add_30, parameter_348
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_63, einsum_64, einsum_65 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_22, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_22
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_84,
+            split_85,
+        ) = einsum_64
+        del einsum_64
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_86,
+            split_87,
+        ) = einsum_65
+        del einsum_65
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_23 = [cast_5, einsum_63]
+        del einsum_63
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_66, einsum_67, einsum_68 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_23, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_23
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_88,
+            split_89,
+        ) = einsum_67
+        del einsum_67
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_90,
+            split_91,
+        ) = einsum_68
+        del einsum_68
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_31 = paddle._C_ops.add(einsum_57, index_select_3)
+        del einsum_57, index_select_3
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_32 = paddle._C_ops.add(add_31, einsum_66)
+        del add_31, einsum_66
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(add_32, full_16, float("0"), True)
+        del add_32
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_3 = paddle._C_ops.subtract(scale_7, scale_4)
+        del scale_7
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_3 = paddle._C_ops.softmax(subtract_3, 3)
+        del subtract_3
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_24 = [dropout_28, reshape_23]
+        del dropout_28, reshape_23
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_69, einsum_70, einsum_71 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_24, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_24
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_92,
+            split_93,
+        ) = einsum_70
+        del einsum_70
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_94,
+            split_95,
+        ) = einsum_71
+        del einsum_71
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(einsum_69, full_int_array_10)
+        del einsum_69
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_25 = [reshape_27, parameter_353]
+        del parameter_353, reshape_27
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_72, einsum_73, einsum_74 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_25, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_25
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_96,
+            split_97,
+        ) = einsum_73
+        del einsum_73
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_98,
+            split_99,
+        ) = einsum_74
+        del einsum_74
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_72, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_72
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_33 = paddle._C_ops.add(dropout_30, layer_norm_15)
+        del dropout_30, layer_norm_15
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_33, parameter_347, parameter_346, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_33, parameter_346, parameter_347
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_22 = paddle._C_ops.matmul(layer_norm_18, parameter_343, False, False)
+        del parameter_343
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_34 = paddle._C_ops.add(matmul_22, parameter_342)
+        del matmul_22, parameter_342
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_3 = paddle._C_ops.relu(add_34)
+        del add_34
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_3
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_23 = paddle._C_ops.matmul(dropout_32, parameter_341, False, False)
+        del dropout_32, parameter_341
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_35 = paddle._C_ops.add(matmul_23, parameter_340)
+        del matmul_23, parameter_340
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_35, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_35
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_36 = paddle._C_ops.add(dropout_34, layer_norm_18)
+        del dropout_34, layer_norm_18
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_36, parameter_345, parameter_344, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_36, parameter_344, parameter_345
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_24 = paddle._C_ops.matmul(layer_norm_21, parameter_339, False, False)
+        del parameter_339
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(matmul_24, full_int_array_5)
+        del matmul_24
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_21, parameter_338, False, False)
+        del parameter_338
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(matmul_25, full_int_array_5)
+        del matmul_25
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_21, parameter_337, False, False)
+        del parameter_337
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(matmul_26, full_int_array_5)
+        del matmul_26
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_27 = paddle._C_ops.matmul(dropout_2, parameter_335, False, False)
+        del parameter_335
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_31 = paddle._C_ops.reshape(matmul_27, full_int_array_6)
+        del matmul_27
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_37 = paddle._C_ops.add(reshape_28, parameter_332)
+        del parameter_332
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_26 = [add_37, reshape_29]
+        del add_37, reshape_29
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_75, einsum_76, einsum_77 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_26, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_26
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_100,
+            split_101,
+        ) = einsum_76
+        del einsum_76
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_102,
+            split_103,
+        ) = einsum_77
+        del einsum_77
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_38 = paddle._C_ops.add(reshape_28, parameter_334)
+        del parameter_334
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_27 = [add_38, reshape_31]
+        del add_38, reshape_31
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_78, einsum_79, einsum_80 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_27, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_27
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_104,
+            split_105,
+        ) = einsum_79
+        del einsum_79
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_106,
+            split_107,
+        ) = einsum_80
+        del einsum_80
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(einsum_78, full_int_array_7)
+        del einsum_78
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_4 = paddle._C_ops.slice(
+            reshape_32, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_32
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(slice_4, full_int_array_9)
+        del slice_4
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_4 = paddle._C_ops.index_select(reshape_33, arange_2, 3)
+        del reshape_33
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_39 = paddle._C_ops.add(reshape_28, parameter_333)
+        del parameter_333, reshape_28
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_28 = [add_39, parameter_331]
+        del add_39, parameter_331
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_81, einsum_82, einsum_83 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_28, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_28
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_108,
+            split_109,
+        ) = einsum_82
+        del einsum_82
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_110,
+            split_111,
+        ) = einsum_83
+        del einsum_83
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_29 = [cast_5, einsum_81]
+        del einsum_81
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_84, einsum_85, einsum_86 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_29, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_29
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_112,
+            split_113,
+        ) = einsum_85
+        del einsum_85
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_114,
+            split_115,
+        ) = einsum_86
+        del einsum_86
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_40 = paddle._C_ops.add(einsum_75, index_select_4)
+        del einsum_75, index_select_4
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_41 = paddle._C_ops.add(add_40, einsum_84)
+        del add_40, einsum_84
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(add_41, full_16, float("0"), True)
+        del add_41
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_4 = paddle._C_ops.subtract(scale_8, scale_4)
+        del scale_8
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_4 = paddle._C_ops.softmax(subtract_4, 3)
+        del subtract_4
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_30 = [dropout_36, reshape_30]
+        del dropout_36, reshape_30
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_87, einsum_88, einsum_89 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_30, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_30
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_116,
+            split_117,
+        ) = einsum_88
+        del einsum_88
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_118,
+            split_119,
+        ) = einsum_89
+        del einsum_89
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_34 = paddle._C_ops.reshape(einsum_87, full_int_array_10)
+        del einsum_87
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_31 = [reshape_34, parameter_336]
+        del parameter_336, reshape_34
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_90, einsum_91, einsum_92 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_31, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_31
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_120,
+            split_121,
+        ) = einsum_91
+        del einsum_91
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_122,
+            split_123,
+        ) = einsum_92
+        del einsum_92
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_90, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_90
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_42 = paddle._C_ops.add(dropout_38, layer_norm_21)
+        del dropout_38, layer_norm_21
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_330, parameter_329, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42, parameter_329, parameter_330
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_28 = paddle._C_ops.matmul(layer_norm_24, parameter_326, False, False)
+        del parameter_326
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_43 = paddle._C_ops.add(matmul_28, parameter_325)
+        del matmul_28, parameter_325
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_4 = paddle._C_ops.relu(add_43)
+        del add_43
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_4
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_40, parameter_324, False, False)
+        del dropout_40, parameter_324
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_44 = paddle._C_ops.add(matmul_29, parameter_323)
+        del matmul_29, parameter_323
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_44, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_44
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_45 = paddle._C_ops.add(dropout_42, layer_norm_24)
+        del dropout_42, layer_norm_24
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_45, parameter_328, parameter_327, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_45, parameter_327, parameter_328
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_30 = paddle._C_ops.matmul(layer_norm_27, parameter_322, False, False)
+        del parameter_322
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_35 = paddle._C_ops.reshape(matmul_30, full_int_array_5)
+        del matmul_30
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_27, parameter_321, False, False)
+        del parameter_321
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(matmul_31, full_int_array_5)
+        del matmul_31
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_32 = paddle._C_ops.matmul(layer_norm_27, parameter_320, False, False)
+        del parameter_320
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(matmul_32, full_int_array_5)
+        del matmul_32
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(dropout_2, parameter_318, False, False)
+        del parameter_318
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(matmul_33, full_int_array_6)
+        del matmul_33
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_46 = paddle._C_ops.add(reshape_35, parameter_315)
+        del parameter_315
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_32 = [add_46, reshape_36]
+        del add_46, reshape_36
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_93, einsum_94, einsum_95 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_32, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_32
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_124,
+            split_125,
+        ) = einsum_94
+        del einsum_94
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_126,
+            split_127,
+        ) = einsum_95
+        del einsum_95
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_47 = paddle._C_ops.add(reshape_35, parameter_317)
+        del parameter_317
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_33 = [add_47, reshape_38]
+        del add_47, reshape_38
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_96, einsum_97, einsum_98 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_33, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_33
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_128,
+            split_129,
+        ) = einsum_97
+        del einsum_97
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_130,
+            split_131,
+        ) = einsum_98
+        del einsum_98
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_39 = paddle._C_ops.reshape(einsum_96, full_int_array_7)
+        del einsum_96
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_5 = paddle._C_ops.slice(
+            reshape_39, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_39
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(slice_5, full_int_array_9)
+        del slice_5
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_5 = paddle._C_ops.index_select(reshape_40, arange_2, 3)
+        del reshape_40
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_48 = paddle._C_ops.add(reshape_35, parameter_316)
+        del parameter_316, reshape_35
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_34 = [add_48, parameter_314]
+        del add_48, parameter_314
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_99, einsum_100, einsum_101 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_34, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_34
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_132,
+            split_133,
+        ) = einsum_100
+        del einsum_100
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_134,
+            split_135,
+        ) = einsum_101
+        del einsum_101
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_35 = [cast_5, einsum_99]
+        del einsum_99
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_102, einsum_103, einsum_104 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_35, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_35
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_136,
+            split_137,
+        ) = einsum_103
+        del einsum_103
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_138,
+            split_139,
+        ) = einsum_104
+        del einsum_104
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_49 = paddle._C_ops.add(einsum_93, index_select_5)
+        del einsum_93, index_select_5
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_50 = paddle._C_ops.add(add_49, einsum_102)
+        del add_49, einsum_102
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(add_50, full_16, float("0"), True)
+        del add_50
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_5 = paddle._C_ops.subtract(scale_9, scale_4)
+        del scale_9
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_5 = paddle._C_ops.softmax(subtract_5, 3)
+        del subtract_5
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_36 = [dropout_44, reshape_37]
+        del dropout_44, reshape_37
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_105, einsum_106, einsum_107 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_36, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_36
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_140,
+            split_141,
+        ) = einsum_106
+        del einsum_106
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_142,
+            split_143,
+        ) = einsum_107
+        del einsum_107
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_41 = paddle._C_ops.reshape(einsum_105, full_int_array_10)
+        del einsum_105
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_37 = [reshape_41, parameter_319]
+        del parameter_319, reshape_41
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_108, einsum_109, einsum_110 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_37, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_37
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_144,
+            split_145,
+        ) = einsum_109
+        del einsum_109
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_146,
+            split_147,
+        ) = einsum_110
+        del einsum_110
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_108, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_108
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_51 = paddle._C_ops.add(dropout_46, layer_norm_27)
+        del dropout_46, layer_norm_27
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_51, parameter_313, parameter_312, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_51, parameter_312, parameter_313
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_30, parameter_309, False, False)
+        del parameter_309
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_52 = paddle._C_ops.add(matmul_34, parameter_308)
+        del matmul_34, parameter_308
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_5 = paddle._C_ops.relu(add_52)
+        del add_52
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_5
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_35 = paddle._C_ops.matmul(dropout_48, parameter_307, False, False)
+        del dropout_48, parameter_307
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_53 = paddle._C_ops.add(matmul_35, parameter_306)
+        del matmul_35, parameter_306
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_53, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_53
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_54 = paddle._C_ops.add(dropout_50, layer_norm_30)
+        del dropout_50, layer_norm_30
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_54, parameter_311, parameter_310, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_54, parameter_310, parameter_311
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_36 = paddle._C_ops.matmul(layer_norm_33, parameter_305, False, False)
+        del parameter_305
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(matmul_36, full_int_array_5)
+        del matmul_36
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_37 = paddle._C_ops.matmul(layer_norm_33, parameter_304, False, False)
+        del parameter_304
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_43 = paddle._C_ops.reshape(matmul_37, full_int_array_5)
+        del matmul_37
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_38 = paddle._C_ops.matmul(layer_norm_33, parameter_303, False, False)
+        del parameter_303
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(matmul_38, full_int_array_5)
+        del matmul_38
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_39 = paddle._C_ops.matmul(dropout_2, parameter_301, False, False)
+        del parameter_301
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(matmul_39, full_int_array_6)
+        del matmul_39
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_55 = paddle._C_ops.add(reshape_42, parameter_298)
+        del parameter_298
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_38 = [add_55, reshape_43]
+        del add_55, reshape_43
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_111, einsum_112, einsum_113 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_38, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_38
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_148,
+            split_149,
+        ) = einsum_112
+        del einsum_112
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_150,
+            split_151,
+        ) = einsum_113
+        del einsum_113
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_56 = paddle._C_ops.add(reshape_42, parameter_300)
+        del parameter_300
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_39 = [add_56, reshape_45]
+        del add_56, reshape_45
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_114, einsum_115, einsum_116 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_39, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_39
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_152,
+            split_153,
+        ) = einsum_115
+        del einsum_115
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_154,
+            split_155,
+        ) = einsum_116
+        del einsum_116
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(einsum_114, full_int_array_7)
+        del einsum_114
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_6 = paddle._C_ops.slice(
+            reshape_46, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_46
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_47 = paddle._C_ops.reshape(slice_6, full_int_array_9)
+        del slice_6
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_6 = paddle._C_ops.index_select(reshape_47, arange_2, 3)
+        del reshape_47
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_57 = paddle._C_ops.add(reshape_42, parameter_299)
+        del parameter_299, reshape_42
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_40 = [add_57, parameter_297]
+        del add_57, parameter_297
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_117, einsum_118, einsum_119 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_40, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_40
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_156,
+            split_157,
+        ) = einsum_118
+        del einsum_118
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_158,
+            split_159,
+        ) = einsum_119
+        del einsum_119
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_41 = [cast_5, einsum_117]
+        del einsum_117
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_120, einsum_121, einsum_122 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_41, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_41
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_160,
+            split_161,
+        ) = einsum_121
+        del einsum_121
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_162,
+            split_163,
+        ) = einsum_122
+        del einsum_122
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_58 = paddle._C_ops.add(einsum_111, index_select_6)
+        del einsum_111, index_select_6
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_59 = paddle._C_ops.add(add_58, einsum_120)
+        del add_58, einsum_120
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(add_59, full_16, float("0"), True)
+        del add_59
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_6 = paddle._C_ops.subtract(scale_10, scale_4)
+        del scale_10
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_6 = paddle._C_ops.softmax(subtract_6, 3)
+        del subtract_6
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_42 = [dropout_52, reshape_44]
+        del dropout_52, reshape_44
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_123, einsum_124, einsum_125 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_42, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_42
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_164,
+            split_165,
+        ) = einsum_124
+        del einsum_124
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_166,
+            split_167,
+        ) = einsum_125
+        del einsum_125
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_48 = paddle._C_ops.reshape(einsum_123, full_int_array_10)
+        del einsum_123
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_43 = [reshape_48, parameter_302]
+        del parameter_302, reshape_48
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_126, einsum_127, einsum_128 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_43, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_43
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_168,
+            split_169,
+        ) = einsum_127
+        del einsum_127
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_170,
+            split_171,
+        ) = einsum_128
+        del einsum_128
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_126, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_126
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_60 = paddle._C_ops.add(dropout_54, layer_norm_33)
+        del dropout_54, layer_norm_33
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_60, parameter_296, parameter_295, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_60, parameter_295, parameter_296
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_40 = paddle._C_ops.matmul(layer_norm_36, parameter_292, False, False)
+        del parameter_292
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_61 = paddle._C_ops.add(matmul_40, parameter_291)
+        del matmul_40, parameter_291
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_6 = paddle._C_ops.relu(add_61)
+        del add_61
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_6
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_41 = paddle._C_ops.matmul(dropout_56, parameter_290, False, False)
+        del dropout_56, parameter_290
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_62 = paddle._C_ops.add(matmul_41, parameter_289)
+        del matmul_41, parameter_289
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_62, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_62
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_63 = paddle._C_ops.add(dropout_58, layer_norm_36)
+        del dropout_58, layer_norm_36
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_63, parameter_294, parameter_293, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_63, parameter_293, parameter_294
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_39, parameter_288, False, False)
+        del parameter_288
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_49 = paddle._C_ops.reshape(matmul_42, full_int_array_5)
+        del matmul_42
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_39, parameter_287, False, False)
+        del parameter_287
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(matmul_43, full_int_array_5)
+        del matmul_43
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_44 = paddle._C_ops.matmul(layer_norm_39, parameter_286, False, False)
+        del parameter_286
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_51 = paddle._C_ops.reshape(matmul_44, full_int_array_5)
+        del matmul_44
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_2, parameter_284, False, False)
+        del parameter_284
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(matmul_45, full_int_array_6)
+        del matmul_45
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_64 = paddle._C_ops.add(reshape_49, parameter_281)
+        del parameter_281
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_44 = [add_64, reshape_50]
+        del add_64, reshape_50
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_129, einsum_130, einsum_131 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_44, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_44
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_172,
+            split_173,
+        ) = einsum_130
+        del einsum_130
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_174,
+            split_175,
+        ) = einsum_131
+        del einsum_131
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_65 = paddle._C_ops.add(reshape_49, parameter_283)
+        del parameter_283
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_45 = [add_65, reshape_52]
+        del add_65, reshape_52
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_132, einsum_133, einsum_134 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_45, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_45
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_176,
+            split_177,
+        ) = einsum_133
+        del einsum_133
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_178,
+            split_179,
+        ) = einsum_134
+        del einsum_134
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_53 = paddle._C_ops.reshape(einsum_132, full_int_array_7)
+        del einsum_132
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_7 = paddle._C_ops.slice(
+            reshape_53, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_53
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(slice_7, full_int_array_9)
+        del slice_7
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_7 = paddle._C_ops.index_select(reshape_54, arange_2, 3)
+        del reshape_54
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_66 = paddle._C_ops.add(reshape_49, parameter_282)
+        del parameter_282, reshape_49
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_46 = [add_66, parameter_280]
+        del add_66, parameter_280
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_135, einsum_136, einsum_137 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_46, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_46
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_180,
+            split_181,
+        ) = einsum_136
+        del einsum_136
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_182,
+            split_183,
+        ) = einsum_137
+        del einsum_137
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_47 = [cast_5, einsum_135]
+        del einsum_135
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_138, einsum_139, einsum_140 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_47, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_47
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_184,
+            split_185,
+        ) = einsum_139
+        del einsum_139
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_186,
+            split_187,
+        ) = einsum_140
+        del einsum_140
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_67 = paddle._C_ops.add(einsum_129, index_select_7)
+        del einsum_129, index_select_7
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_68 = paddle._C_ops.add(add_67, einsum_138)
+        del add_67, einsum_138
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(add_68, full_16, float("0"), True)
+        del add_68
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_7 = paddle._C_ops.subtract(scale_11, scale_4)
+        del scale_11
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_7 = paddle._C_ops.softmax(subtract_7, 3)
+        del subtract_7
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_48 = [dropout_60, reshape_51]
+        del dropout_60, reshape_51
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_141, einsum_142, einsum_143 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_48, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_48
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_188,
+            split_189,
+        ) = einsum_142
+        del einsum_142
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_190,
+            split_191,
+        ) = einsum_143
+        del einsum_143
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_55 = paddle._C_ops.reshape(einsum_141, full_int_array_10)
+        del einsum_141
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_49 = [reshape_55, parameter_285]
+        del parameter_285, reshape_55
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_144, einsum_145, einsum_146 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_49, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_49
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_192,
+            split_193,
+        ) = einsum_145
+        del einsum_145
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_194,
+            split_195,
+        ) = einsum_146
+        del einsum_146
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_144, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_144
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_69 = paddle._C_ops.add(dropout_62, layer_norm_39)
+        del dropout_62, layer_norm_39
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_69, parameter_279, parameter_278, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_69, parameter_278, parameter_279
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_46 = paddle._C_ops.matmul(layer_norm_42, parameter_275, False, False)
+        del parameter_275
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_70 = paddle._C_ops.add(matmul_46, parameter_274)
+        del matmul_46, parameter_274
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_7 = paddle._C_ops.relu(add_70)
+        del add_70
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_7
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_47 = paddle._C_ops.matmul(dropout_64, parameter_273, False, False)
+        del dropout_64, parameter_273
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_71 = paddle._C_ops.add(matmul_47, parameter_272)
+        del matmul_47, parameter_272
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_71, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_71
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_72 = paddle._C_ops.add(dropout_66, layer_norm_42)
+        del dropout_66, layer_norm_42
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_72, parameter_277, parameter_276, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_72, parameter_276, parameter_277
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_48 = paddle._C_ops.matmul(layer_norm_45, parameter_271, False, False)
+        del parameter_271
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(matmul_48, full_int_array_5)
+        del matmul_48
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_45, parameter_270, False, False)
+        del parameter_270
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_57 = paddle._C_ops.reshape(matmul_49, full_int_array_5)
+        del matmul_49
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_45, parameter_269, False, False)
+        del parameter_269
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(matmul_50, full_int_array_5)
+        del matmul_50
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_51 = paddle._C_ops.matmul(dropout_2, parameter_267, False, False)
+        del parameter_267
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_59 = paddle._C_ops.reshape(matmul_51, full_int_array_6)
+        del matmul_51
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_73 = paddle._C_ops.add(reshape_56, parameter_264)
+        del parameter_264
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_50 = [add_73, reshape_57]
+        del add_73, reshape_57
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_147, einsum_148, einsum_149 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_50, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_50
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_196,
+            split_197,
+        ) = einsum_148
+        del einsum_148
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_198,
+            split_199,
+        ) = einsum_149
+        del einsum_149
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_74 = paddle._C_ops.add(reshape_56, parameter_266)
+        del parameter_266
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_51 = [add_74, reshape_59]
+        del add_74, reshape_59
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_150, einsum_151, einsum_152 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_51, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_51
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_200,
+            split_201,
+        ) = einsum_151
+        del einsum_151
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_202,
+            split_203,
+        ) = einsum_152
+        del einsum_152
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(einsum_150, full_int_array_7)
+        del einsum_150
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_8 = paddle._C_ops.slice(
+            reshape_60, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_60
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_61 = paddle._C_ops.reshape(slice_8, full_int_array_9)
+        del slice_8
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_8 = paddle._C_ops.index_select(reshape_61, arange_2, 3)
+        del reshape_61
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_75 = paddle._C_ops.add(reshape_56, parameter_265)
+        del parameter_265, reshape_56
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_52 = [add_75, parameter_263]
+        del add_75, parameter_263
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_153, einsum_154, einsum_155 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_52, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_52
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_204,
+            split_205,
+        ) = einsum_154
+        del einsum_154
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_206,
+            split_207,
+        ) = einsum_155
+        del einsum_155
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_53 = [cast_5, einsum_153]
+        del einsum_153
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_156, einsum_157, einsum_158 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_53, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_53
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_208,
+            split_209,
+        ) = einsum_157
+        del einsum_157
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_210,
+            split_211,
+        ) = einsum_158
+        del einsum_158
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_76 = paddle._C_ops.add(einsum_147, index_select_8)
+        del einsum_147, index_select_8
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_77 = paddle._C_ops.add(add_76, einsum_156)
+        del add_76, einsum_156
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(add_77, full_16, float("0"), True)
+        del add_77
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_8 = paddle._C_ops.subtract(scale_12, scale_4)
+        del scale_12
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_8 = paddle._C_ops.softmax(subtract_8, 3)
+        del subtract_8
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_54 = [dropout_68, reshape_58]
+        del dropout_68, reshape_58
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_159, einsum_160, einsum_161 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_54, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_54
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_212,
+            split_213,
+        ) = einsum_160
+        del einsum_160
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_214,
+            split_215,
+        ) = einsum_161
+        del einsum_161
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_62 = paddle._C_ops.reshape(einsum_159, full_int_array_10)
+        del einsum_159
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_55 = [reshape_62, parameter_268]
+        del parameter_268, reshape_62
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_162, einsum_163, einsum_164 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_55, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_55
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_216,
+            split_217,
+        ) = einsum_163
+        del einsum_163
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_218,
+            split_219,
+        ) = einsum_164
+        del einsum_164
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_162, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_162
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_78 = paddle._C_ops.add(dropout_70, layer_norm_45)
+        del dropout_70, layer_norm_45
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_78, parameter_262, parameter_261, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_78, parameter_261, parameter_262
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_52 = paddle._C_ops.matmul(layer_norm_48, parameter_258, False, False)
+        del parameter_258
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_79 = paddle._C_ops.add(matmul_52, parameter_257)
+        del matmul_52, parameter_257
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_8 = paddle._C_ops.relu(add_79)
+        del add_79
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_8
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_72, parameter_256, False, False)
+        del dropout_72, parameter_256
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_80 = paddle._C_ops.add(matmul_53, parameter_255)
+        del matmul_53, parameter_255
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_80, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_80
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_81 = paddle._C_ops.add(dropout_74, layer_norm_48)
+        del dropout_74, layer_norm_48
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_81, parameter_260, parameter_259, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_81, parameter_259, parameter_260
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(layer_norm_51, parameter_254, False, False)
+        del parameter_254
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_63 = paddle._C_ops.reshape(matmul_54, full_int_array_5)
+        del matmul_54
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_51, parameter_253, False, False)
+        del parameter_253
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(matmul_55, full_int_array_5)
+        del matmul_55
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_56 = paddle._C_ops.matmul(layer_norm_51, parameter_252, False, False)
+        del parameter_252
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_65 = paddle._C_ops.reshape(matmul_56, full_int_array_5)
+        del matmul_56
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_57 = paddle._C_ops.matmul(dropout_2, parameter_250, False, False)
+        del parameter_250
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(matmul_57, full_int_array_6)
+        del matmul_57
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_82 = paddle._C_ops.add(reshape_63, parameter_247)
+        del parameter_247
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_56 = [add_82, reshape_64]
+        del add_82, reshape_64
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_165, einsum_166, einsum_167 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_56, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_56
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_220,
+            split_221,
+        ) = einsum_166
+        del einsum_166
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_222,
+            split_223,
+        ) = einsum_167
+        del einsum_167
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_83 = paddle._C_ops.add(reshape_63, parameter_249)
+        del parameter_249
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_57 = [add_83, reshape_66]
+        del add_83, reshape_66
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_168, einsum_169, einsum_170 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_57, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_57
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_224,
+            split_225,
+        ) = einsum_169
+        del einsum_169
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_226,
+            split_227,
+        ) = einsum_170
+        del einsum_170
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_67 = paddle._C_ops.reshape(einsum_168, full_int_array_7)
+        del einsum_168
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_9 = paddle._C_ops.slice(
+            reshape_67, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_67
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(slice_9, full_int_array_9)
+        del slice_9
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_9 = paddle._C_ops.index_select(reshape_68, arange_2, 3)
+        del reshape_68
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_84 = paddle._C_ops.add(reshape_63, parameter_248)
+        del parameter_248, reshape_63
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_58 = [add_84, parameter_246]
+        del add_84, parameter_246
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_171, einsum_172, einsum_173 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_58, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_58
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_228,
+            split_229,
+        ) = einsum_172
+        del einsum_172
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_230,
+            split_231,
+        ) = einsum_173
+        del einsum_173
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_59 = [cast_5, einsum_171]
+        del einsum_171
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_174, einsum_175, einsum_176 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_59, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_59
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_232,
+            split_233,
+        ) = einsum_175
+        del einsum_175
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_234,
+            split_235,
+        ) = einsum_176
+        del einsum_176
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_85 = paddle._C_ops.add(einsum_165, index_select_9)
+        del einsum_165, index_select_9
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_86 = paddle._C_ops.add(add_85, einsum_174)
+        del add_85, einsum_174
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(add_86, full_16, float("0"), True)
+        del add_86
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_9 = paddle._C_ops.subtract(scale_13, scale_4)
+        del scale_13
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_9 = paddle._C_ops.softmax(subtract_9, 3)
+        del subtract_9
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_60 = [dropout_76, reshape_65]
+        del dropout_76, reshape_65
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_177, einsum_178, einsum_179 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_60, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_60
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_236,
+            split_237,
+        ) = einsum_178
+        del einsum_178
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_238,
+            split_239,
+        ) = einsum_179
+        del einsum_179
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_69 = paddle._C_ops.reshape(einsum_177, full_int_array_10)
+        del einsum_177
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_61 = [reshape_69, parameter_251]
+        del parameter_251, reshape_69
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_180, einsum_181, einsum_182 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_61, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_61
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_240,
+            split_241,
+        ) = einsum_181
+        del einsum_181
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_242,
+            split_243,
+        ) = einsum_182
+        del einsum_182
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_180, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_180
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_87 = paddle._C_ops.add(dropout_78, layer_norm_51)
+        del dropout_78, layer_norm_51
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_87, parameter_245, parameter_244, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_87, parameter_244, parameter_245
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_54, parameter_241, False, False)
+        del parameter_241
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_88 = paddle._C_ops.add(matmul_58, parameter_240)
+        del matmul_58, parameter_240
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_9 = paddle._C_ops.relu(add_88)
+        del add_88
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_9
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_59 = paddle._C_ops.matmul(dropout_80, parameter_239, False, False)
+        del dropout_80, parameter_239
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_89 = paddle._C_ops.add(matmul_59, parameter_238)
+        del matmul_59, parameter_238
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_89, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_89
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_90 = paddle._C_ops.add(dropout_82, layer_norm_54)
+        del dropout_82, layer_norm_54
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_90, parameter_243, parameter_242, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_90, parameter_242, parameter_243
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_60 = paddle._C_ops.matmul(layer_norm_57, parameter_237, False, False)
+        del parameter_237
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(matmul_60, full_int_array_5)
+        del matmul_60
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_61 = paddle._C_ops.matmul(layer_norm_57, parameter_236, False, False)
+        del parameter_236
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_71 = paddle._C_ops.reshape(matmul_61, full_int_array_5)
+        del matmul_61
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(layer_norm_57, parameter_235, False, False)
+        del parameter_235
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_72 = paddle._C_ops.reshape(matmul_62, full_int_array_5)
+        del matmul_62
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_63 = paddle._C_ops.matmul(dropout_2, parameter_233, False, False)
+        del parameter_233
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_73 = paddle._C_ops.reshape(matmul_63, full_int_array_6)
+        del matmul_63
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_91 = paddle._C_ops.add(reshape_70, parameter_230)
+        del parameter_230
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_62 = [add_91, reshape_71]
+        del add_91, reshape_71
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_183, einsum_184, einsum_185 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_62, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_62
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_244,
+            split_245,
+        ) = einsum_184
+        del einsum_184
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_246,
+            split_247,
+        ) = einsum_185
+        del einsum_185
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_92 = paddle._C_ops.add(reshape_70, parameter_232)
+        del parameter_232
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_63 = [add_92, reshape_73]
+        del add_92, reshape_73
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_186, einsum_187, einsum_188 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_63, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_63
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_248,
+            split_249,
+        ) = einsum_187
+        del einsum_187
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_250,
+            split_251,
+        ) = einsum_188
+        del einsum_188
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_74 = paddle._C_ops.reshape(einsum_186, full_int_array_7)
+        del einsum_186
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_10 = paddle._C_ops.slice(
+            reshape_74, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_74
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_75 = paddle._C_ops.reshape(slice_10, full_int_array_9)
+        del slice_10
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_10 = paddle._C_ops.index_select(reshape_75, arange_2, 3)
+        del reshape_75
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_93 = paddle._C_ops.add(reshape_70, parameter_231)
+        del parameter_231, reshape_70
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_64 = [add_93, parameter_229]
+        del add_93, parameter_229
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_189, einsum_190, einsum_191 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_64, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_64
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_252,
+            split_253,
+        ) = einsum_190
+        del einsum_190
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_254,
+            split_255,
+        ) = einsum_191
+        del einsum_191
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_65 = [cast_5, einsum_189]
+        del einsum_189
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_192, einsum_193, einsum_194 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_65, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_65
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_256,
+            split_257,
+        ) = einsum_193
+        del einsum_193
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_258,
+            split_259,
+        ) = einsum_194
+        del einsum_194
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_94 = paddle._C_ops.add(einsum_183, index_select_10)
+        del einsum_183, index_select_10
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_95 = paddle._C_ops.add(add_94, einsum_192)
+        del add_94, einsum_192
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(add_95, full_16, float("0"), True)
+        del add_95
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_10 = paddle._C_ops.subtract(scale_14, scale_4)
+        del scale_14
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_10 = paddle._C_ops.softmax(subtract_10, 3)
+        del subtract_10
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_66 = [dropout_84, reshape_72]
+        del dropout_84, reshape_72
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_195, einsum_196, einsum_197 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_66, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_66
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_260,
+            split_261,
+        ) = einsum_196
+        del einsum_196
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_262,
+            split_263,
+        ) = einsum_197
+        del einsum_197
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_76 = paddle._C_ops.reshape(einsum_195, full_int_array_10)
+        del einsum_195
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_67 = [reshape_76, parameter_234]
+        del parameter_234, reshape_76
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_198, einsum_199, einsum_200 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_67, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_67
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_264,
+            split_265,
+        ) = einsum_199
+        del einsum_199
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_266,
+            split_267,
+        ) = einsum_200
+        del einsum_200
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_198, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_198
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_96 = paddle._C_ops.add(dropout_86, layer_norm_57)
+        del dropout_86, layer_norm_57
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_96, parameter_228, parameter_227, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_96, parameter_227, parameter_228
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_64 = paddle._C_ops.matmul(layer_norm_60, parameter_224, False, False)
+        del parameter_224
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_97 = paddle._C_ops.add(matmul_64, parameter_223)
+        del matmul_64, parameter_223
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_10 = paddle._C_ops.relu(add_97)
+        del add_97
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_10
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_65 = paddle._C_ops.matmul(dropout_88, parameter_222, False, False)
+        del dropout_88, parameter_222
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_98 = paddle._C_ops.add(matmul_65, parameter_221)
+        del matmul_65, parameter_221
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_98, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_98
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_99 = paddle._C_ops.add(dropout_90, layer_norm_60)
+        del dropout_90, layer_norm_60
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_99, parameter_226, parameter_225, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_99, parameter_225, parameter_226
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_63, parameter_220, False, False)
+        del parameter_220
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_77 = paddle._C_ops.reshape(matmul_66, full_int_array_5)
+        del matmul_66
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_63, parameter_219, False, False)
+        del parameter_219
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_78 = paddle._C_ops.reshape(matmul_67, full_int_array_5)
+        del matmul_67
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_68 = paddle._C_ops.matmul(layer_norm_63, parameter_218, False, False)
+        del parameter_218
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_79 = paddle._C_ops.reshape(matmul_68, full_int_array_5)
+        del matmul_68
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_69 = paddle._C_ops.matmul(dropout_2, parameter_216, False, False)
+        del parameter_216
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_80 = paddle._C_ops.reshape(matmul_69, full_int_array_6)
+        del matmul_69
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_100 = paddle._C_ops.add(reshape_77, parameter_213)
+        del parameter_213
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_68 = [add_100, reshape_78]
+        del add_100, reshape_78
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_201, einsum_202, einsum_203 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_68, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_68
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_268,
+            split_269,
+        ) = einsum_202
+        del einsum_202
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_270,
+            split_271,
+        ) = einsum_203
+        del einsum_203
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_101 = paddle._C_ops.add(reshape_77, parameter_215)
+        del parameter_215
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_69 = [add_101, reshape_80]
+        del add_101, reshape_80
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_204, einsum_205, einsum_206 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_69, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_69
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_272,
+            split_273,
+        ) = einsum_205
+        del einsum_205
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_274,
+            split_275,
+        ) = einsum_206
+        del einsum_206
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_81 = paddle._C_ops.reshape(einsum_204, full_int_array_7)
+        del einsum_204
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_11 = paddle._C_ops.slice(
+            reshape_81, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_81
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_82 = paddle._C_ops.reshape(slice_11, full_int_array_9)
+        del slice_11
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_11 = paddle._C_ops.index_select(reshape_82, arange_2, 3)
+        del reshape_82
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_102 = paddle._C_ops.add(reshape_77, parameter_214)
+        del parameter_214, reshape_77
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_70 = [add_102, parameter_212]
+        del add_102, parameter_212
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_207, einsum_208, einsum_209 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_70, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_70
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_276,
+            split_277,
+        ) = einsum_208
+        del einsum_208
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_278,
+            split_279,
+        ) = einsum_209
+        del einsum_209
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_71 = [cast_5, einsum_207]
+        del einsum_207
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_210, einsum_211, einsum_212 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_71, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_71
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_280,
+            split_281,
+        ) = einsum_211
+        del einsum_211
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_282,
+            split_283,
+        ) = einsum_212
+        del einsum_212
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_103 = paddle._C_ops.add(einsum_201, index_select_11)
+        del einsum_201, index_select_11
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_104 = paddle._C_ops.add(add_103, einsum_210)
+        del add_103, einsum_210
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(add_104, full_16, float("0"), True)
+        del add_104
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_11 = paddle._C_ops.subtract(scale_15, scale_4)
+        del scale_15
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_11 = paddle._C_ops.softmax(subtract_11, 3)
+        del subtract_11
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_72 = [dropout_92, reshape_79]
+        del dropout_92, reshape_79
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_213, einsum_214, einsum_215 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_72, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_72
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_284,
+            split_285,
+        ) = einsum_214
+        del einsum_214
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_286,
+            split_287,
+        ) = einsum_215
+        del einsum_215
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_83 = paddle._C_ops.reshape(einsum_213, full_int_array_10)
+        del einsum_213
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_73 = [reshape_83, parameter_217]
+        del parameter_217, reshape_83
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_216, einsum_217, einsum_218 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_73, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_73
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_288,
+            split_289,
+        ) = einsum_217
+        del einsum_217
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_290,
+            split_291,
+        ) = einsum_218
+        del einsum_218
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_216, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_216
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_105 = paddle._C_ops.add(dropout_94, layer_norm_63)
+        del dropout_94, layer_norm_63
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_105, parameter_211, parameter_210, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_105, parameter_210, parameter_211
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_70 = paddle._C_ops.matmul(layer_norm_66, parameter_207, False, False)
+        del parameter_207
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_106 = paddle._C_ops.add(matmul_70, parameter_206)
+        del matmul_70, parameter_206
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_11 = paddle._C_ops.relu(add_106)
+        del add_106
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_11
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_71 = paddle._C_ops.matmul(dropout_96, parameter_205, False, False)
+        del dropout_96, parameter_205
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_107 = paddle._C_ops.add(matmul_71, parameter_204)
+        del matmul_71, parameter_204
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_107, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_107
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_108 = paddle._C_ops.add(dropout_98, layer_norm_66)
+        del dropout_98, layer_norm_66
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_108, parameter_209, parameter_208, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_108, parameter_208, parameter_209
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_72 = paddle._C_ops.matmul(layer_norm_69, parameter_203, False, False)
+        del parameter_203
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_84 = paddle._C_ops.reshape(matmul_72, full_int_array_5)
+        del matmul_72
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_73 = paddle._C_ops.matmul(layer_norm_69, parameter_202, False, False)
+        del parameter_202
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_85 = paddle._C_ops.reshape(matmul_73, full_int_array_5)
+        del matmul_73
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_69, parameter_201, False, False)
+        del parameter_201
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_86 = paddle._C_ops.reshape(matmul_74, full_int_array_5)
+        del matmul_74
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_75 = paddle._C_ops.matmul(dropout_2, parameter_199, False, False)
+        del parameter_199
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_87 = paddle._C_ops.reshape(matmul_75, full_int_array_6)
+        del matmul_75
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_109 = paddle._C_ops.add(reshape_84, parameter_196)
+        del parameter_196
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_74 = [add_109, reshape_85]
+        del add_109, reshape_85
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_219, einsum_220, einsum_221 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_74, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_74
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_292,
+            split_293,
+        ) = einsum_220
+        del einsum_220
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_294,
+            split_295,
+        ) = einsum_221
+        del einsum_221
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_110 = paddle._C_ops.add(reshape_84, parameter_198)
+        del parameter_198
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_75 = [add_110, reshape_87]
+        del add_110, reshape_87
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_222, einsum_223, einsum_224 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_75, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_75
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_296,
+            split_297,
+        ) = einsum_223
+        del einsum_223
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_298,
+            split_299,
+        ) = einsum_224
+        del einsum_224
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_88 = paddle._C_ops.reshape(einsum_222, full_int_array_7)
+        del einsum_222
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_12 = paddle._C_ops.slice(
+            reshape_88, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_88
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_89 = paddle._C_ops.reshape(slice_12, full_int_array_9)
+        del slice_12
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_12 = paddle._C_ops.index_select(reshape_89, arange_2, 3)
+        del reshape_89
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_111 = paddle._C_ops.add(reshape_84, parameter_197)
+        del parameter_197, reshape_84
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_76 = [add_111, parameter_195]
+        del add_111, parameter_195
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_225, einsum_226, einsum_227 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_76, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_76
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_300,
+            split_301,
+        ) = einsum_226
+        del einsum_226
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_302,
+            split_303,
+        ) = einsum_227
+        del einsum_227
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_77 = [cast_5, einsum_225]
+        del einsum_225
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_228, einsum_229, einsum_230 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_77, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_77
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_304,
+            split_305,
+        ) = einsum_229
+        del einsum_229
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_306,
+            split_307,
+        ) = einsum_230
+        del einsum_230
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_112 = paddle._C_ops.add(einsum_219, index_select_12)
+        del einsum_219, index_select_12
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_113 = paddle._C_ops.add(add_112, einsum_228)
+        del add_112, einsum_228
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_16 = paddle._C_ops.scale(add_113, full_16, float("0"), True)
+        del add_113
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_12 = paddle._C_ops.subtract(scale_16, scale_4)
+        del scale_16
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_12 = paddle._C_ops.softmax(subtract_12, 3)
+        del subtract_12
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_12, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_12
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_78 = [dropout_100, reshape_86]
+        del dropout_100, reshape_86
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_231, einsum_232, einsum_233 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_78, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_78
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_308,
+            split_309,
+        ) = einsum_232
+        del einsum_232
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_310,
+            split_311,
+        ) = einsum_233
+        del einsum_233
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_90 = paddle._C_ops.reshape(einsum_231, full_int_array_10)
+        del einsum_231
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_79 = [reshape_90, parameter_200]
+        del parameter_200, reshape_90
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_234, einsum_235, einsum_236 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_79, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_79
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_312,
+            split_313,
+        ) = einsum_235
+        del einsum_235
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_314,
+            split_315,
+        ) = einsum_236
+        del einsum_236
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_102, dropout_103 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_234, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_234
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_114 = paddle._C_ops.add(dropout_102, layer_norm_69)
+        del dropout_102, layer_norm_69
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_72, layer_norm_73, layer_norm_74 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_114, parameter_194, parameter_193, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_114, parameter_193, parameter_194
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_76 = paddle._C_ops.matmul(layer_norm_72, parameter_190, False, False)
+        del parameter_190
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_115 = paddle._C_ops.add(matmul_76, parameter_189)
+        del matmul_76, parameter_189
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_12 = paddle._C_ops.relu(add_115)
+        del add_115
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_104, dropout_105 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_12, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_12
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_77 = paddle._C_ops.matmul(dropout_104, parameter_188, False, False)
+        del dropout_104, parameter_188
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_116 = paddle._C_ops.add(matmul_77, parameter_187)
+        del matmul_77, parameter_187
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_106, dropout_107 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_116, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_116
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_117 = paddle._C_ops.add(dropout_106, layer_norm_72)
+        del dropout_106, layer_norm_72
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_75, layer_norm_76, layer_norm_77 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_117, parameter_192, parameter_191, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_117, parameter_191, parameter_192
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_78 = paddle._C_ops.matmul(layer_norm_75, parameter_186, False, False)
+        del parameter_186
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_91 = paddle._C_ops.reshape(matmul_78, full_int_array_5)
+        del matmul_78
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_79 = paddle._C_ops.matmul(layer_norm_75, parameter_185, False, False)
+        del parameter_185
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_92 = paddle._C_ops.reshape(matmul_79, full_int_array_5)
+        del matmul_79
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_80 = paddle._C_ops.matmul(layer_norm_75, parameter_184, False, False)
+        del parameter_184
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_93 = paddle._C_ops.reshape(matmul_80, full_int_array_5)
+        del matmul_80
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_81 = paddle._C_ops.matmul(dropout_2, parameter_182, False, False)
+        del parameter_182
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_94 = paddle._C_ops.reshape(matmul_81, full_int_array_6)
+        del matmul_81
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_118 = paddle._C_ops.add(reshape_91, parameter_179)
+        del parameter_179
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_80 = [add_118, reshape_92]
+        del add_118, reshape_92
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_237, einsum_238, einsum_239 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_80, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_80
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_316,
+            split_317,
+        ) = einsum_238
+        del einsum_238
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_318,
+            split_319,
+        ) = einsum_239
+        del einsum_239
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_119 = paddle._C_ops.add(reshape_91, parameter_181)
+        del parameter_181
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_81 = [add_119, reshape_94]
+        del add_119, reshape_94
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_240, einsum_241, einsum_242 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_81, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_81
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_320,
+            split_321,
+        ) = einsum_241
+        del einsum_241
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_322,
+            split_323,
+        ) = einsum_242
+        del einsum_242
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_95 = paddle._C_ops.reshape(einsum_240, full_int_array_7)
+        del einsum_240
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_13 = paddle._C_ops.slice(
+            reshape_95, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_95
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_96 = paddle._C_ops.reshape(slice_13, full_int_array_9)
+        del slice_13
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_13 = paddle._C_ops.index_select(reshape_96, arange_2, 3)
+        del reshape_96
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_120 = paddle._C_ops.add(reshape_91, parameter_180)
+        del parameter_180, reshape_91
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_82 = [add_120, parameter_178]
+        del add_120, parameter_178
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_243, einsum_244, einsum_245 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_82, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_82
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_324,
+            split_325,
+        ) = einsum_244
+        del einsum_244
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_326,
+            split_327,
+        ) = einsum_245
+        del einsum_245
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_83 = [cast_5, einsum_243]
+        del einsum_243
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_246, einsum_247, einsum_248 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_83, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_83
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_328,
+            split_329,
+        ) = einsum_247
+        del einsum_247
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_330,
+            split_331,
+        ) = einsum_248
+        del einsum_248
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_121 = paddle._C_ops.add(einsum_237, index_select_13)
+        del einsum_237, index_select_13
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_122 = paddle._C_ops.add(add_121, einsum_246)
+        del add_121, einsum_246
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_17 = paddle._C_ops.scale(add_122, full_16, float("0"), True)
+        del add_122
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_13 = paddle._C_ops.subtract(scale_17, scale_4)
+        del scale_17
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_13 = paddle._C_ops.softmax(subtract_13, 3)
+        del subtract_13
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_108, dropout_109 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_13, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_13
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_84 = [dropout_108, reshape_93]
+        del dropout_108, reshape_93
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_249, einsum_250, einsum_251 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_84, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_84
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_332,
+            split_333,
+        ) = einsum_250
+        del einsum_250
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_334,
+            split_335,
+        ) = einsum_251
+        del einsum_251
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_97 = paddle._C_ops.reshape(einsum_249, full_int_array_10)
+        del einsum_249
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_85 = [reshape_97, parameter_183]
+        del parameter_183, reshape_97
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_252, einsum_253, einsum_254 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_85, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_85
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_336,
+            split_337,
+        ) = einsum_253
+        del einsum_253
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_338,
+            split_339,
+        ) = einsum_254
+        del einsum_254
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_110, dropout_111 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_252, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_252
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_123 = paddle._C_ops.add(dropout_110, layer_norm_75)
+        del dropout_110, layer_norm_75
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_78, layer_norm_79, layer_norm_80 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_123, parameter_177, parameter_176, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_123, parameter_176, parameter_177
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_78, parameter_173, False, False)
+        del parameter_173
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_124 = paddle._C_ops.add(matmul_82, parameter_172)
+        del matmul_82, parameter_172
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_13 = paddle._C_ops.relu(add_124)
+        del add_124
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_112, dropout_113 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_13, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_13
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_83 = paddle._C_ops.matmul(dropout_112, parameter_171, False, False)
+        del dropout_112, parameter_171
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_125 = paddle._C_ops.add(matmul_83, parameter_170)
+        del matmul_83, parameter_170
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_114, dropout_115 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_125, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_125
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_126 = paddle._C_ops.add(dropout_114, layer_norm_78)
+        del dropout_114, layer_norm_78
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_81, layer_norm_82, layer_norm_83 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_126, parameter_175, parameter_174, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_126, parameter_174, parameter_175
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_84 = paddle._C_ops.matmul(layer_norm_81, parameter_169, False, False)
+        del parameter_169
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_98 = paddle._C_ops.reshape(matmul_84, full_int_array_5)
+        del matmul_84
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_85 = paddle._C_ops.matmul(layer_norm_81, parameter_168, False, False)
+        del parameter_168
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_99 = paddle._C_ops.reshape(matmul_85, full_int_array_5)
+        del matmul_85
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_86 = paddle._C_ops.matmul(layer_norm_81, parameter_167, False, False)
+        del parameter_167
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_100 = paddle._C_ops.reshape(matmul_86, full_int_array_5)
+        del matmul_86
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_87 = paddle._C_ops.matmul(dropout_2, parameter_165, False, False)
+        del parameter_165
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_101 = paddle._C_ops.reshape(matmul_87, full_int_array_6)
+        del matmul_87
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_127 = paddle._C_ops.add(reshape_98, parameter_162)
+        del parameter_162
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_86 = [add_127, reshape_99]
+        del add_127, reshape_99
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_255, einsum_256, einsum_257 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_86, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_86
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_340,
+            split_341,
+        ) = einsum_256
+        del einsum_256
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_342,
+            split_343,
+        ) = einsum_257
+        del einsum_257
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_128 = paddle._C_ops.add(reshape_98, parameter_164)
+        del parameter_164
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_87 = [add_128, reshape_101]
+        del add_128, reshape_101
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_258, einsum_259, einsum_260 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_87, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_87
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_344,
+            split_345,
+        ) = einsum_259
+        del einsum_259
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_346,
+            split_347,
+        ) = einsum_260
+        del einsum_260
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_102 = paddle._C_ops.reshape(einsum_258, full_int_array_7)
+        del einsum_258
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_14 = paddle._C_ops.slice(
+            reshape_102, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_102
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_103 = paddle._C_ops.reshape(slice_14, full_int_array_9)
+        del slice_14
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_14 = paddle._C_ops.index_select(reshape_103, arange_2, 3)
+        del reshape_103
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_129 = paddle._C_ops.add(reshape_98, parameter_163)
+        del parameter_163, reshape_98
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_88 = [add_129, parameter_161]
+        del add_129, parameter_161
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_261, einsum_262, einsum_263 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_88, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_88
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_348,
+            split_349,
+        ) = einsum_262
+        del einsum_262
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_350,
+            split_351,
+        ) = einsum_263
+        del einsum_263
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_89 = [cast_5, einsum_261]
+        del einsum_261
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_264, einsum_265, einsum_266 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_89, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_89
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_352,
+            split_353,
+        ) = einsum_265
+        del einsum_265
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_354,
+            split_355,
+        ) = einsum_266
+        del einsum_266
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_130 = paddle._C_ops.add(einsum_255, index_select_14)
+        del einsum_255, index_select_14
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_131 = paddle._C_ops.add(add_130, einsum_264)
+        del add_130, einsum_264
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_18 = paddle._C_ops.scale(add_131, full_16, float("0"), True)
+        del add_131
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_14 = paddle._C_ops.subtract(scale_18, scale_4)
+        del scale_18
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_14 = paddle._C_ops.softmax(subtract_14, 3)
+        del subtract_14
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_116, dropout_117 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_14, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_14
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_90 = [dropout_116, reshape_100]
+        del dropout_116, reshape_100
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_267, einsum_268, einsum_269 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_90, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_90
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_356,
+            split_357,
+        ) = einsum_268
+        del einsum_268
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_358,
+            split_359,
+        ) = einsum_269
+        del einsum_269
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_104 = paddle._C_ops.reshape(einsum_267, full_int_array_10)
+        del einsum_267
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_91 = [reshape_104, parameter_166]
+        del parameter_166, reshape_104
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_270, einsum_271, einsum_272 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_91, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_91
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_360,
+            split_361,
+        ) = einsum_271
+        del einsum_271
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_362,
+            split_363,
+        ) = einsum_272
+        del einsum_272
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_118, dropout_119 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_270, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_270
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_132 = paddle._C_ops.add(dropout_118, layer_norm_81)
+        del dropout_118, layer_norm_81
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_84, layer_norm_85, layer_norm_86 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_132, parameter_160, parameter_159, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_132, parameter_159, parameter_160
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_88 = paddle._C_ops.matmul(layer_norm_84, parameter_156, False, False)
+        del parameter_156
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_133 = paddle._C_ops.add(matmul_88, parameter_155)
+        del matmul_88, parameter_155
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_14 = paddle._C_ops.relu(add_133)
+        del add_133
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_120, dropout_121 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_14, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_14
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_89 = paddle._C_ops.matmul(dropout_120, parameter_154, False, False)
+        del dropout_120, parameter_154
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_134 = paddle._C_ops.add(matmul_89, parameter_153)
+        del matmul_89, parameter_153
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_122, dropout_123 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_134, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_134
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_135 = paddle._C_ops.add(dropout_122, layer_norm_84)
+        del dropout_122, layer_norm_84
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_87, layer_norm_88, layer_norm_89 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_135, parameter_158, parameter_157, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_135, parameter_157, parameter_158
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_87, parameter_152, False, False)
+        del parameter_152
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_105 = paddle._C_ops.reshape(matmul_90, full_int_array_5)
+        del matmul_90
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_87, parameter_151, False, False)
+        del parameter_151
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_106 = paddle._C_ops.reshape(matmul_91, full_int_array_5)
+        del matmul_91
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_92 = paddle._C_ops.matmul(layer_norm_87, parameter_150, False, False)
+        del parameter_150
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_107 = paddle._C_ops.reshape(matmul_92, full_int_array_5)
+        del matmul_92
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_93 = paddle._C_ops.matmul(dropout_2, parameter_148, False, False)
+        del parameter_148
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_108 = paddle._C_ops.reshape(matmul_93, full_int_array_6)
+        del matmul_93
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_136 = paddle._C_ops.add(reshape_105, parameter_145)
+        del parameter_145
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_92 = [add_136, reshape_106]
+        del add_136, reshape_106
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_273, einsum_274, einsum_275 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_92, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_92
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_364,
+            split_365,
+        ) = einsum_274
+        del einsum_274
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_366,
+            split_367,
+        ) = einsum_275
+        del einsum_275
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_137 = paddle._C_ops.add(reshape_105, parameter_147)
+        del parameter_147
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_93 = [add_137, reshape_108]
+        del add_137, reshape_108
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_276, einsum_277, einsum_278 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_93, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_93
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_368,
+            split_369,
+        ) = einsum_277
+        del einsum_277
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_370,
+            split_371,
+        ) = einsum_278
+        del einsum_278
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_109 = paddle._C_ops.reshape(einsum_276, full_int_array_7)
+        del einsum_276
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_15 = paddle._C_ops.slice(
+            reshape_109, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_109
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_110 = paddle._C_ops.reshape(slice_15, full_int_array_9)
+        del slice_15
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_15 = paddle._C_ops.index_select(reshape_110, arange_2, 3)
+        del reshape_110
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_138 = paddle._C_ops.add(reshape_105, parameter_146)
+        del parameter_146, reshape_105
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_94 = [add_138, parameter_144]
+        del add_138, parameter_144
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_279, einsum_280, einsum_281 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_94, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_94
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_372,
+            split_373,
+        ) = einsum_280
+        del einsum_280
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_374,
+            split_375,
+        ) = einsum_281
+        del einsum_281
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_95 = [cast_5, einsum_279]
+        del einsum_279
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_282, einsum_283, einsum_284 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_95, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_95
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_376,
+            split_377,
+        ) = einsum_283
+        del einsum_283
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_378,
+            split_379,
+        ) = einsum_284
+        del einsum_284
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_139 = paddle._C_ops.add(einsum_273, index_select_15)
+        del einsum_273, index_select_15
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_140 = paddle._C_ops.add(add_139, einsum_282)
+        del add_139, einsum_282
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_19 = paddle._C_ops.scale(add_140, full_16, float("0"), True)
+        del add_140
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_15 = paddle._C_ops.subtract(scale_19, scale_4)
+        del scale_19
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_15 = paddle._C_ops.softmax(subtract_15, 3)
+        del subtract_15
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_124, dropout_125 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_15, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_15
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_96 = [dropout_124, reshape_107]
+        del dropout_124, reshape_107
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_285, einsum_286, einsum_287 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_96, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_96
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_380,
+            split_381,
+        ) = einsum_286
+        del einsum_286
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_382,
+            split_383,
+        ) = einsum_287
+        del einsum_287
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_111 = paddle._C_ops.reshape(einsum_285, full_int_array_10)
+        del einsum_285
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_97 = [reshape_111, parameter_149]
+        del parameter_149, reshape_111
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_288, einsum_289, einsum_290 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_97, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_97
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_384,
+            split_385,
+        ) = einsum_289
+        del einsum_289
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_386,
+            split_387,
+        ) = einsum_290
+        del einsum_290
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_126, dropout_127 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_288, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_288
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_141 = paddle._C_ops.add(dropout_126, layer_norm_87)
+        del dropout_126, layer_norm_87
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_90, layer_norm_91, layer_norm_92 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_141, parameter_143, parameter_142, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_141, parameter_142, parameter_143
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_94 = paddle._C_ops.matmul(layer_norm_90, parameter_139, False, False)
+        del parameter_139
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_142 = paddle._C_ops.add(matmul_94, parameter_138)
+        del matmul_94, parameter_138
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_15 = paddle._C_ops.relu(add_142)
+        del add_142
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_128, dropout_129 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_15, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_15
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_95 = paddle._C_ops.matmul(dropout_128, parameter_137, False, False)
+        del dropout_128, parameter_137
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_143 = paddle._C_ops.add(matmul_95, parameter_136)
+        del matmul_95, parameter_136
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_130, dropout_131 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_143, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_143
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_144 = paddle._C_ops.add(dropout_130, layer_norm_90)
+        del dropout_130, layer_norm_90
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_93, layer_norm_94, layer_norm_95 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_144, parameter_141, parameter_140, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_144, parameter_140, parameter_141
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_96 = paddle._C_ops.matmul(layer_norm_93, parameter_135, False, False)
+        del parameter_135
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_112 = paddle._C_ops.reshape(matmul_96, full_int_array_5)
+        del matmul_96
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_97 = paddle._C_ops.matmul(layer_norm_93, parameter_134, False, False)
+        del parameter_134
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_113 = paddle._C_ops.reshape(matmul_97, full_int_array_5)
+        del matmul_97
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_98 = paddle._C_ops.matmul(layer_norm_93, parameter_133, False, False)
+        del parameter_133
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_114 = paddle._C_ops.reshape(matmul_98, full_int_array_5)
+        del matmul_98
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_99 = paddle._C_ops.matmul(dropout_2, parameter_131, False, False)
+        del parameter_131
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_115 = paddle._C_ops.reshape(matmul_99, full_int_array_6)
+        del matmul_99
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_145 = paddle._C_ops.add(reshape_112, parameter_128)
+        del parameter_128
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_98 = [add_145, reshape_113]
+        del add_145, reshape_113
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_291, einsum_292, einsum_293 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_98, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_98
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_388,
+            split_389,
+        ) = einsum_292
+        del einsum_292
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_390,
+            split_391,
+        ) = einsum_293
+        del einsum_293
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_146 = paddle._C_ops.add(reshape_112, parameter_130)
+        del parameter_130
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_99 = [add_146, reshape_115]
+        del add_146, reshape_115
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_294, einsum_295, einsum_296 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_99, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_99
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_392,
+            split_393,
+        ) = einsum_295
+        del einsum_295
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_394,
+            split_395,
+        ) = einsum_296
+        del einsum_296
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_116 = paddle._C_ops.reshape(einsum_294, full_int_array_7)
+        del einsum_294
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_16 = paddle._C_ops.slice(
+            reshape_116, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_116
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_117 = paddle._C_ops.reshape(slice_16, full_int_array_9)
+        del slice_16
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_16 = paddle._C_ops.index_select(reshape_117, arange_2, 3)
+        del reshape_117
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_147 = paddle._C_ops.add(reshape_112, parameter_129)
+        del parameter_129, reshape_112
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_100 = [add_147, parameter_127]
+        del add_147, parameter_127
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_297, einsum_298, einsum_299 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_100, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_100
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_396,
+            split_397,
+        ) = einsum_298
+        del einsum_298
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_398,
+            split_399,
+        ) = einsum_299
+        del einsum_299
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_101 = [cast_5, einsum_297]
+        del einsum_297
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_300, einsum_301, einsum_302 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_101, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_101
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_400,
+            split_401,
+        ) = einsum_301
+        del einsum_301
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_402,
+            split_403,
+        ) = einsum_302
+        del einsum_302
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_148 = paddle._C_ops.add(einsum_291, index_select_16)
+        del einsum_291, index_select_16
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_149 = paddle._C_ops.add(add_148, einsum_300)
+        del add_148, einsum_300
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_20 = paddle._C_ops.scale(add_149, full_16, float("0"), True)
+        del add_149
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_16 = paddle._C_ops.subtract(scale_20, scale_4)
+        del scale_20
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_16 = paddle._C_ops.softmax(subtract_16, 3)
+        del subtract_16
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_132, dropout_133 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_16, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_16
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_102 = [dropout_132, reshape_114]
+        del dropout_132, reshape_114
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_303, einsum_304, einsum_305 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_102, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_102
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_404,
+            split_405,
+        ) = einsum_304
+        del einsum_304
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_406,
+            split_407,
+        ) = einsum_305
+        del einsum_305
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_118 = paddle._C_ops.reshape(einsum_303, full_int_array_10)
+        del einsum_303
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_103 = [reshape_118, parameter_132]
+        del parameter_132, reshape_118
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_306, einsum_307, einsum_308 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_103, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_103
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_408,
+            split_409,
+        ) = einsum_307
+        del einsum_307
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_410,
+            split_411,
+        ) = einsum_308
+        del einsum_308
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_134, dropout_135 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_306, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_306
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_150 = paddle._C_ops.add(dropout_134, layer_norm_93)
+        del dropout_134, layer_norm_93
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_96, layer_norm_97, layer_norm_98 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_150, parameter_126, parameter_125, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_150, parameter_125, parameter_126
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_100 = paddle._C_ops.matmul(layer_norm_96, parameter_122, False, False)
+        del parameter_122
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_151 = paddle._C_ops.add(matmul_100, parameter_121)
+        del matmul_100, parameter_121
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_16 = paddle._C_ops.relu(add_151)
+        del add_151
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_136, dropout_137 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_16, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_16
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_101 = paddle._C_ops.matmul(dropout_136, parameter_120, False, False)
+        del dropout_136, parameter_120
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_152 = paddle._C_ops.add(matmul_101, parameter_119)
+        del matmul_101, parameter_119
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_138, dropout_139 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_152, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_152
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_153 = paddle._C_ops.add(dropout_138, layer_norm_96)
+        del dropout_138, layer_norm_96
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_99, layer_norm_100, layer_norm_101 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_153, parameter_124, parameter_123, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_153, parameter_123, parameter_124
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_102 = paddle._C_ops.matmul(layer_norm_99, parameter_118, False, False)
+        del parameter_118
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_119 = paddle._C_ops.reshape(matmul_102, full_int_array_5)
+        del matmul_102
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_103 = paddle._C_ops.matmul(layer_norm_99, parameter_117, False, False)
+        del parameter_117
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_120 = paddle._C_ops.reshape(matmul_103, full_int_array_5)
+        del matmul_103
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_104 = paddle._C_ops.matmul(layer_norm_99, parameter_116, False, False)
+        del parameter_116
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_121 = paddle._C_ops.reshape(matmul_104, full_int_array_5)
+        del matmul_104
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_105 = paddle._C_ops.matmul(dropout_2, parameter_114, False, False)
+        del parameter_114
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_122 = paddle._C_ops.reshape(matmul_105, full_int_array_6)
+        del matmul_105
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_154 = paddle._C_ops.add(reshape_119, parameter_111)
+        del parameter_111
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_104 = [add_154, reshape_120]
+        del add_154, reshape_120
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_309, einsum_310, einsum_311 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_104, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_104
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_412,
+            split_413,
+        ) = einsum_310
+        del einsum_310
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_414,
+            split_415,
+        ) = einsum_311
+        del einsum_311
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_155 = paddle._C_ops.add(reshape_119, parameter_113)
+        del parameter_113
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_105 = [add_155, reshape_122]
+        del add_155, reshape_122
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_312, einsum_313, einsum_314 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_105, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_105
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_416,
+            split_417,
+        ) = einsum_313
+        del einsum_313
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_418,
+            split_419,
+        ) = einsum_314
+        del einsum_314
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_123 = paddle._C_ops.reshape(einsum_312, full_int_array_7)
+        del einsum_312
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_17 = paddle._C_ops.slice(
+            reshape_123, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_123
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_124 = paddle._C_ops.reshape(slice_17, full_int_array_9)
+        del slice_17
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_17 = paddle._C_ops.index_select(reshape_124, arange_2, 3)
+        del reshape_124
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_156 = paddle._C_ops.add(reshape_119, parameter_112)
+        del parameter_112, reshape_119
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_106 = [add_156, parameter_110]
+        del add_156, parameter_110
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_315, einsum_316, einsum_317 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_106, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_106
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_420,
+            split_421,
+        ) = einsum_316
+        del einsum_316
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_422,
+            split_423,
+        ) = einsum_317
+        del einsum_317
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_107 = [cast_5, einsum_315]
+        del einsum_315
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_318, einsum_319, einsum_320 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_107, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_107
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_424,
+            split_425,
+        ) = einsum_319
+        del einsum_319
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_426,
+            split_427,
+        ) = einsum_320
+        del einsum_320
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_157 = paddle._C_ops.add(einsum_309, index_select_17)
+        del einsum_309, index_select_17
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_158 = paddle._C_ops.add(add_157, einsum_318)
+        del add_157, einsum_318
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_21 = paddle._C_ops.scale(add_158, full_16, float("0"), True)
+        del add_158
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_17 = paddle._C_ops.subtract(scale_21, scale_4)
+        del scale_21
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_17 = paddle._C_ops.softmax(subtract_17, 3)
+        del subtract_17
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_140, dropout_141 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_17
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_108 = [dropout_140, reshape_121]
+        del dropout_140, reshape_121
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_321, einsum_322, einsum_323 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_108, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_108
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_428,
+            split_429,
+        ) = einsum_322
+        del einsum_322
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_430,
+            split_431,
+        ) = einsum_323
+        del einsum_323
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_125 = paddle._C_ops.reshape(einsum_321, full_int_array_10)
+        del einsum_321
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_109 = [reshape_125, parameter_115]
+        del parameter_115, reshape_125
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_324, einsum_325, einsum_326 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_109, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_109
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_432,
+            split_433,
+        ) = einsum_325
+        del einsum_325
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_434,
+            split_435,
+        ) = einsum_326
+        del einsum_326
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_142, dropout_143 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_324, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_324
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_159 = paddle._C_ops.add(dropout_142, layer_norm_99)
+        del dropout_142, layer_norm_99
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_102, layer_norm_103, layer_norm_104 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_159, parameter_109, parameter_108, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_159, parameter_108, parameter_109
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_106 = paddle._C_ops.matmul(layer_norm_102, parameter_105, False, False)
+        del parameter_105
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_160 = paddle._C_ops.add(matmul_106, parameter_104)
+        del matmul_106, parameter_104
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_17 = paddle._C_ops.relu(add_160)
+        del add_160
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_144, dropout_145 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_17
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_107 = paddle._C_ops.matmul(dropout_144, parameter_103, False, False)
+        del dropout_144, parameter_103
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_161 = paddle._C_ops.add(matmul_107, parameter_102)
+        del matmul_107, parameter_102
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_146, dropout_147 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_161, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_161
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_162 = paddle._C_ops.add(dropout_146, layer_norm_102)
+        del dropout_146, layer_norm_102
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_105, layer_norm_106, layer_norm_107 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_162, parameter_107, parameter_106, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_162, parameter_106, parameter_107
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_108 = paddle._C_ops.matmul(layer_norm_105, parameter_101, False, False)
+        del parameter_101
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_126 = paddle._C_ops.reshape(matmul_108, full_int_array_5)
+        del matmul_108
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_109 = paddle._C_ops.matmul(layer_norm_105, parameter_100, False, False)
+        del parameter_100
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_127 = paddle._C_ops.reshape(matmul_109, full_int_array_5)
+        del matmul_109
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_110 = paddle._C_ops.matmul(layer_norm_105, parameter_99, False, False)
+        del parameter_99
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_128 = paddle._C_ops.reshape(matmul_110, full_int_array_5)
+        del matmul_110
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_111 = paddle._C_ops.matmul(dropout_2, parameter_97, False, False)
+        del parameter_97
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_129 = paddle._C_ops.reshape(matmul_111, full_int_array_6)
+        del matmul_111
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_163 = paddle._C_ops.add(reshape_126, parameter_94)
+        del parameter_94
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_110 = [add_163, reshape_127]
+        del add_163, reshape_127
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_327, einsum_328, einsum_329 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_110, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_110
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_436,
+            split_437,
+        ) = einsum_328
+        del einsum_328
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_438,
+            split_439,
+        ) = einsum_329
+        del einsum_329
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_164 = paddle._C_ops.add(reshape_126, parameter_96)
+        del parameter_96
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_111 = [add_164, reshape_129]
+        del add_164, reshape_129
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_330, einsum_331, einsum_332 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_111, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_111
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_440,
+            split_441,
+        ) = einsum_331
+        del einsum_331
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_442,
+            split_443,
+        ) = einsum_332
+        del einsum_332
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_130 = paddle._C_ops.reshape(einsum_330, full_int_array_7)
+        del einsum_330
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_18 = paddle._C_ops.slice(
+            reshape_130, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_130
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_131 = paddle._C_ops.reshape(slice_18, full_int_array_9)
+        del slice_18
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_18 = paddle._C_ops.index_select(reshape_131, arange_2, 3)
+        del reshape_131
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_165 = paddle._C_ops.add(reshape_126, parameter_95)
+        del parameter_95, reshape_126
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_112 = [add_165, parameter_93]
+        del add_165, parameter_93
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_333, einsum_334, einsum_335 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_112, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_112
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_444,
+            split_445,
+        ) = einsum_334
+        del einsum_334
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_446,
+            split_447,
+        ) = einsum_335
+        del einsum_335
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_113 = [cast_5, einsum_333]
+        del einsum_333
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_336, einsum_337, einsum_338 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_113, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_113
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_448,
+            split_449,
+        ) = einsum_337
+        del einsum_337
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_450,
+            split_451,
+        ) = einsum_338
+        del einsum_338
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_166 = paddle._C_ops.add(einsum_327, index_select_18)
+        del einsum_327, index_select_18
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_167 = paddle._C_ops.add(add_166, einsum_336)
+        del add_166, einsum_336
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_22 = paddle._C_ops.scale(add_167, full_16, float("0"), True)
+        del add_167
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_18 = paddle._C_ops.subtract(scale_22, scale_4)
+        del scale_22
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_18 = paddle._C_ops.softmax(subtract_18, 3)
+        del subtract_18
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_148, dropout_149 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_18
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_114 = [dropout_148, reshape_128]
+        del dropout_148, reshape_128
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_339, einsum_340, einsum_341 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_114, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_114
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_452,
+            split_453,
+        ) = einsum_340
+        del einsum_340
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_454,
+            split_455,
+        ) = einsum_341
+        del einsum_341
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_132 = paddle._C_ops.reshape(einsum_339, full_int_array_10)
+        del einsum_339
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_115 = [reshape_132, parameter_98]
+        del parameter_98, reshape_132
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_342, einsum_343, einsum_344 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_115, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_115
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_456,
+            split_457,
+        ) = einsum_343
+        del einsum_343
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_458,
+            split_459,
+        ) = einsum_344
+        del einsum_344
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_150, dropout_151 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_342, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_342
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_168 = paddle._C_ops.add(dropout_150, layer_norm_105)
+        del dropout_150, layer_norm_105
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_108, layer_norm_109, layer_norm_110 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_168, parameter_92, parameter_91, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_168, parameter_91, parameter_92
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_112 = paddle._C_ops.matmul(layer_norm_108, parameter_88, False, False)
+        del parameter_88
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_169 = paddle._C_ops.add(matmul_112, parameter_87)
+        del matmul_112, parameter_87
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_18 = paddle._C_ops.relu(add_169)
+        del add_169
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_152, dropout_153 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_18
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_113 = paddle._C_ops.matmul(dropout_152, parameter_86, False, False)
+        del dropout_152, parameter_86
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_170 = paddle._C_ops.add(matmul_113, parameter_85)
+        del matmul_113, parameter_85
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_154, dropout_155 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_170, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_170
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_171 = paddle._C_ops.add(dropout_154, layer_norm_108)
+        del dropout_154, layer_norm_108
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_111, layer_norm_112, layer_norm_113 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_171, parameter_90, parameter_89, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_171, parameter_89, parameter_90
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_114 = paddle._C_ops.matmul(layer_norm_111, parameter_84, False, False)
+        del parameter_84
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_133 = paddle._C_ops.reshape(matmul_114, full_int_array_5)
+        del matmul_114
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_115 = paddle._C_ops.matmul(layer_norm_111, parameter_83, False, False)
+        del parameter_83
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_134 = paddle._C_ops.reshape(matmul_115, full_int_array_5)
+        del matmul_115
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_116 = paddle._C_ops.matmul(layer_norm_111, parameter_82, False, False)
+        del parameter_82
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_135 = paddle._C_ops.reshape(matmul_116, full_int_array_5)
+        del matmul_116
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_117 = paddle._C_ops.matmul(dropout_2, parameter_80, False, False)
+        del parameter_80
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_136 = paddle._C_ops.reshape(matmul_117, full_int_array_6)
+        del matmul_117
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_172 = paddle._C_ops.add(reshape_133, parameter_77)
+        del parameter_77
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_116 = [add_172, reshape_134]
+        del add_172, reshape_134
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_345, einsum_346, einsum_347 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_116, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_116
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_460,
+            split_461,
+        ) = einsum_346
+        del einsum_346
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_462,
+            split_463,
+        ) = einsum_347
+        del einsum_347
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_173 = paddle._C_ops.add(reshape_133, parameter_79)
+        del parameter_79
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_117 = [add_173, reshape_136]
+        del add_173, reshape_136
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_348, einsum_349, einsum_350 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_117, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_117
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_464,
+            split_465,
+        ) = einsum_349
+        del einsum_349
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_466,
+            split_467,
+        ) = einsum_350
+        del einsum_350
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_137 = paddle._C_ops.reshape(einsum_348, full_int_array_7)
+        del einsum_348
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_19 = paddle._C_ops.slice(
+            reshape_137, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_137
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_138 = paddle._C_ops.reshape(slice_19, full_int_array_9)
+        del slice_19
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_19 = paddle._C_ops.index_select(reshape_138, arange_2, 3)
+        del reshape_138
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_174 = paddle._C_ops.add(reshape_133, parameter_78)
+        del parameter_78, reshape_133
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_118 = [add_174, parameter_76]
+        del add_174, parameter_76
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_351, einsum_352, einsum_353 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_118, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_118
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_468,
+            split_469,
+        ) = einsum_352
+        del einsum_352
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_470,
+            split_471,
+        ) = einsum_353
+        del einsum_353
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_119 = [cast_5, einsum_351]
+        del einsum_351
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_354, einsum_355, einsum_356 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_119, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_119
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_472,
+            split_473,
+        ) = einsum_355
+        del einsum_355
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_474,
+            split_475,
+        ) = einsum_356
+        del einsum_356
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_175 = paddle._C_ops.add(einsum_345, index_select_19)
+        del einsum_345, index_select_19
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_176 = paddle._C_ops.add(add_175, einsum_354)
+        del add_175, einsum_354
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_23 = paddle._C_ops.scale(add_176, full_16, float("0"), True)
+        del add_176
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_19 = paddle._C_ops.subtract(scale_23, scale_4)
+        del scale_23
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_19 = paddle._C_ops.softmax(subtract_19, 3)
+        del subtract_19
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_156, dropout_157 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_19, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_19
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_120 = [dropout_156, reshape_135]
+        del dropout_156, reshape_135
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_357, einsum_358, einsum_359 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_120, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_120
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_476,
+            split_477,
+        ) = einsum_358
+        del einsum_358
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_478,
+            split_479,
+        ) = einsum_359
+        del einsum_359
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_139 = paddle._C_ops.reshape(einsum_357, full_int_array_10)
+        del einsum_357
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_121 = [reshape_139, parameter_81]
+        del parameter_81, reshape_139
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_360, einsum_361, einsum_362 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_121, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_121
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_480,
+            split_481,
+        ) = einsum_361
+        del einsum_361
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_482,
+            split_483,
+        ) = einsum_362
+        del einsum_362
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_158, dropout_159 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_360, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_360
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_177 = paddle._C_ops.add(dropout_158, layer_norm_111)
+        del dropout_158, layer_norm_111
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_114, layer_norm_115, layer_norm_116 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_177, parameter_75, parameter_74, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_177, parameter_74, parameter_75
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_118 = paddle._C_ops.matmul(layer_norm_114, parameter_71, False, False)
+        del parameter_71
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_178 = paddle._C_ops.add(matmul_118, parameter_70)
+        del matmul_118, parameter_70
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_19 = paddle._C_ops.relu(add_178)
+        del add_178
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_160, dropout_161 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_19, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_19
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_119 = paddle._C_ops.matmul(dropout_160, parameter_69, False, False)
+        del dropout_160, parameter_69
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_179 = paddle._C_ops.add(matmul_119, parameter_68)
+        del matmul_119, parameter_68
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_162, dropout_163 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_179, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_179
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_180 = paddle._C_ops.add(dropout_162, layer_norm_114)
+        del dropout_162, layer_norm_114
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_117, layer_norm_118, layer_norm_119 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_180, parameter_73, parameter_72, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_180, parameter_72, parameter_73
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_120 = paddle._C_ops.matmul(layer_norm_117, parameter_67, False, False)
+        del parameter_67
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_140 = paddle._C_ops.reshape(matmul_120, full_int_array_5)
+        del matmul_120
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_121 = paddle._C_ops.matmul(layer_norm_117, parameter_66, False, False)
+        del parameter_66
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_141 = paddle._C_ops.reshape(matmul_121, full_int_array_5)
+        del matmul_121
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_122 = paddle._C_ops.matmul(layer_norm_117, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_142 = paddle._C_ops.reshape(matmul_122, full_int_array_5)
+        del matmul_122
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_123 = paddle._C_ops.matmul(dropout_2, parameter_63, False, False)
+        del parameter_63
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_143 = paddle._C_ops.reshape(matmul_123, full_int_array_6)
+        del matmul_123
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_181 = paddle._C_ops.add(reshape_140, parameter_60)
+        del parameter_60
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_122 = [add_181, reshape_141]
+        del add_181, reshape_141
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_363, einsum_364, einsum_365 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_122, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_122
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_484,
+            split_485,
+        ) = einsum_364
+        del einsum_364
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_486,
+            split_487,
+        ) = einsum_365
+        del einsum_365
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_182 = paddle._C_ops.add(reshape_140, parameter_62)
+        del parameter_62
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_123 = [add_182, reshape_143]
+        del add_182, reshape_143
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_366, einsum_367, einsum_368 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_123, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_123
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_488,
+            split_489,
+        ) = einsum_367
+        del einsum_367
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_490,
+            split_491,
+        ) = einsum_368
+        del einsum_368
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_144 = paddle._C_ops.reshape(einsum_366, full_int_array_7)
+        del einsum_366
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_20 = paddle._C_ops.slice(
+            reshape_144, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_144
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_145 = paddle._C_ops.reshape(slice_20, full_int_array_9)
+        del slice_20
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_20 = paddle._C_ops.index_select(reshape_145, arange_2, 3)
+        del reshape_145
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_183 = paddle._C_ops.add(reshape_140, parameter_61)
+        del parameter_61, reshape_140
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_124 = [add_183, parameter_59]
+        del add_183, parameter_59
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_369, einsum_370, einsum_371 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_124, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_124
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_492,
+            split_493,
+        ) = einsum_370
+        del einsum_370
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_494,
+            split_495,
+        ) = einsum_371
+        del einsum_371
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_125 = [cast_5, einsum_369]
+        del einsum_369
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_372, einsum_373, einsum_374 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_125, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_125
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_496,
+            split_497,
+        ) = einsum_373
+        del einsum_373
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_498,
+            split_499,
+        ) = einsum_374
+        del einsum_374
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_184 = paddle._C_ops.add(einsum_363, index_select_20)
+        del einsum_363, index_select_20
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_185 = paddle._C_ops.add(add_184, einsum_372)
+        del add_184, einsum_372
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_24 = paddle._C_ops.scale(add_185, full_16, float("0"), True)
+        del add_185
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_20 = paddle._C_ops.subtract(scale_24, scale_4)
+        del scale_24
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_20 = paddle._C_ops.softmax(subtract_20, 3)
+        del subtract_20
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_164, dropout_165 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_20, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_20
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_126 = [dropout_164, reshape_142]
+        del dropout_164, reshape_142
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_375, einsum_376, einsum_377 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_126, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_126
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_500,
+            split_501,
+        ) = einsum_376
+        del einsum_376
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_502,
+            split_503,
+        ) = einsum_377
+        del einsum_377
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_146 = paddle._C_ops.reshape(einsum_375, full_int_array_10)
+        del einsum_375
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_127 = [reshape_146, parameter_64]
+        del parameter_64, reshape_146
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_378, einsum_379, einsum_380 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_127, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_127
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_504,
+            split_505,
+        ) = einsum_379
+        del einsum_379
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_506,
+            split_507,
+        ) = einsum_380
+        del einsum_380
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_166, dropout_167 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_378, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_378
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_186 = paddle._C_ops.add(dropout_166, layer_norm_117)
+        del dropout_166, layer_norm_117
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_120, layer_norm_121, layer_norm_122 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_186, parameter_58, parameter_57, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_186, parameter_57, parameter_58
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_124 = paddle._C_ops.matmul(layer_norm_120, parameter_54, False, False)
+        del parameter_54
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_187 = paddle._C_ops.add(matmul_124, parameter_53)
+        del matmul_124, parameter_53
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_20 = paddle._C_ops.relu(add_187)
+        del add_187
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_168, dropout_169 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_20, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_20
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_125 = paddle._C_ops.matmul(dropout_168, parameter_52, False, False)
+        del dropout_168, parameter_52
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_188 = paddle._C_ops.add(matmul_125, parameter_51)
+        del matmul_125, parameter_51
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_170, dropout_171 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_188, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_188
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_189 = paddle._C_ops.add(dropout_170, layer_norm_120)
+        del dropout_170, layer_norm_120
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_123, layer_norm_124, layer_norm_125 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_189, parameter_56, parameter_55, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_189, parameter_55, parameter_56
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_126 = paddle._C_ops.matmul(layer_norm_123, parameter_50, False, False)
+        del parameter_50
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_147 = paddle._C_ops.reshape(matmul_126, full_int_array_5)
+        del matmul_126
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_127 = paddle._C_ops.matmul(layer_norm_123, parameter_49, False, False)
+        del parameter_49
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_148 = paddle._C_ops.reshape(matmul_127, full_int_array_5)
+        del matmul_127
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_128 = paddle._C_ops.matmul(layer_norm_123, parameter_48, False, False)
+        del parameter_48
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_149 = paddle._C_ops.reshape(matmul_128, full_int_array_5)
+        del matmul_128
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_129 = paddle._C_ops.matmul(dropout_2, parameter_46, False, False)
+        del parameter_46
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_150 = paddle._C_ops.reshape(matmul_129, full_int_array_6)
+        del matmul_129
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_190 = paddle._C_ops.add(reshape_147, parameter_43)
+        del parameter_43
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_128 = [add_190, reshape_148]
+        del add_190, reshape_148
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_381, einsum_382, einsum_383 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_128, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_128
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_508,
+            split_509,
+        ) = einsum_382
+        del einsum_382
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_510,
+            split_511,
+        ) = einsum_383
+        del einsum_383
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_191 = paddle._C_ops.add(reshape_147, parameter_45)
+        del parameter_45
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_129 = [add_191, reshape_150]
+        del add_191, reshape_150
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_384, einsum_385, einsum_386 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_129, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_129
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_512,
+            split_513,
+        ) = einsum_385
+        del einsum_385
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_514,
+            split_515,
+        ) = einsum_386
+        del einsum_386
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_151 = paddle._C_ops.reshape(einsum_384, full_int_array_7)
+        del einsum_384
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_21 = paddle._C_ops.slice(
+            reshape_151, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_151
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_152 = paddle._C_ops.reshape(slice_21, full_int_array_9)
+        del slice_21
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_21 = paddle._C_ops.index_select(reshape_152, arange_2, 3)
+        del reshape_152
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_192 = paddle._C_ops.add(reshape_147, parameter_44)
+        del parameter_44, reshape_147
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_130 = [add_192, parameter_42]
+        del add_192, parameter_42
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_387, einsum_388, einsum_389 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_130, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_130
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_516,
+            split_517,
+        ) = einsum_388
+        del einsum_388
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_518,
+            split_519,
+        ) = einsum_389
+        del einsum_389
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_131 = [cast_5, einsum_387]
+        del einsum_387
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_390, einsum_391, einsum_392 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_131, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_131
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_520,
+            split_521,
+        ) = einsum_391
+        del einsum_391
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_522,
+            split_523,
+        ) = einsum_392
+        del einsum_392
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_193 = paddle._C_ops.add(einsum_381, index_select_21)
+        del einsum_381, index_select_21
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_194 = paddle._C_ops.add(add_193, einsum_390)
+        del add_193, einsum_390
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_25 = paddle._C_ops.scale(add_194, full_16, float("0"), True)
+        del add_194
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_21 = paddle._C_ops.subtract(scale_25, scale_4)
+        del scale_25
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_21 = paddle._C_ops.softmax(subtract_21, 3)
+        del subtract_21
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_172, dropout_173 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_21, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_21
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_132 = [dropout_172, reshape_149]
+        del dropout_172, reshape_149
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_393, einsum_394, einsum_395 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_132, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_132
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_524,
+            split_525,
+        ) = einsum_394
+        del einsum_394
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_526,
+            split_527,
+        ) = einsum_395
+        del einsum_395
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_153 = paddle._C_ops.reshape(einsum_393, full_int_array_10)
+        del einsum_393
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_133 = [reshape_153, parameter_47]
+        del parameter_47, reshape_153
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_396, einsum_397, einsum_398 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_133, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_133
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_528,
+            split_529,
+        ) = einsum_397
+        del einsum_397
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_530,
+            split_531,
+        ) = einsum_398
+        del einsum_398
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_174, dropout_175 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_396, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_396
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_195 = paddle._C_ops.add(dropout_174, layer_norm_123)
+        del dropout_174, layer_norm_123
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_126, layer_norm_127, layer_norm_128 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_195, parameter_41, parameter_40, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_195, parameter_40, parameter_41
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_130 = paddle._C_ops.matmul(layer_norm_126, parameter_37, False, False)
+        del parameter_37
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_196 = paddle._C_ops.add(matmul_130, parameter_36)
+        del matmul_130, parameter_36
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_21 = paddle._C_ops.relu(add_196)
+        del add_196
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_176, dropout_177 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_21, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_21
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_131 = paddle._C_ops.matmul(dropout_176, parameter_35, False, False)
+        del dropout_176, parameter_35
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_197 = paddle._C_ops.add(matmul_131, parameter_34)
+        del matmul_131, parameter_34
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_178, dropout_179 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_197, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_197
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_198 = paddle._C_ops.add(dropout_178, layer_norm_126)
+        del dropout_178, layer_norm_126
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_129, layer_norm_130, layer_norm_131 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_198, parameter_39, parameter_38, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_198, parameter_38, parameter_39
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_132 = paddle._C_ops.matmul(layer_norm_129, parameter_33, False, False)
+        del parameter_33
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_154 = paddle._C_ops.reshape(matmul_132, full_int_array_5)
+        del matmul_132
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_133 = paddle._C_ops.matmul(layer_norm_129, parameter_32, False, False)
+        del parameter_32
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_155 = paddle._C_ops.reshape(matmul_133, full_int_array_5)
+        del matmul_133
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_134 = paddle._C_ops.matmul(layer_norm_129, parameter_31, False, False)
+        del parameter_31
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_156 = paddle._C_ops.reshape(matmul_134, full_int_array_5)
+        del matmul_134
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_135 = paddle._C_ops.matmul(dropout_2, parameter_29, False, False)
+        del parameter_29
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_157 = paddle._C_ops.reshape(matmul_135, full_int_array_6)
+        del matmul_135
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_199 = paddle._C_ops.add(reshape_154, parameter_26)
+        del parameter_26
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_134 = [add_199, reshape_155]
+        del add_199, reshape_155
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_399, einsum_400, einsum_401 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_134, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_134
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_532,
+            split_533,
+        ) = einsum_400
+        del einsum_400
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_534,
+            split_535,
+        ) = einsum_401
+        del einsum_401
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_200 = paddle._C_ops.add(reshape_154, parameter_28)
+        del parameter_28
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_135 = [add_200, reshape_157]
+        del add_200, reshape_157
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_402, einsum_403, einsum_404 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_135, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_135
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_536,
+            split_537,
+        ) = einsum_403
+        del einsum_403
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_538,
+            split_539,
+        ) = einsum_404
+        del einsum_404
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_158 = paddle._C_ops.reshape(einsum_402, full_int_array_7)
+        del einsum_402
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_22 = paddle._C_ops.slice(
+            reshape_158, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_158
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_159 = paddle._C_ops.reshape(slice_22, full_int_array_9)
+        del slice_22
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_22 = paddle._C_ops.index_select(reshape_159, arange_2, 3)
+        del reshape_159
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_201 = paddle._C_ops.add(reshape_154, parameter_27)
+        del parameter_27, reshape_154
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_136 = [add_201, parameter_25]
+        del add_201, parameter_25
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_405, einsum_406, einsum_407 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_136, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_136
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_540,
+            split_541,
+        ) = einsum_406
+        del einsum_406
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_542,
+            split_543,
+        ) = einsum_407
+        del einsum_407
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_137 = [cast_5, einsum_405]
+        del einsum_405
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_408, einsum_409, einsum_410 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_137, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_137
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_544,
+            split_545,
+        ) = einsum_409
+        del einsum_409
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_546,
+            split_547,
+        ) = einsum_410
+        del einsum_410
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_202 = paddle._C_ops.add(einsum_399, index_select_22)
+        del einsum_399, index_select_22
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_203 = paddle._C_ops.add(add_202, einsum_408)
+        del add_202, einsum_408
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_26 = paddle._C_ops.scale(add_203, full_16, float("0"), True)
+        del add_203
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_22 = paddle._C_ops.subtract(scale_26, scale_4)
+        del scale_26
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_22 = paddle._C_ops.softmax(subtract_22, 3)
+        del subtract_22
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_180, dropout_181 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_22, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_22
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_138 = [dropout_180, reshape_156]
+        del dropout_180, reshape_156
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_411, einsum_412, einsum_413 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_138, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_138
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_548,
+            split_549,
+        ) = einsum_412
+        del einsum_412
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_550,
+            split_551,
+        ) = einsum_413
+        del einsum_413
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_160 = paddle._C_ops.reshape(einsum_411, full_int_array_10)
+        del einsum_411
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_139 = [reshape_160, parameter_30]
+        del parameter_30, reshape_160
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_414, einsum_415, einsum_416 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_139, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_139
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_552,
+            split_553,
+        ) = einsum_415
+        del einsum_415
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_554,
+            split_555,
+        ) = einsum_416
+        del einsum_416
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_182, dropout_183 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_414, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_414
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_204 = paddle._C_ops.add(dropout_182, layer_norm_129)
+        del dropout_182, layer_norm_129
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_132, layer_norm_133, layer_norm_134 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_204, parameter_24, parameter_23, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_204, parameter_23, parameter_24
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_136 = paddle._C_ops.matmul(layer_norm_132, parameter_20, False, False)
+        del parameter_20
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_205 = paddle._C_ops.add(matmul_136, parameter_19)
+        del matmul_136, parameter_19
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_22 = paddle._C_ops.relu(add_205)
+        del add_205
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_184, dropout_185 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_22, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_22
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_137 = paddle._C_ops.matmul(dropout_184, parameter_18, False, False)
+        del dropout_184, parameter_18
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_206 = paddle._C_ops.add(matmul_137, parameter_17)
+        del matmul_137, parameter_17
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_186, dropout_187 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_206, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_206
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_207 = paddle._C_ops.add(dropout_186, layer_norm_132)
+        del dropout_186, layer_norm_132
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_135, layer_norm_136, layer_norm_137 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_207, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_207, parameter_21, parameter_22
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_138 = paddle._C_ops.matmul(layer_norm_135, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_161 = paddle._C_ops.reshape(matmul_138, full_int_array_5)
+        del matmul_138
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_139 = paddle._C_ops.matmul(layer_norm_135, parameter_15, False, False)
+        del parameter_15
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_162 = paddle._C_ops.reshape(matmul_139, full_int_array_5)
+        del matmul_139
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x768xf32, 768x768xf32)
+        matmul_140 = paddle._C_ops.matmul(layer_norm_135, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.reshape: (9x1x12x64xf32) <- (9x1x768xf32, 4xi64)
+        reshape_163 = paddle._C_ops.reshape(matmul_140, full_int_array_5)
+        del full_int_array_5, matmul_140
+
+        # pd_op.matmul: (18x1x768xf32) <- (18x1x768xf32, 768x768xf32)
+        matmul_141 = paddle._C_ops.matmul(dropout_2, parameter_12, False, False)
+        del dropout_2, parameter_12
+
+        # pd_op.reshape: (18x1x12x64xf32) <- (18x1x768xf32, 4xi64)
+        reshape_164 = paddle._C_ops.reshape(matmul_141, full_int_array_6)
+        del full_int_array_6, matmul_141
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_208 = paddle._C_ops.add(reshape_161, parameter_9)
+        del parameter_9
+
+        # builtin.combine: ([9x1x12x64xf32, 9x1x12x64xf32]) <- (9x1x12x64xf32, 9x1x12x64xf32)
+        combine_140 = [add_208, reshape_162]
+        del add_208, reshape_162
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x1x12x64xf32, 9x1x12x64xf32]) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        einsum_417, einsum_418, einsum_419 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_140, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_140
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_556,
+            split_557,
+        ) = einsum_418
+        del einsum_418
+
+        # builtin.split: (9x1x12x64xf32, 9x1x12x64xf32) <- ([9x1x12x64xf32, 9x1x12x64xf32])
+        (
+            split_558,
+            split_559,
+        ) = einsum_419
+        del einsum_419
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_209 = paddle._C_ops.add(reshape_161, parameter_11)
+        del parameter_11
+
+        # builtin.combine: ([9x1x12x64xf32, 18x1x12x64xf32]) <- (9x1x12x64xf32, 18x1x12x64xf32)
+        combine_141 = [add_209, reshape_164]
+        del add_209, reshape_164
+
+        # pd_op.einsum: (1x12x9x18xf32, [0xf32, 0xf32], [9x1x12x64xf32, 18x1x12x64xf32]) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        einsum_420, einsum_421, einsum_422 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_141, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_141
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_560,
+            split_561,
+        ) = einsum_421
+        del einsum_421
+
+        # builtin.split: (9x1x12x64xf32, 18x1x12x64xf32) <- ([9x1x12x64xf32, 18x1x12x64xf32])
+        (
+            split_562,
+            split_563,
+        ) = einsum_422
+        del einsum_422
+
+        # pd_op.reshape: (1x12x18x9xf32) <- (1x12x9x18xf32, 4xi64)
+        reshape_165 = paddle._C_ops.reshape(einsum_420, full_int_array_7)
+        del einsum_420, full_int_array_7
+
+        # pd_op.slice: (1x12x17x9xf32) <- (1x12x18x9xf32, 1xi64, 1xi64)
+        slice_23 = paddle._C_ops.slice(
+            reshape_165, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del full_int_array_3, full_int_array_8, reshape_165
+
+        # pd_op.reshape: (1x12x9x17xf32) <- (1x12x17x9xf32, 4xi64)
+        reshape_166 = paddle._C_ops.reshape(slice_23, full_int_array_9)
+        del full_int_array_9, slice_23
+
+        # pd_op.index_select: (1x12x9x9xf32) <- (1x12x9x17xf32, 9xi64)
+        index_select_23 = paddle._C_ops.index_select(reshape_166, arange_2, 3)
+        del arange_2, reshape_166
+
+        # pd_op.add: (9x1x12x64xf32) <- (9x1x12x64xf32, 12x64xf32)
+        add_210 = paddle._C_ops.add(reshape_161, parameter_10)
+        del parameter_10, reshape_161
+
+        # builtin.combine: ([9x1x12x64xf32, 2x12x64xf32]) <- (9x1x12x64xf32, 2x12x64xf32)
+        combine_142 = [add_210, parameter_8]
+        del add_210, parameter_8
+
+        # pd_op.einsum: (9x1x12x2xf32, [0xf32, 0xf32], [9x1x12x64xf32, 2x12x64xf32]) <- ([9x1x12x64xf32, 2x12x64xf32])
+        einsum_423, einsum_424, einsum_425 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_142, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_142
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_564,
+            split_565,
+        ) = einsum_424
+        del einsum_424
+
+        # builtin.split: (9x1x12x64xf32, 2x12x64xf32) <- ([9x1x12x64xf32, 2x12x64xf32])
+        (
+            split_566,
+            split_567,
+        ) = einsum_425
+        del einsum_425
+
+        # builtin.combine: ([9x9x1x2xf32, 9x1x12x2xf32]) <- (9x9x1x2xf32, 9x1x12x2xf32)
+        combine_143 = [cast_5, einsum_423]
+        del cast_5, einsum_423
+
+        # pd_op.einsum: (1x12x9x9xf32, [0xf32, 0xf32], [9x9x1x2xf32, 9x1x12x2xf32]) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        einsum_426, einsum_427, einsum_428 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_143, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_143
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_568,
+            split_569,
+        ) = einsum_427
+        del einsum_427
+
+        # builtin.split: (9x9x1x2xf32, 9x1x12x2xf32) <- ([9x9x1x2xf32, 9x1x12x2xf32])
+        (
+            split_570,
+            split_571,
+        ) = einsum_428
+        del einsum_428
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_211 = paddle._C_ops.add(einsum_417, index_select_23)
+        del einsum_417, index_select_23
+
+        # pd_op.add: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x12x9x9xf32)
+        add_212 = paddle._C_ops.add(add_211, einsum_426)
+        del add_211, einsum_426
+
+        # pd_op.scale: (1x12x9x9xf32) <- (1x12x9x9xf32, 1xf32)
+        scale_27 = paddle._C_ops.scale(add_212, full_16, float("0"), True)
+        del add_212, full_16
+
+        # pd_op.subtract: (1x12x9x9xf32) <- (1x12x9x9xf32, 1x1x9x9xf32)
+        subtract_23 = paddle._C_ops.subtract(scale_27, scale_4)
+        del scale_27, scale_4
+
+        # pd_op.softmax: (1x12x9x9xf32) <- (1x12x9x9xf32)
+        softmax_23 = paddle._C_ops.softmax(subtract_23, 3)
+        del subtract_23
+
+        # pd_op.dropout: (1x12x9x9xf32, 1x12x9x9xui8) <- (1x12x9x9xf32, None, 1xf32)
+        dropout_188, dropout_189 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_23, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_23
+
+        # builtin.combine: ([1x12x9x9xf32, 9x1x12x64xf32]) <- (1x12x9x9xf32, 9x1x12x64xf32)
+        combine_144 = [dropout_188, reshape_163]
+        del dropout_188, reshape_163
+
+        # pd_op.einsum: (9x1x12x64xf32, [0xf32, 0xf32], [1x12x9x9xf32, 9x1x12x64xf32]) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        einsum_429, einsum_430, einsum_431 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_144, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_144
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_572,
+            split_573,
+        ) = einsum_430
+        del einsum_430
+
+        # builtin.split: (1x12x9x9xf32, 9x1x12x64xf32) <- ([1x12x9x9xf32, 9x1x12x64xf32])
+        (
+            split_574,
+            split_575,
+        ) = einsum_431
+        del einsum_431
+
+        # pd_op.reshape: (9x1x768xf32) <- (9x1x12x64xf32, 3xi64)
+        reshape_167 = paddle._C_ops.reshape(einsum_429, full_int_array_10)
+        del einsum_429, full_int_array_10
+
+        # builtin.combine: ([9x1x768xf32, 768x768xf32]) <- (9x1x768xf32, 768x768xf32)
+        combine_145 = [reshape_167, parameter_13]
+        del parameter_13, reshape_167
+
+        # pd_op.einsum: (9x1x768xf32, [0xf32, 0xf32], [9x1x768xf32, 768x768xf32]) <- ([9x1x768xf32, 768x768xf32])
+        einsum_432, einsum_433, einsum_434 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_145, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_145
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_576,
+            split_577,
+        ) = einsum_433
+        del einsum_433
+
+        # builtin.split: (9x1x768xf32, 768x768xf32) <- ([9x1x768xf32, 768x768xf32])
+        (
+            split_578,
+            split_579,
+        ) = einsum_434
+        del einsum_434
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_190, dropout_191 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_432, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_432
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_213 = paddle._C_ops.add(dropout_190, layer_norm_135)
+        del dropout_190, layer_norm_135
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_138, layer_norm_139, layer_norm_140 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_213, parameter_7, parameter_6, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_213, parameter_6, parameter_7
+
+        # pd_op.matmul: (9x1x3072xf32) <- (9x1x768xf32, 768x3072xf32)
+        matmul_142 = paddle._C_ops.matmul(layer_norm_138, parameter_3, False, False)
+        del parameter_3
+
+        # pd_op.add: (9x1x3072xf32) <- (9x1x3072xf32, 3072xf32)
+        add_214 = paddle._C_ops.add(matmul_142, parameter_2)
+        del matmul_142, parameter_2
+
+        # pd_op.relu: (9x1x3072xf32) <- (9x1x3072xf32)
+        relu_23 = paddle._C_ops.relu(add_214)
+        del add_214
+
+        # pd_op.dropout: (9x1x3072xf32, 9x1x3072xui8) <- (9x1x3072xf32, None, 1xf32)
+        dropout_192, dropout_193 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_23, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_23
+
+        # pd_op.matmul: (9x1x768xf32) <- (9x1x3072xf32, 3072x768xf32)
+        matmul_143 = paddle._C_ops.matmul(dropout_192, parameter_1, False, False)
+        del dropout_192, parameter_1
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 768xf32)
+        add_215 = paddle._C_ops.add(matmul_143, parameter_0)
+        del matmul_143, parameter_0
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_194, dropout_195 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_215, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_215
+
+        # pd_op.add: (9x1x768xf32) <- (9x1x768xf32, 9x1x768xf32)
+        add_216 = paddle._C_ops.add(dropout_194, layer_norm_138)
+        del dropout_194, layer_norm_138
+
+        # pd_op.layer_norm: (9x1x768xf32, 9x1xf32, 9x1xf32) <- (9x1x768xf32, 768xf32, 768xf32)
+        layer_norm_141, layer_norm_142, layer_norm_143 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_216, parameter_5, parameter_4, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_216, parameter_4, parameter_5
+
+        # pd_op.dropout: (9x1x768xf32, 9x1x768xui8) <- (9x1x768xf32, None, 1xf32)
+        dropout_196, dropout_197 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_141, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del full_3, layer_norm_141
+
+        # pd_op.transpose: (1x9x768xf32) <- (9x1x768xf32)
+        transpose_0 = paddle._C_ops.transpose(dropout_196, [1, 0, 2])
+        del dropout_196
+
+        return transpose_0
diff --git a/paddle_samples/PaddleNLP/chinese-xlnet-mid/weight_meta.py b/paddle_samples/PaddleNLP/chinese-xlnet-mid/weight_meta.py
new file mode 100644
index 000000000..150aa4ab9
--- /dev/null
+++ b/paddle_samples/PaddleNLP/chinese-xlnet-mid/weight_meta.py
@@ -0,0 +1,4076 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0992385")
+    max_val = float("0.102875")
+    mean = float("-2.87104e-05")
+    std = float("0.019993")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.104082")
+    max_val = float("0.0932895")
+    mean = float("-3.06962e-05")
+    std = float("0.0200036")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0685599")
+    max_val = float("0.0658963")
+    mean = float("-0.00024417")
+    std = float("0.0204489")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0651445")
+    max_val = float("0.0592371")
+    mean = float("-2.21256e-05")
+    std = float("0.019764")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0556366")
+    max_val = float("0.0566928")
+    mean = float("0.000764188")
+    std = float("0.0203508")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0525137")
+    max_val = float("0.06786")
+    mean = float("0.000539884")
+    std = float("0.0197949")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.087482")
+    max_val = float("0.0960763")
+    mean = float("2.87498e-05")
+    std = float("0.0199794")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949354")
+    max_val = float("0.0935933")
+    mean = float("-6.02944e-06")
+    std = float("0.0200094")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0916653")
+    max_val = float("0.1009")
+    mean = float("8.5081e-08")
+    std = float("0.0200007")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.103925")
+    max_val = float("0.110142")
+    mean = float("1.3423e-05")
+    std = float("0.0200011")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0951989")
+    max_val = float("0.0954426")
+    mean = float("1.36197e-05")
+    std = float("0.0200253")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0996241")
+    max_val = float("0.102408")
+    mean = float("7.37253e-06")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0932743")
+    max_val = float("0.0952639")
+    mean = float("-5.89792e-06")
+    std = float("0.0199952")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0613574")
+    max_val = float("0.0706353")
+    mean = float("0.000535646")
+    std = float("0.0201446")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0598611")
+    max_val = float("0.0500983")
+    mean = float("-0.00102849")
+    std = float("0.0188322")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0727003")
+    max_val = float("0.0524254")
+    mean = float("-1.29274e-05")
+    std = float("0.0203901")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0786873")
+    max_val = float("0.0639895")
+    mean = float("-0.000650006")
+    std = float("0.0200197")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0899222")
+    max_val = float("0.102717")
+    mean = float("-1.20835e-05")
+    std = float("0.0199781")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949745")
+    max_val = float("0.0943316")
+    mean = float("1.70752e-05")
+    std = float("0.0199991")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.091913")
+    max_val = float("0.0885693")
+    mean = float("1.49961e-05")
+    std = float("0.0200166")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.10528")
+    max_val = float("0.101579")
+    mean = float("-4.47538e-05")
+    std = float("0.0199851")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0964461")
+    max_val = float("0.0988549")
+    mean = float("-4.61212e-05")
+    std = float("0.0199858")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.104177")
+    max_val = float("0.100672")
+    mean = float("1.29213e-05")
+    std = float("0.0200155")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.103687")
+    max_val = float("0.0960466")
+    mean = float("-3.52816e-06")
+    std = float("0.0200105")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0791351")
+    max_val = float("0.0603129")
+    mean = float("0.000312459")
+    std = float("0.0208899")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0616493")
+    max_val = float("0.074826")
+    mean = float("0.00114984")
+    std = float("0.0216077")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0519985")
+    max_val = float("0.0661832")
+    mean = float("-0.0010894")
+    std = float("0.0196826")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0660265")
+    max_val = float("0.0629053")
+    mean = float("0.000248182")
+    std = float("0.0198376")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0989029")
+    max_val = float("0.0909414")
+    mean = float("9.17577e-06")
+    std = float("0.0199848")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0923946")
+    max_val = float("0.0960927")
+    mean = float("-3.97571e-05")
+    std = float("0.0200341")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.104238")
+    max_val = float("0.0966158")
+    mean = float("2.18959e-05")
+    std = float("0.0200147")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.095079")
+    max_val = float("0.0929343")
+    mean = float("5.30684e-05")
+    std = float("0.0200075")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.094298")
+    max_val = float("0.100605")
+    mean = float("9.54803e-06")
+    std = float("0.0199927")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0964315")
+    max_val = float("0.0995373")
+    mean = float("-1.751e-05")
+    std = float("0.0200114")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0963421")
+    max_val = float("0.104976")
+    mean = float("-1.665e-05")
+    std = float("0.0199913")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0641725")
+    max_val = float("0.0668724")
+    mean = float("-0.000221256")
+    std = float("0.0201518")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.058031")
+    max_val = float("0.0713647")
+    mean = float("0.000548997")
+    std = float("0.0198842")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0533915")
+    max_val = float("0.0730677")
+    mean = float("-0.0011631")
+    std = float("0.0215013")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0613344")
+    max_val = float("0.0760659")
+    mean = float("-0.0004222")
+    std = float("0.0204239")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101617")
+    max_val = float("0.0938434")
+    mean = float("2.8609e-05")
+    std = float("0.0199775")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0954409")
+    max_val = float("0.091044")
+    mean = float("1.51252e-05")
+    std = float("0.0199946")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0960642")
+    max_val = float("0.0992712")
+    mean = float("-3.47097e-05")
+    std = float("0.0200068")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0978587")
+    max_val = float("0.0919599")
+    mean = float("-3.31101e-06")
+    std = float("0.0199954")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0969195")
+    max_val = float("0.100116")
+    mean = float("-9.60172e-06")
+    std = float("0.0200226")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0996224")
+    max_val = float("0.104063")
+    mean = float("-8.36708e-06")
+    std = float("0.0199974")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.100492")
+    max_val = float("0.110091")
+    mean = float("-2.8787e-05")
+    std = float("0.0200235")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0772095")
+    max_val = float("0.0721144")
+    mean = float("0.000643863")
+    std = float("0.0204347")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0637782")
+    max_val = float("0.0591677")
+    mean = float("0.000285496")
+    std = float("0.0197946")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0705864")
+    max_val = float("0.0700757")
+    mean = float("0.000367466")
+    std = float("0.021182")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0582186")
+    max_val = float("0.0636393")
+    mean = float("0.000220073")
+    std = float("0.0199499")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0923404")
+    max_val = float("0.0956992")
+    mean = float("3.59996e-05")
+    std = float("0.0199696")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0900342")
+    max_val = float("0.0959493")
+    mean = float("-3.62642e-06")
+    std = float("0.020046")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100684")
+    max_val = float("0.0980237")
+    mean = float("2.45752e-05")
+    std = float("0.0199999")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0884012")
+    max_val = float("0.101642")
+    mean = float("1.70642e-06")
+    std = float("0.0200474")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0967449")
+    max_val = float("0.0925632")
+    mean = float("-3.19217e-05")
+    std = float("0.0199837")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0980523")
+    max_val = float("0.0972812")
+    mean = float("1.10406e-06")
+    std = float("0.0199914")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.102435")
+    max_val = float("0.092993")
+    mean = float("1.72811e-05")
+    std = float("0.0199927")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0604049")
+    max_val = float("0.066894")
+    mean = float("0.000595404")
+    std = float("0.0204146")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0614719")
+    max_val = float("0.0783159")
+    mean = float("6.12256e-05")
+    std = float("0.0208108")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0741644")
+    max_val = float("0.067392")
+    mean = float("0.00107734")
+    std = float("0.0202505")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0629797")
+    max_val = float("0.062391")
+    mean = float("0.000485195")
+    std = float("0.0197896")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.088169")
+    max_val = float("0.102024")
+    mean = float("3.27016e-05")
+    std = float("0.0199833")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0988853")
+    max_val = float("0.0935851")
+    mean = float("-1.99768e-05")
+    std = float("0.0200103")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0930175")
+    max_val = float("0.0980104")
+    mean = float("8.8307e-06")
+    std = float("0.0199685")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101789")
+    max_val = float("0.0885144")
+    mean = float("-2.49534e-05")
+    std = float("0.0200085")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0991208")
+    max_val = float("0.0927385")
+    mean = float("-1.51644e-05")
+    std = float("0.0200208")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.106835")
+    max_val = float("0.0931437")
+    mean = float("-1.74689e-05")
+    std = float("0.0200132")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.105098")
+    max_val = float("0.102486")
+    mean = float("2.18372e-05")
+    std = float("0.0200075")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0547854")
+    max_val = float("0.0617458")
+    mean = float("-0.000391915")
+    std = float("0.0198446")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0608988")
+    max_val = float("0.0550923")
+    mean = float("-0.000148594")
+    std = float("0.019672")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0556389")
+    max_val = float("0.0547488")
+    mean = float("-0.000872696")
+    std = float("0.0192708")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0621615")
+    max_val = float("0.0675371")
+    mean = float("0.00106426")
+    std = float("0.0195946")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0986293")
+    max_val = float("0.0882185")
+    mean = float("4.02172e-05")
+    std = float("0.0200042")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0916852")
+    max_val = float("0.107538")
+    mean = float("8.26962e-07")
+    std = float("0.0199815")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0972519")
+    max_val = float("0.0972278")
+    mean = float("7.19291e-06")
+    std = float("0.019994")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0909221")
+    max_val = float("0.0950249")
+    mean = float("2.08483e-05")
+    std = float("0.0199805")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100268")
+    max_val = float("0.0911824")
+    mean = float("5.09453e-07")
+    std = float("0.0199752")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0976225")
+    max_val = float("0.102656")
+    mean = float("-1.5584e-06")
+    std = float("0.0200065")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0999746")
+    max_val = float("0.108195")
+    mean = float("1.10189e-05")
+    std = float("0.0200027")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0665081")
+    max_val = float("0.0582528")
+    mean = float("0.000132215")
+    std = float("0.0197637")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.07594")
+    max_val = float("0.0596196")
+    mean = float("0.000565413")
+    std = float("0.0200123")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0611587")
+    max_val = float("0.0609616")
+    mean = float("0.00161991")
+    std = float("0.0208715")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0701204")
+    max_val = float("0.0499476")
+    mean = float("-0.00112387")
+    std = float("0.0196437")
+    data = None
+
+
+class Program_weight_tensor_parameter_131:
+    name = "parameter_131"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.103737")
+    max_val = float("0.092225")
+    mean = float("1.58704e-05")
+    std = float("0.0199673")
+    data = None
+
+
+class Program_weight_tensor_parameter_132:
+    name = "parameter_132"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0978492")
+    max_val = float("0.11001")
+    mean = float("-1.83536e-05")
+    std = float("0.0200326")
+    data = None
+
+
+class Program_weight_tensor_parameter_133:
+    name = "parameter_133"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0957246")
+    max_val = float("0.0955106")
+    mean = float("-7.83377e-06")
+    std = float("0.0200067")
+    data = None
+
+
+class Program_weight_tensor_parameter_134:
+    name = "parameter_134"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101451")
+    max_val = float("0.0968324")
+    mean = float("7.90246e-06")
+    std = float("0.0200042")
+    data = None
+
+
+class Program_weight_tensor_parameter_135:
+    name = "parameter_135"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0982089")
+    max_val = float("0.0935521")
+    mean = float("-1.45475e-05")
+    std = float("0.020008")
+    data = None
+
+
+class Program_weight_tensor_parameter_136:
+    name = "parameter_136"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_137:
+    name = "parameter_137"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0976562")
+    max_val = float("0.103721")
+    mean = float("5.02149e-06")
+    std = float("0.019994")
+    data = None
+
+
+class Program_weight_tensor_parameter_138:
+    name = "parameter_138"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_139:
+    name = "parameter_139"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0998944")
+    max_val = float("0.0967743")
+    mean = float("-1.02369e-05")
+    std = float("0.0200073")
+    data = None
+
+
+class Program_weight_tensor_parameter_140:
+    name = "parameter_140"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_141:
+    name = "parameter_141"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_142:
+    name = "parameter_142"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_143:
+    name = "parameter_143"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_144:
+    name = "parameter_144"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0688756")
+    max_val = float("0.0572315")
+    mean = float("0.000214926")
+    std = float("0.0201615")
+    data = None
+
+
+class Program_weight_tensor_parameter_145:
+    name = "parameter_145"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0580335")
+    max_val = float("0.0685004")
+    mean = float("-0.000466845")
+    std = float("0.0198299")
+    data = None
+
+
+class Program_weight_tensor_parameter_146:
+    name = "parameter_146"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0569562")
+    max_val = float("0.0720309")
+    mean = float("-0.000175533")
+    std = float("0.0193007")
+    data = None
+
+
+class Program_weight_tensor_parameter_147:
+    name = "parameter_147"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0580931")
+    max_val = float("0.0665461")
+    mean = float("0.000556311")
+    std = float("0.0207681")
+    data = None
+
+
+class Program_weight_tensor_parameter_148:
+    name = "parameter_148"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0911821")
+    max_val = float("0.0995433")
+    mean = float("-3.68664e-05")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_149:
+    name = "parameter_149"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0916099")
+    max_val = float("0.092629")
+    mean = float("-6.98546e-05")
+    std = float("0.0199792")
+    data = None
+
+
+class Program_weight_tensor_parameter_150:
+    name = "parameter_150"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0930813")
+    max_val = float("0.0881732")
+    mean = float("-1.58426e-05")
+    std = float("0.0199799")
+    data = None
+
+
+class Program_weight_tensor_parameter_151:
+    name = "parameter_151"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0982105")
+    max_val = float("0.100704")
+    mean = float("-1.72037e-05")
+    std = float("0.0199786")
+    data = None
+
+
+class Program_weight_tensor_parameter_152:
+    name = "parameter_152"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0915965")
+    max_val = float("0.0910249")
+    mean = float("-1.76717e-05")
+    std = float("0.019996")
+    data = None
+
+
+class Program_weight_tensor_parameter_153:
+    name = "parameter_153"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_154:
+    name = "parameter_154"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0986691")
+    max_val = float("0.0982143")
+    mean = float("-5.27154e-06")
+    std = float("0.0200064")
+    data = None
+
+
+class Program_weight_tensor_parameter_155:
+    name = "parameter_155"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_156:
+    name = "parameter_156"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0996236")
+    max_val = float("0.104955")
+    mean = float("-5.87859e-06")
+    std = float("0.0200043")
+    data = None
+
+
+class Program_weight_tensor_parameter_157:
+    name = "parameter_157"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_158:
+    name = "parameter_158"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_159:
+    name = "parameter_159"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_160:
+    name = "parameter_160"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_161:
+    name = "parameter_161"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0555636")
+    max_val = float("0.0782154")
+    mean = float("0.000150856")
+    std = float("0.0193841")
+    data = None
+
+
+class Program_weight_tensor_parameter_162:
+    name = "parameter_162"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.071413")
+    max_val = float("0.0603286")
+    mean = float("0.000112784")
+    std = float("0.020672")
+    data = None
+
+
+class Program_weight_tensor_parameter_163:
+    name = "parameter_163"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0584897")
+    max_val = float("0.0601304")
+    mean = float("-0.000590898")
+    std = float("0.0200303")
+    data = None
+
+
+class Program_weight_tensor_parameter_164:
+    name = "parameter_164"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0555021")
+    max_val = float("0.0644254")
+    mean = float("3.81104e-05")
+    std = float("0.0193433")
+    data = None
+
+
+class Program_weight_tensor_parameter_165:
+    name = "parameter_165"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.11498")
+    max_val = float("0.0938218")
+    mean = float("-3.18497e-05")
+    std = float("0.0199834")
+    data = None
+
+
+class Program_weight_tensor_parameter_166:
+    name = "parameter_166"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0924328")
+    max_val = float("0.0910954")
+    mean = float("2.18907e-05")
+    std = float("0.0199983")
+    data = None
+
+
+class Program_weight_tensor_parameter_167:
+    name = "parameter_167"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949582")
+    max_val = float("0.090847")
+    mean = float("2.31445e-05")
+    std = float("0.0199713")
+    data = None
+
+
+class Program_weight_tensor_parameter_168:
+    name = "parameter_168"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0951275")
+    max_val = float("0.0979711")
+    mean = float("-3.66977e-06")
+    std = float("0.0200251")
+    data = None
+
+
+class Program_weight_tensor_parameter_169:
+    name = "parameter_169"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.10068")
+    max_val = float("0.0961136")
+    mean = float("-1.67871e-05")
+    std = float("0.0199725")
+    data = None
+
+
+class Program_weight_tensor_parameter_170:
+    name = "parameter_170"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_171:
+    name = "parameter_171"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.10016")
+    max_val = float("0.105534")
+    mean = float("-1.36326e-06")
+    std = float("0.0200011")
+    data = None
+
+
+class Program_weight_tensor_parameter_172:
+    name = "parameter_172"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_173:
+    name = "parameter_173"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.10346")
+    max_val = float("0.10523")
+    mean = float("-7.98683e-06")
+    std = float("0.0199884")
+    data = None
+
+
+class Program_weight_tensor_parameter_174:
+    name = "parameter_174"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_175:
+    name = "parameter_175"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_176:
+    name = "parameter_176"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_177:
+    name = "parameter_177"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_178:
+    name = "parameter_178"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0911722")
+    max_val = float("0.0610926")
+    mean = float("-0.0007564")
+    std = float("0.0203989")
+    data = None
+
+
+class Program_weight_tensor_parameter_179:
+    name = "parameter_179"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0605921")
+    max_val = float("0.0597681")
+    mean = float("-0.00103852")
+    std = float("0.0196262")
+    data = None
+
+
+class Program_weight_tensor_parameter_180:
+    name = "parameter_180"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0705175")
+    max_val = float("0.0598262")
+    mean = float("0.000798028")
+    std = float("0.0199337")
+    data = None
+
+
+class Program_weight_tensor_parameter_181:
+    name = "parameter_181"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.087768")
+    max_val = float("0.0571883")
+    mean = float("-2.54134e-05")
+    std = float("0.0206943")
+    data = None
+
+
+class Program_weight_tensor_parameter_182:
+    name = "parameter_182"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0866027")
+    max_val = float("0.0895184")
+    mean = float("-1.29514e-05")
+    std = float("0.0200018")
+    data = None
+
+
+class Program_weight_tensor_parameter_183:
+    name = "parameter_183"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0960835")
+    max_val = float("0.0997401")
+    mean = float("-3.28011e-05")
+    std = float("0.0200195")
+    data = None
+
+
+class Program_weight_tensor_parameter_184:
+    name = "parameter_184"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0935858")
+    max_val = float("0.096516")
+    mean = float("-2.14898e-05")
+    std = float("0.0200093")
+    data = None
+
+
+class Program_weight_tensor_parameter_185:
+    name = "parameter_185"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0941115")
+    max_val = float("0.093538")
+    mean = float("1.04981e-05")
+    std = float("0.0200066")
+    data = None
+
+
+class Program_weight_tensor_parameter_186:
+    name = "parameter_186"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0968532")
+    max_val = float("0.100836")
+    mean = float("-2.9778e-06")
+    std = float("0.0199773")
+    data = None
+
+
+class Program_weight_tensor_parameter_187:
+    name = "parameter_187"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_188:
+    name = "parameter_188"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0981239")
+    max_val = float("0.0954288")
+    mean = float("-3.92532e-06")
+    std = float("0.0200006")
+    data = None
+
+
+class Program_weight_tensor_parameter_189:
+    name = "parameter_189"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_190:
+    name = "parameter_190"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.101791")
+    max_val = float("0.0980343")
+    mean = float("1.28602e-06")
+    std = float("0.0200032")
+    data = None
+
+
+class Program_weight_tensor_parameter_191:
+    name = "parameter_191"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_192:
+    name = "parameter_192"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_193:
+    name = "parameter_193"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_194:
+    name = "parameter_194"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_195:
+    name = "parameter_195"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.065808")
+    max_val = float("0.0693584")
+    mean = float("0.000553734")
+    std = float("0.0205792")
+    data = None
+
+
+class Program_weight_tensor_parameter_196:
+    name = "parameter_196"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.066239")
+    max_val = float("0.0623056")
+    mean = float("-0.000268497")
+    std = float("0.0201592")
+    data = None
+
+
+class Program_weight_tensor_parameter_197:
+    name = "parameter_197"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0632628")
+    max_val = float("0.0560296")
+    mean = float("0.000155703")
+    std = float("0.0197991")
+    data = None
+
+
+class Program_weight_tensor_parameter_198:
+    name = "parameter_198"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0657599")
+    max_val = float("0.0579888")
+    mean = float("0.000647882")
+    std = float("0.0196466")
+    data = None
+
+
+class Program_weight_tensor_parameter_199:
+    name = "parameter_199"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0975053")
+    max_val = float("0.105776")
+    mean = float("1.88585e-06")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_200:
+    name = "parameter_200"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0982419")
+    max_val = float("0.0904129")
+    mean = float("3.3203e-07")
+    std = float("0.019978")
+    data = None
+
+
+class Program_weight_tensor_parameter_201:
+    name = "parameter_201"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0888827")
+    max_val = float("0.0927375")
+    mean = float("4.33929e-06")
+    std = float("0.0200373")
+    data = None
+
+
+class Program_weight_tensor_parameter_202:
+    name = "parameter_202"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0893663")
+    max_val = float("0.0907593")
+    mean = float("-3.60806e-05")
+    std = float("0.0199947")
+    data = None
+
+
+class Program_weight_tensor_parameter_203:
+    name = "parameter_203"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102359")
+    max_val = float("0.0893301")
+    mean = float("-2.53916e-05")
+    std = float("0.0199954")
+    data = None
+
+
+class Program_weight_tensor_parameter_204:
+    name = "parameter_204"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_205:
+    name = "parameter_205"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0988273")
+    max_val = float("0.100349")
+    mean = float("3.3034e-06")
+    std = float("0.0199979")
+    data = None
+
+
+class Program_weight_tensor_parameter_206:
+    name = "parameter_206"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_207:
+    name = "parameter_207"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0940568")
+    max_val = float("0.0932684")
+    mean = float("-1.35575e-05")
+    std = float("0.0199953")
+    data = None
+
+
+class Program_weight_tensor_parameter_208:
+    name = "parameter_208"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_209:
+    name = "parameter_209"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_210:
+    name = "parameter_210"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_211:
+    name = "parameter_211"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_212:
+    name = "parameter_212"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0678443")
+    max_val = float("0.0617379")
+    mean = float("-0.000463799")
+    std = float("0.0197932")
+    data = None
+
+
+class Program_weight_tensor_parameter_213:
+    name = "parameter_213"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0669562")
+    max_val = float("0.0779093")
+    mean = float("0.000177708")
+    std = float("0.0197129")
+    data = None
+
+
+class Program_weight_tensor_parameter_214:
+    name = "parameter_214"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0635541")
+    max_val = float("0.055896")
+    mean = float("0.000457738")
+    std = float("0.0206174")
+    data = None
+
+
+class Program_weight_tensor_parameter_215:
+    name = "parameter_215"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0667683")
+    max_val = float("0.0568582")
+    mean = float("0.00164259")
+    std = float("0.0200097")
+    data = None
+
+
+class Program_weight_tensor_parameter_216:
+    name = "parameter_216"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0894739")
+    max_val = float("0.0899545")
+    mean = float("1.34708e-05")
+    std = float("0.0200055")
+    data = None
+
+
+class Program_weight_tensor_parameter_217:
+    name = "parameter_217"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0920965")
+    max_val = float("0.0973643")
+    mean = float("-2.36284e-05")
+    std = float("0.0199965")
+    data = None
+
+
+class Program_weight_tensor_parameter_218:
+    name = "parameter_218"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0883198")
+    max_val = float("0.09633")
+    mean = float("-2.10673e-05")
+    std = float("0.0199832")
+    data = None
+
+
+class Program_weight_tensor_parameter_219:
+    name = "parameter_219"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0941603")
+    max_val = float("0.101693")
+    mean = float("-1.7868e-05")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_220:
+    name = "parameter_220"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.098849")
+    max_val = float("0.100235")
+    mean = float("6.90275e-06")
+    std = float("0.0200029")
+    data = None
+
+
+class Program_weight_tensor_parameter_221:
+    name = "parameter_221"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_222:
+    name = "parameter_222"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0939877")
+    max_val = float("0.100145")
+    mean = float("4.96509e-06")
+    std = float("0.0200054")
+    data = None
+
+
+class Program_weight_tensor_parameter_223:
+    name = "parameter_223"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_224:
+    name = "parameter_224"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0976788")
+    max_val = float("0.0981202")
+    mean = float("2.95791e-06")
+    std = float("0.0199983")
+    data = None
+
+
+class Program_weight_tensor_parameter_225:
+    name = "parameter_225"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_226:
+    name = "parameter_226"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_227:
+    name = "parameter_227"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_228:
+    name = "parameter_228"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_229:
+    name = "parameter_229"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0716485")
+    max_val = float("0.0704555")
+    mean = float("-0.000306408")
+    std = float("0.0195271")
+    data = None
+
+
+class Program_weight_tensor_parameter_230:
+    name = "parameter_230"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0631485")
+    max_val = float("0.0608671")
+    mean = float("-0.000783707")
+    std = float("0.020623")
+    data = None
+
+
+class Program_weight_tensor_parameter_231:
+    name = "parameter_231"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0612836")
+    max_val = float("0.0664824")
+    mean = float("0.00169708")
+    std = float("0.0201459")
+    data = None
+
+
+class Program_weight_tensor_parameter_232:
+    name = "parameter_232"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0617977")
+    max_val = float("0.0757013")
+    mean = float("-0.000942041")
+    std = float("0.020341")
+    data = None
+
+
+class Program_weight_tensor_parameter_233:
+    name = "parameter_233"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0959046")
+    max_val = float("0.0905154")
+    mean = float("1.3832e-06")
+    std = float("0.0199986")
+    data = None
+
+
+class Program_weight_tensor_parameter_234:
+    name = "parameter_234"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0897725")
+    max_val = float("0.0903606")
+    mean = float("4.65935e-05")
+    std = float("0.0200113")
+    data = None
+
+
+class Program_weight_tensor_parameter_235:
+    name = "parameter_235"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0944441")
+    max_val = float("0.0876855")
+    mean = float("-6.82516e-06")
+    std = float("0.0199964")
+    data = None
+
+
+class Program_weight_tensor_parameter_236:
+    name = "parameter_236"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0988587")
+    max_val = float("0.0928436")
+    mean = float("-1.03861e-05")
+    std = float("0.0199971")
+    data = None
+
+
+class Program_weight_tensor_parameter_237:
+    name = "parameter_237"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0938641")
+    max_val = float("0.0884428")
+    mean = float("-7.12503e-05")
+    std = float("0.020003")
+    data = None
+
+
+class Program_weight_tensor_parameter_238:
+    name = "parameter_238"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_239:
+    name = "parameter_239"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.104088")
+    max_val = float("0.100674")
+    mean = float("-1.2354e-05")
+    std = float("0.0200057")
+    data = None
+
+
+class Program_weight_tensor_parameter_240:
+    name = "parameter_240"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_241:
+    name = "parameter_241"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0940645")
+    max_val = float("0.0992474")
+    mean = float("-5.84661e-06")
+    std = float("0.0200021")
+    data = None
+
+
+class Program_weight_tensor_parameter_242:
+    name = "parameter_242"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_243:
+    name = "parameter_243"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_244:
+    name = "parameter_244"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_245:
+    name = "parameter_245"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_246:
+    name = "parameter_246"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0733826")
+    max_val = float("0.079843")
+    mean = float("0.000584072")
+    std = float("0.0198898")
+    data = None
+
+
+class Program_weight_tensor_parameter_247:
+    name = "parameter_247"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0745035")
+    max_val = float("0.0678794")
+    mean = float("-0.000124665")
+    std = float("0.0214412")
+    data = None
+
+
+class Program_weight_tensor_parameter_248:
+    name = "parameter_248"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0676443")
+    max_val = float("0.0639757")
+    mean = float("6.72338e-05")
+    std = float("0.0196634")
+    data = None
+
+
+class Program_weight_tensor_parameter_249:
+    name = "parameter_249"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0600977")
+    max_val = float("0.0674271")
+    mean = float("-0.00155526")
+    std = float("0.0202025")
+    data = None
+
+
+class Program_weight_tensor_parameter_250:
+    name = "parameter_250"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0895744")
+    max_val = float("0.0918048")
+    mean = float("1.12035e-05")
+    std = float("0.020037")
+    data = None
+
+
+class Program_weight_tensor_parameter_251:
+    name = "parameter_251"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0942934")
+    max_val = float("0.0938409")
+    mean = float("-1.89081e-05")
+    std = float("0.0199621")
+    data = None
+
+
+class Program_weight_tensor_parameter_252:
+    name = "parameter_252"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0951055")
+    max_val = float("0.0909101")
+    mean = float("-1.28031e-05")
+    std = float("0.0200067")
+    data = None
+
+
+class Program_weight_tensor_parameter_253:
+    name = "parameter_253"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0920752")
+    max_val = float("0.0936272")
+    mean = float("-2.56825e-05")
+    std = float("0.020006")
+    data = None
+
+
+class Program_weight_tensor_parameter_254:
+    name = "parameter_254"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0959548")
+    max_val = float("0.104327")
+    mean = float("3.32833e-05")
+    std = float("0.0199962")
+    data = None
+
+
+class Program_weight_tensor_parameter_255:
+    name = "parameter_255"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_256:
+    name = "parameter_256"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.111898")
+    max_val = float("0.0971337")
+    mean = float("-2.79149e-06")
+    std = float("0.0199909")
+    data = None
+
+
+class Program_weight_tensor_parameter_257:
+    name = "parameter_257"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_258:
+    name = "parameter_258"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.102543")
+    max_val = float("0.103272")
+    mean = float("-6.29099e-06")
+    std = float("0.0199911")
+    data = None
+
+
+class Program_weight_tensor_parameter_259:
+    name = "parameter_259"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_260:
+    name = "parameter_260"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_261:
+    name = "parameter_261"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_262:
+    name = "parameter_262"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_263:
+    name = "parameter_263"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0662811")
+    max_val = float("0.0793151")
+    mean = float("0.000483764")
+    std = float("0.0204926")
+    data = None
+
+
+class Program_weight_tensor_parameter_264:
+    name = "parameter_264"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0720237")
+    max_val = float("0.052449")
+    mean = float("-0.000461651")
+    std = float("0.0193515")
+    data = None
+
+
+class Program_weight_tensor_parameter_265:
+    name = "parameter_265"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0613431")
+    max_val = float("0.0557613")
+    mean = float("-0.000658027")
+    std = float("0.0211547")
+    data = None
+
+
+class Program_weight_tensor_parameter_266:
+    name = "parameter_266"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0708073")
+    max_val = float("0.0662909")
+    mean = float("0.000246101")
+    std = float("0.0199279")
+    data = None
+
+
+class Program_weight_tensor_parameter_267:
+    name = "parameter_267"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0930493")
+    max_val = float("0.089579")
+    mean = float("1.37958e-05")
+    std = float("0.0199762")
+    data = None
+
+
+class Program_weight_tensor_parameter_268:
+    name = "parameter_268"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0937447")
+    max_val = float("0.0937204")
+    mean = float("-6.0748e-06")
+    std = float("0.0200273")
+    data = None
+
+
+class Program_weight_tensor_parameter_269:
+    name = "parameter_269"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0956679")
+    max_val = float("0.0934901")
+    mean = float("-1.70272e-05")
+    std = float("0.0200205")
+    data = None
+
+
+class Program_weight_tensor_parameter_270:
+    name = "parameter_270"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100066")
+    max_val = float("0.0950881")
+    mean = float("2.65998e-05")
+    std = float("0.020013")
+    data = None
+
+
+class Program_weight_tensor_parameter_271:
+    name = "parameter_271"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0982124")
+    max_val = float("0.0920513")
+    mean = float("-3.49801e-05")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_272:
+    name = "parameter_272"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_273:
+    name = "parameter_273"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.104792")
+    max_val = float("0.104386")
+    mean = float("-4.51588e-06")
+    std = float("0.0199906")
+    data = None
+
+
+class Program_weight_tensor_parameter_274:
+    name = "parameter_274"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_275:
+    name = "parameter_275"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0990749")
+    max_val = float("0.101253")
+    mean = float("1.19078e-05")
+    std = float("0.0199982")
+    data = None
+
+
+class Program_weight_tensor_parameter_276:
+    name = "parameter_276"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_277:
+    name = "parameter_277"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_278:
+    name = "parameter_278"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_279:
+    name = "parameter_279"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_280:
+    name = "parameter_280"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.068423")
+    max_val = float("0.0730132")
+    mean = float("-0.000575068")
+    std = float("0.019558")
+    data = None
+
+
+class Program_weight_tensor_parameter_281:
+    name = "parameter_281"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0573987")
+    max_val = float("0.0538829")
+    mean = float("-0.00022042")
+    std = float("0.0199234")
+    data = None
+
+
+class Program_weight_tensor_parameter_282:
+    name = "parameter_282"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0614962")
+    max_val = float("0.0730006")
+    mean = float("-0.00100148")
+    std = float("0.0199159")
+    data = None
+
+
+class Program_weight_tensor_parameter_283:
+    name = "parameter_283"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0621623")
+    max_val = float("0.0578604")
+    mean = float("0.00118948")
+    std = float("0.0205245")
+    data = None
+
+
+class Program_weight_tensor_parameter_284:
+    name = "parameter_284"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0905907")
+    max_val = float("0.0886716")
+    mean = float("-1.19075e-05")
+    std = float("0.0199876")
+    data = None
+
+
+class Program_weight_tensor_parameter_285:
+    name = "parameter_285"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0940388")
+    max_val = float("0.0943725")
+    mean = float("-6.69905e-05")
+    std = float("0.0199879")
+    data = None
+
+
+class Program_weight_tensor_parameter_286:
+    name = "parameter_286"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0986769")
+    max_val = float("0.0952609")
+    mean = float("-6.63148e-06")
+    std = float("0.0199987")
+    data = None
+
+
+class Program_weight_tensor_parameter_287:
+    name = "parameter_287"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0968987")
+    max_val = float("0.0907544")
+    mean = float("-4.93519e-06")
+    std = float("0.0199667")
+    data = None
+
+
+class Program_weight_tensor_parameter_288:
+    name = "parameter_288"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101947")
+    max_val = float("0.0935621")
+    mean = float("5.38941e-05")
+    std = float("0.0200096")
+    data = None
+
+
+class Program_weight_tensor_parameter_289:
+    name = "parameter_289"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_290:
+    name = "parameter_290"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.116381")
+    max_val = float("0.0972943")
+    mean = float("-1.31732e-05")
+    std = float("0.0200032")
+    data = None
+
+
+class Program_weight_tensor_parameter_291:
+    name = "parameter_291"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_292:
+    name = "parameter_292"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0968845")
+    max_val = float("0.104371")
+    mean = float("-1.48792e-05")
+    std = float("0.0200016")
+    data = None
+
+
+class Program_weight_tensor_parameter_293:
+    name = "parameter_293"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_294:
+    name = "parameter_294"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_295:
+    name = "parameter_295"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_296:
+    name = "parameter_296"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_297:
+    name = "parameter_297"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0608554")
+    max_val = float("0.0718633")
+    mean = float("-0.000365229")
+    std = float("0.019887")
+    data = None
+
+
+class Program_weight_tensor_parameter_298:
+    name = "parameter_298"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0620168")
+    max_val = float("0.0497455")
+    mean = float("-0.00156908")
+    std = float("0.0200533")
+    data = None
+
+
+class Program_weight_tensor_parameter_299:
+    name = "parameter_299"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0796537")
+    max_val = float("0.0544007")
+    mean = float("0.000334917")
+    std = float("0.0197958")
+    data = None
+
+
+class Program_weight_tensor_parameter_300:
+    name = "parameter_300"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0624352")
+    max_val = float("0.0535833")
+    mean = float("-0.000144914")
+    std = float("0.0197108")
+    data = None
+
+
+class Program_weight_tensor_parameter_301:
+    name = "parameter_301"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0979601")
+    max_val = float("0.0979061")
+    mean = float("2.01737e-05")
+    std = float("0.0200193")
+    data = None
+
+
+class Program_weight_tensor_parameter_302:
+    name = "parameter_302"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0933974")
+    max_val = float("0.0889042")
+    mean = float("-2.23701e-05")
+    std = float("0.0199779")
+    data = None
+
+
+class Program_weight_tensor_parameter_303:
+    name = "parameter_303"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0922369")
+    max_val = float("0.0956675")
+    mean = float("-2.1542e-05")
+    std = float("0.0200272")
+    data = None
+
+
+class Program_weight_tensor_parameter_304:
+    name = "parameter_304"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.094925")
+    max_val = float("0.0936901")
+    mean = float("-2.6579e-05")
+    std = float("0.0200015")
+    data = None
+
+
+class Program_weight_tensor_parameter_305:
+    name = "parameter_305"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0972304")
+    max_val = float("0.0887948")
+    mean = float("4.11008e-05")
+    std = float("0.0199814")
+    data = None
+
+
+class Program_weight_tensor_parameter_306:
+    name = "parameter_306"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_307:
+    name = "parameter_307"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.102771")
+    max_val = float("0.1207")
+    mean = float("2.85613e-06")
+    std = float("0.02")
+    data = None
+
+
+class Program_weight_tensor_parameter_308:
+    name = "parameter_308"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_309:
+    name = "parameter_309"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.10016")
+    max_val = float("0.0966208")
+    mean = float("1.10829e-05")
+    std = float("0.0200145")
+    data = None
+
+
+class Program_weight_tensor_parameter_310:
+    name = "parameter_310"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_311:
+    name = "parameter_311"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_312:
+    name = "parameter_312"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_313:
+    name = "parameter_313"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_314:
+    name = "parameter_314"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0639653")
+    max_val = float("0.0644856")
+    mean = float("0.000140734")
+    std = float("0.0204312")
+    data = None
+
+
+class Program_weight_tensor_parameter_315:
+    name = "parameter_315"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0630646")
+    max_val = float("0.0496056")
+    mean = float("-0.000972034")
+    std = float("0.0195664")
+    data = None
+
+
+class Program_weight_tensor_parameter_316:
+    name = "parameter_316"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0683311")
+    max_val = float("0.060249")
+    mean = float("0.000509113")
+    std = float("0.0204194")
+    data = None
+
+
+class Program_weight_tensor_parameter_317:
+    name = "parameter_317"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0867553")
+    max_val = float("0.0595023")
+    mean = float("-0.000186983")
+    std = float("0.0202237")
+    data = None
+
+
+class Program_weight_tensor_parameter_318:
+    name = "parameter_318"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949215")
+    max_val = float("0.0929923")
+    mean = float("-1.48893e-05")
+    std = float("0.0200072")
+    data = None
+
+
+class Program_weight_tensor_parameter_319:
+    name = "parameter_319"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0915239")
+    max_val = float("0.094362")
+    mean = float("9.13134e-06")
+    std = float("0.0199928")
+    data = None
+
+
+class Program_weight_tensor_parameter_320:
+    name = "parameter_320"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0939734")
+    max_val = float("0.0930242")
+    mean = float("-3.54406e-05")
+    std = float("0.0200039")
+    data = None
+
+
+class Program_weight_tensor_parameter_321:
+    name = "parameter_321"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102151")
+    max_val = float("0.0971843")
+    mean = float("1.4659e-05")
+    std = float("0.0199957")
+    data = None
+
+
+class Program_weight_tensor_parameter_322:
+    name = "parameter_322"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.096152")
+    max_val = float("0.0939488")
+    mean = float("4.81302e-06")
+    std = float("0.0199878")
+    data = None
+
+
+class Program_weight_tensor_parameter_323:
+    name = "parameter_323"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_324:
+    name = "parameter_324"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.104435")
+    max_val = float("0.0944035")
+    mean = float("-6.43081e-06")
+    std = float("0.0199896")
+    data = None
+
+
+class Program_weight_tensor_parameter_325:
+    name = "parameter_325"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_326:
+    name = "parameter_326"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.100915")
+    max_val = float("0.103063")
+    mean = float("-1.38518e-05")
+    std = float("0.0199982")
+    data = None
+
+
+class Program_weight_tensor_parameter_327:
+    name = "parameter_327"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_328:
+    name = "parameter_328"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_329:
+    name = "parameter_329"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_330:
+    name = "parameter_330"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_331:
+    name = "parameter_331"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0655106")
+    max_val = float("0.0793115")
+    mean = float("-0.000177196")
+    std = float("0.0200876")
+    data = None
+
+
+class Program_weight_tensor_parameter_332:
+    name = "parameter_332"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0649945")
+    max_val = float("0.0765837")
+    mean = float("-0.00035018")
+    std = float("0.0208553")
+    data = None
+
+
+class Program_weight_tensor_parameter_333:
+    name = "parameter_333"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0724614")
+    max_val = float("0.0697428")
+    mean = float("-0.000303265")
+    std = float("0.0198988")
+    data = None
+
+
+class Program_weight_tensor_parameter_334:
+    name = "parameter_334"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0579749")
+    max_val = float("0.0723804")
+    mean = float("-0.000108606")
+    std = float("0.0206081")
+    data = None
+
+
+class Program_weight_tensor_parameter_335:
+    name = "parameter_335"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0966144")
+    max_val = float("0.0956888")
+    mean = float("4.70503e-05")
+    std = float("0.0199741")
+    data = None
+
+
+class Program_weight_tensor_parameter_336:
+    name = "parameter_336"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0944042")
+    max_val = float("0.0962674")
+    mean = float("-4.50011e-05")
+    std = float("0.0199971")
+    data = None
+
+
+class Program_weight_tensor_parameter_337:
+    name = "parameter_337"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0971175")
+    max_val = float("0.0999373")
+    mean = float("7.93104e-06")
+    std = float("0.0199647")
+    data = None
+
+
+class Program_weight_tensor_parameter_338:
+    name = "parameter_338"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0895577")
+    max_val = float("0.0978528")
+    mean = float("6.31164e-06")
+    std = float("0.0200072")
+    data = None
+
+
+class Program_weight_tensor_parameter_339:
+    name = "parameter_339"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0939715")
+    max_val = float("0.0922678")
+    mean = float("1.26627e-05")
+    std = float("0.019993")
+    data = None
+
+
+class Program_weight_tensor_parameter_340:
+    name = "parameter_340"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_341:
+    name = "parameter_341"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0952799")
+    max_val = float("0.103177")
+    mean = float("-1.78093e-07")
+    std = float("0.0199958")
+    data = None
+
+
+class Program_weight_tensor_parameter_342:
+    name = "parameter_342"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_343:
+    name = "parameter_343"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.107197")
+    max_val = float("0.0964814")
+    mean = float("1.33163e-05")
+    std = float("0.0200002")
+    data = None
+
+
+class Program_weight_tensor_parameter_344:
+    name = "parameter_344"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_345:
+    name = "parameter_345"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_346:
+    name = "parameter_346"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_347:
+    name = "parameter_347"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_348:
+    name = "parameter_348"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0651132")
+    max_val = float("0.0669098")
+    mean = float("0.000271728")
+    std = float("0.0200293")
+    data = None
+
+
+class Program_weight_tensor_parameter_349:
+    name = "parameter_349"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0566132")
+    max_val = float("0.0661084")
+    mean = float("0.000778474")
+    std = float("0.0201461")
+    data = None
+
+
+class Program_weight_tensor_parameter_350:
+    name = "parameter_350"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0573333")
+    max_val = float("0.0605919")
+    mean = float("-0.000275255")
+    std = float("0.0200007")
+    data = None
+
+
+class Program_weight_tensor_parameter_351:
+    name = "parameter_351"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0586884")
+    max_val = float("0.0657262")
+    mean = float("-0.000724771")
+    std = float("0.020373")
+    data = None
+
+
+class Program_weight_tensor_parameter_352:
+    name = "parameter_352"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0955559")
+    max_val = float("0.0866505")
+    mean = float("-1.0279e-05")
+    std = float("0.0199847")
+    data = None
+
+
+class Program_weight_tensor_parameter_353:
+    name = "parameter_353"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.092631")
+    max_val = float("0.0932036")
+    mean = float("-1.70493e-05")
+    std = float("0.0199895")
+    data = None
+
+
+class Program_weight_tensor_parameter_354:
+    name = "parameter_354"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102376")
+    max_val = float("0.0951306")
+    mean = float("-1.46204e-05")
+    std = float("0.0199796")
+    data = None
+
+
+class Program_weight_tensor_parameter_355:
+    name = "parameter_355"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.106232")
+    max_val = float("0.0874891")
+    mean = float("-1.33059e-05")
+    std = float("0.0199711")
+    data = None
+
+
+class Program_weight_tensor_parameter_356:
+    name = "parameter_356"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101956")
+    max_val = float("0.0935847")
+    mean = float("-4.34596e-06")
+    std = float("0.0199674")
+    data = None
+
+
+class Program_weight_tensor_parameter_357:
+    name = "parameter_357"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_358:
+    name = "parameter_358"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0944472")
+    max_val = float("0.10012")
+    mean = float("9.15296e-06")
+    std = float("0.0199866")
+    data = None
+
+
+class Program_weight_tensor_parameter_359:
+    name = "parameter_359"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_360:
+    name = "parameter_360"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.111544")
+    max_val = float("0.104288")
+    mean = float("-8.35623e-06")
+    std = float("0.0199907")
+    data = None
+
+
+class Program_weight_tensor_parameter_361:
+    name = "parameter_361"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_362:
+    name = "parameter_362"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_363:
+    name = "parameter_363"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_364:
+    name = "parameter_364"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_365:
+    name = "parameter_365"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0598522")
+    max_val = float("0.0604418")
+    mean = float("0.000259007")
+    std = float("0.0199941")
+    data = None
+
+
+class Program_weight_tensor_parameter_366:
+    name = "parameter_366"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0635184")
+    max_val = float("0.0538524")
+    mean = float("-0.000558313")
+    std = float("0.020858")
+    data = None
+
+
+class Program_weight_tensor_parameter_367:
+    name = "parameter_367"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.076988")
+    max_val = float("0.0773655")
+    mean = float("-0.000799213")
+    std = float("0.0194891")
+    data = None
+
+
+class Program_weight_tensor_parameter_368:
+    name = "parameter_368"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.052951")
+    max_val = float("0.0621607")
+    mean = float("0.0011092")
+    std = float("0.0196794")
+    data = None
+
+
+class Program_weight_tensor_parameter_369:
+    name = "parameter_369"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949259")
+    max_val = float("0.0924379")
+    mean = float("-1.43068e-05")
+    std = float("0.019978")
+    data = None
+
+
+class Program_weight_tensor_parameter_370:
+    name = "parameter_370"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.098106")
+    max_val = float("0.0937175")
+    mean = float("4.33239e-05")
+    std = float("0.0200302")
+    data = None
+
+
+class Program_weight_tensor_parameter_371:
+    name = "parameter_371"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0950287")
+    max_val = float("0.0926024")
+    mean = float("5.24463e-06")
+    std = float("0.0200123")
+    data = None
+
+
+class Program_weight_tensor_parameter_372:
+    name = "parameter_372"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0971058")
+    max_val = float("0.0922806")
+    mean = float("-4.05307e-05")
+    std = float("0.0199756")
+    data = None
+
+
+class Program_weight_tensor_parameter_373:
+    name = "parameter_373"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.104123")
+    max_val = float("0.0927598")
+    mean = float("-2.64591e-05")
+    std = float("0.020012")
+    data = None
+
+
+class Program_weight_tensor_parameter_374:
+    name = "parameter_374"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_375:
+    name = "parameter_375"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0933842")
+    max_val = float("0.101948")
+    mean = float("-6.79444e-06")
+    std = float("0.0199902")
+    data = None
+
+
+class Program_weight_tensor_parameter_376:
+    name = "parameter_376"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_377:
+    name = "parameter_377"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0945606")
+    max_val = float("0.102819")
+    mean = float("3.25459e-06")
+    std = float("0.0199865")
+    data = None
+
+
+class Program_weight_tensor_parameter_378:
+    name = "parameter_378"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_379:
+    name = "parameter_379"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_380:
+    name = "parameter_380"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_381:
+    name = "parameter_381"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_382:
+    name = "parameter_382"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0812335")
+    max_val = float("0.0642184")
+    mean = float("0.000128249")
+    std = float("0.020208")
+    data = None
+
+
+class Program_weight_tensor_parameter_383:
+    name = "parameter_383"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0620183")
+    max_val = float("0.0594621")
+    mean = float("-0.00105505")
+    std = float("0.020015")
+    data = None
+
+
+class Program_weight_tensor_parameter_384:
+    name = "parameter_384"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0561532")
+    max_val = float("0.069992")
+    mean = float("0.000415732")
+    std = float("0.0194367")
+    data = None
+
+
+class Program_weight_tensor_parameter_385:
+    name = "parameter_385"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0606509")
+    max_val = float("0.0585851")
+    mean = float("-0.000265057")
+    std = float("0.0193097")
+    data = None
+
+
+class Program_weight_tensor_parameter_386:
+    name = "parameter_386"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0893203")
+    max_val = float("0.0974871")
+    mean = float("-3.63362e-06")
+    std = float("0.0200269")
+    data = None
+
+
+class Program_weight_tensor_parameter_387:
+    name = "parameter_387"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0941134")
+    max_val = float("0.0877735")
+    mean = float("-2.65744e-05")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_388:
+    name = "parameter_388"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0936635")
+    max_val = float("0.0988059")
+    mean = float("-3.38022e-05")
+    std = float("0.0199935")
+    data = None
+
+
+class Program_weight_tensor_parameter_389:
+    name = "parameter_389"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0959509")
+    max_val = float("0.0968037")
+    mean = float("-1.03065e-05")
+    std = float("0.0200405")
+    data = None
+
+
+class Program_weight_tensor_parameter_390:
+    name = "parameter_390"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0890728")
+    max_val = float("0.0956342")
+    mean = float("1.55674e-05")
+    std = float("0.0200325")
+    data = None
+
+
+class Program_weight_tensor_parameter_391:
+    name = "parameter_391"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_392:
+    name = "parameter_392"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0943604")
+    max_val = float("0.111684")
+    mean = float("1.39828e-05")
+    std = float("0.0200019")
+    data = None
+
+
+class Program_weight_tensor_parameter_393:
+    name = "parameter_393"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_394:
+    name = "parameter_394"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.100682")
+    max_val = float("0.0971403")
+    mean = float("9.96819e-06")
+    std = float("0.0200029")
+    data = None
+
+
+class Program_weight_tensor_parameter_395:
+    name = "parameter_395"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_396:
+    name = "parameter_396"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_397:
+    name = "parameter_397"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_398:
+    name = "parameter_398"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_399:
+    name = "parameter_399"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0584292")
+    max_val = float("0.0763427")
+    mean = float("-0.000338736")
+    std = float("0.0201055")
+    data = None
+
+
+class Program_weight_tensor_parameter_400:
+    name = "parameter_400"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.063222")
+    max_val = float("0.0555351")
+    mean = float("-0.000473943")
+    std = float("0.0213078")
+    data = None
+
+
+class Program_weight_tensor_parameter_401:
+    name = "parameter_401"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0613989")
+    max_val = float("0.0622419")
+    mean = float("4.58493e-05")
+    std = float("0.0193804")
+    data = None
+
+
+class Program_weight_tensor_parameter_402:
+    name = "parameter_402"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0739278")
+    max_val = float("0.0657546")
+    mean = float("0.000618415")
+    std = float("0.0209295")
+    data = None
+
+
+class Program_weight_tensor_parameter_403:
+    name = "parameter_403"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0947384")
+    max_val = float("0.0932296")
+    mean = float("1.10171e-05")
+    std = float("0.0199879")
+    data = None
+
+
+class Program_weight_tensor_parameter_404:
+    name = "parameter_404"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.105606")
+    max_val = float("0.0955081")
+    mean = float("2.61478e-05")
+    std = float("0.0199997")
+    data = None
+
+
+class Program_weight_tensor_parameter_405:
+    name = "parameter_405"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0955242")
+    max_val = float("0.0950305")
+    mean = float("6.06502e-06")
+    std = float("0.019978")
+    data = None
+
+
+class Program_weight_tensor_parameter_406:
+    name = "parameter_406"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0969456")
+    max_val = float("0.100197")
+    mean = float("-3.20375e-05")
+    std = float("0.0199887")
+    data = None
+
+
+class Program_weight_tensor_parameter_407:
+    name = "parameter_407"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.106133")
+    max_val = float("0.0949257")
+    mean = float("-1.25246e-07")
+    std = float("0.0199849")
+    data = None
+
+
+class Program_weight_tensor_parameter_408:
+    name = "parameter_408"
+    shape = [32000, 768]
+    dtype = "float32"
+    min_val = float("-0.110157")
+    max_val = float("0.108801")
+    mean = float("-8.50089e-07")
+    std = float("0.019998")
+    data = None
+
+
+class Program_weight_tensor_parameter_409:
+    name = "parameter_409"
+    shape = [1, 1, 768]
+    dtype = "float32"
+    min_val = float("-0.060647")
+    max_val = float("0.0587686")
+    mean = float("-0.000567059")
+    std = float("0.0195027")
+    data = None
diff --git a/paddle_samples/PaddleNLP/xlnet-base-cased/graph_net.json b/paddle_samples/PaddleNLP/xlnet-base-cased/graph_net.json
new file mode 100644
index 000000000..6260580c5
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-base-cased/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "xlnet-base-cased",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/xlnet-base-cased/input_meta.py b/paddle_samples/PaddleNLP/xlnet-base-cased/input_meta.py
new file mode 100644
index 000000000..feae33c5c
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-base-cased/input_meta.py
@@ -0,0 +1,42 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 22]
+    dtype = "int64"
+    data = [
+        17,
+        11368,
+        19,
+        94,
+        304,
+        27,
+        2656,
+        9,
+        35,
+        569,
+        1899,
+        75,
+        392,
+        1243,
+        2626,
+        21,
+        58,
+        4797,
+        23,
+        9,
+        4,
+        3,
+    ]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 22]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 22]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
diff --git a/paddle_samples/PaddleNLP/xlnet-base-cased/model.py b/paddle_samples/PaddleNLP/xlnet-base-cased/model.py
new file mode 100644
index 000000000..fa9ce555f
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-base-cased/model.py
@@ -0,0 +1,4369 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        parameter_131,
+        parameter_132,
+        parameter_133,
+        parameter_134,
+        parameter_135,
+        parameter_136,
+        parameter_137,
+        parameter_138,
+        parameter_139,
+        parameter_140,
+        parameter_141,
+        parameter_142,
+        parameter_143,
+        parameter_144,
+        parameter_145,
+        parameter_146,
+        parameter_147,
+        parameter_148,
+        parameter_149,
+        parameter_150,
+        parameter_151,
+        parameter_152,
+        parameter_153,
+        parameter_154,
+        parameter_155,
+        parameter_156,
+        parameter_157,
+        parameter_158,
+        parameter_159,
+        parameter_160,
+        parameter_161,
+        parameter_162,
+        parameter_163,
+        parameter_164,
+        parameter_165,
+        parameter_166,
+        parameter_167,
+        parameter_168,
+        parameter_169,
+        parameter_170,
+        parameter_171,
+        parameter_172,
+        parameter_173,
+        parameter_174,
+        parameter_175,
+        parameter_176,
+        parameter_177,
+        parameter_178,
+        parameter_179,
+        parameter_180,
+        parameter_181,
+        parameter_182,
+        parameter_183,
+        parameter_184,
+        parameter_185,
+        parameter_186,
+        parameter_187,
+        parameter_188,
+        parameter_189,
+        parameter_190,
+        parameter_191,
+        parameter_192,
+        parameter_193,
+        parameter_194,
+        parameter_195,
+        parameter_196,
+        parameter_197,
+        parameter_198,
+        parameter_199,
+        parameter_200,
+        parameter_201,
+        parameter_202,
+        parameter_203,
+        parameter_204,
+        parameter_205,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.transpose: (22x1xi64) <- (1x22xi64)
+        transpose_1 = paddle._C_ops.transpose(data_0, [1, 0])
+        del data_0
+
+        # pd_op.transpose: (22x1xi64) <- (1x22xi64)
+        transpose_2 = paddle._C_ops.transpose(data_1, [1, 0])
+        del data_1
+
+        # pd_op.transpose: (22x1xi64) <- (1x22xi64)
+        transpose_3 = paddle._C_ops.transpose(data_2, [1, 0])
+        del data_2
+
+        # pd_op.cast: (22x1xf32) <- (22x1xi64)
+        cast_0 = paddle._C_ops.cast(transpose_3, paddle.float32)
+        del transpose_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (22x1xf32) <- (22x1xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [0]
+
+        # pd_op.unsqueeze: (1x22x1xf32) <- (22x1xf32, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(scale_0, full_int_array_0)
+        del scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.unsqueeze: (1x22x1x1xf32) <- (1x22x1xf32, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.full: (xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [], float("0"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.greater_than: (1x22x1x1xb) <- (1x22x1x1xf32, xf32)
+        greater_than_0 = paddle._C_ops.greater_than(unsqueeze_1, full_1)
+        del unsqueeze_1
+
+        # pd_op.cast: (1x22x1x1xf32) <- (1x22x1x1xb)
+        cast_1 = paddle._C_ops.cast(greater_than_0, paddle.float32)
+        del greater_than_0
+
+        # pd_op.full: (22xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [22], float("1"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.diag: (22x22xf32) <- (22xf32)
+        diag_0 = paddle._C_ops.diag(full_2, 0, float("0"))
+        del full_2
+
+        # pd_op.scale: (22x22xf32) <- (22x22xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(diag_0, full_0, float("0"), True)
+        del diag_0, full_0
+
+        # pd_op.cast: (22x22xf32) <- (22x22xf32)
+        cast_2 = paddle._C_ops.cast(scale_1, paddle.float32)
+        del scale_1
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_2 = [2, 3]
+
+        # pd_op.unsqueeze: (22x22x1x1xf32) <- (22x22xf32, 2xi64)
+        unsqueeze_2 = paddle._C_ops.unsqueeze(cast_2, full_int_array_2)
+        del cast_2, full_int_array_2
+
+        # pd_op.add: (22x22x1x1xf32) <- (1x22x1x1xf32, 22x22x1x1xf32)
+        add_0 = paddle._C_ops.add(cast_1, unsqueeze_2)
+        del cast_1, unsqueeze_2
+
+        # pd_op.greater_than: (22x22x1x1xb) <- (22x22x1x1xf32, xf32)
+        greater_than_1 = paddle._C_ops.greater_than(add_0, full_1)
+        del add_0, full_1
+
+        # pd_op.cast: (22x22x1x1xf32) <- (22x22x1x1xb)
+        cast_3 = paddle._C_ops.cast(greater_than_1, paddle.float32)
+        del greater_than_1
+
+        # pd_op.embedding: (22x1x768xf32) <- (22x1xi64, 32000x768xf32)
+        embedding_0 = paddle._C_ops.embedding(transpose_1, parameter_204, -1, False)
+        del parameter_204, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [1]
+
+        # pd_op.unsqueeze: (22x1x1xi64) <- (22x1xi64, 1xi64)
+        unsqueeze_3 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_3)
+
+        # pd_op.unsqueeze: (1x22x1xi64) <- (22x1xi64, 1xi64)
+        unsqueeze_4 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_0)
+        del full_int_array_0, transpose_2
+
+        # pd_op.not_equal: (22x22x1xb) <- (22x1x1xi64, 1x22x1xi64)
+        not_equal_0 = paddle._C_ops.not_equal(unsqueeze_3, unsqueeze_4)
+        del unsqueeze_3, unsqueeze_4
+
+        # pd_op.cast: (22x22x1xi64) <- (22x22x1xb)
+        cast_4 = paddle._C_ops.cast(not_equal_0, paddle.int64)
+        del not_equal_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("2"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.one_hot: (22x22x1x2xf32) <- (22x22x1xi64, 1xi32)
+        one_hot_0 = paddle._C_ops.one_hot(
+            cast_4 % paddle.cast(full_4, cast_4.dtype), full_4
+        )
+        del cast_4, full_4
+
+        # pd_op.cast: (22x22x1x2xf32) <- (22x22x1x2xf32)
+        cast_5 = paddle._C_ops.cast(one_hot_0, paddle.float32)
+        del one_hot_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("768"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_7 = paddle._C_ops.full(
+            [1], float("2"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (384xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_5, full_6, full_7, dtype="float32")
+        del full_6, full_7
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("0.00130208"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (384xf32) <- (384xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(arange_0, full_8, float("0"), True)
+        del arange_0, full_8
+
+        # pd_op.full: (384xf32) <- ()
+        full_9 = paddle._C_ops.full(
+            [384],
+            float("10000"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.elementwise_pow: (384xf32) <- (384xf32, 384xf32)
+        elementwise_pow_0 = paddle._C_ops.elementwise_pow(full_9, scale_2)
+        del full_9, scale_2
+
+        # pd_op.full: (384xf32) <- ()
+        full_10 = paddle._C_ops.full(
+            [384],
+            float("1"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.divide: (384xf32) <- (384xf32, 384xf32)
+        divide_0 = paddle._C_ops.divide(full_10, elementwise_pow_0)
+        del elementwise_pow_0, full_10
+
+        # pd_op.full: (1xf64) <- ()
+        full_11 = paddle._C_ops.full(
+            [1], float("22"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_12 = paddle._C_ops.full(
+            [1], float("-22"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_13 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (44xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_1 = paddle.arange(full_11, full_12, full_13, dtype="float32")
+        del full_12, full_13
+
+        # builtin.combine: ([44xf32, 384xf32]) <- (44xf32, 384xf32)
+        combine_0 = [arange_1, divide_0]
+        del arange_1, divide_0
+
+        # pd_op.einsum: (44x384xf32, [0xf32, 0xf32], [44xf32, 384xf32]) <- ([44xf32, 384xf32])
+        einsum_0, einsum_1, einsum_2 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_0, "i,d->id"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_0
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_0,
+            split_1,
+        ) = einsum_1
+        del einsum_1
+
+        # builtin.split: (44xf32, 384xf32) <- ([44xf32, 384xf32])
+        (
+            split_2,
+            split_3,
+        ) = einsum_2
+        del einsum_2
+
+        # pd_op.sin: (44x384xf32) <- (44x384xf32)
+        sin_0 = paddle._C_ops.sin(einsum_0)
+
+        # pd_op.cos: (44x384xf32) <- (44x384xf32)
+        cos_0 = paddle._C_ops.cos(einsum_0)
+        del einsum_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_14 = paddle._C_ops.full(
+            [1], float("-1"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # builtin.combine: ([44x384xf32, 44x384xf32]) <- (44x384xf32, 44x384xf32)
+        combine_1 = [sin_0, cos_0]
+        del cos_0, sin_0
+
+        # pd_op.concat: (44x768xf32) <- ([44x384xf32, 44x384xf32], 1xi32)
+        concat_0 = paddle._C_ops.concat(combine_1, full_14)
+        del combine_1, full_14
+
+        # pd_op.unsqueeze: (44x1x768xf32) <- (44x768xf32, 1xi64)
+        unsqueeze_5 = paddle._C_ops.unsqueeze(concat_0, full_int_array_3)
+        del concat_0
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_4 = [-1, 1, -1]
+
+        # pd_op.expand: (44x1x768xf32) <- (44x1x768xf32, 3xi64)
+        expand_0 = paddle._C_ops.expand(unsqueeze_5, full_int_array_4)
+        del full_int_array_4, unsqueeze_5
+
+        # pd_op.dropout: (44x1x768xf32, 44x1x768xui8) <- (44x1x768xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                expand_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del expand_0
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_203, False, False)
+        del parameter_203
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_5 = [22, 1, 12, 64]
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(matmul_0, full_int_array_5)
+        del matmul_0
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(dropout_0, parameter_202, False, False)
+        del parameter_202
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(matmul_1, full_int_array_5)
+        del matmul_1
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(dropout_0, parameter_201, False, False)
+        del parameter_201
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(matmul_2, full_int_array_5)
+        del matmul_2
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_3 = paddle._C_ops.matmul(dropout_2, parameter_199, False, False)
+        del parameter_199
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_6 = [44, -1, 12, 64]
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_3 = paddle._C_ops.reshape(matmul_3, full_int_array_6)
+        del matmul_3
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_1 = paddle._C_ops.add(reshape_0, parameter_196)
+        del parameter_196
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_2 = [add_1, reshape_1]
+        del add_1, reshape_1
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_3, einsum_4, einsum_5 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_2, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_2
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_4,
+            split_5,
+        ) = einsum_4
+        del einsum_4
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_6,
+            split_7,
+        ) = einsum_5
+        del einsum_5
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_2 = paddle._C_ops.add(reshape_0, parameter_198)
+        del parameter_198
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_3 = [add_2, reshape_3]
+        del add_2, reshape_3
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_6, einsum_7, einsum_8 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_3, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_3
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_8,
+            split_9,
+        ) = einsum_7
+        del einsum_7
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_10,
+            split_11,
+        ) = einsum_8
+        del einsum_8
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_7 = [1, 12, 44, 22]
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(einsum_6, full_int_array_7)
+        del einsum_6
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_8 = [2147483647]
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            reshape_4, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_4
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_9 = [1, 12, 22, 43]
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(slice_0, full_int_array_9)
+        del slice_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_15 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (22xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_2 = paddle.arange(full_5, full_11, full_15, dtype="int64")
+        del full_11, full_15, full_5
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_0 = paddle._C_ops.index_select(reshape_5, arange_2, 3)
+        del reshape_5
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_3 = paddle._C_ops.add(reshape_0, parameter_197)
+        del parameter_197, reshape_0
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_4 = [add_3, parameter_195]
+        del add_3, parameter_195
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_9, einsum_10, einsum_11 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_4, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_4
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_12,
+            split_13,
+        ) = einsum_10
+        del einsum_10
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_14,
+            split_15,
+        ) = einsum_11
+        del einsum_11
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_5 = [cast_5, einsum_9]
+        del einsum_9
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_12, einsum_13, einsum_14 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_5, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_5
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_16,
+            split_17,
+        ) = einsum_13
+        del einsum_13
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_18,
+            split_19,
+        ) = einsum_14
+        del einsum_14
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_4 = paddle._C_ops.add(einsum_3, index_select_0)
+        del einsum_3, index_select_0
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_5 = paddle._C_ops.add(add_4, einsum_12)
+        del add_4, einsum_12
+
+        # pd_op.full: (1xf32) <- ()
+        full_16 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(add_5, full_16, float("0"), True)
+        del add_5
+
+        # pd_op.transpose: (1x1x22x22xf32) <- (22x22x1x1xf32)
+        transpose_4 = paddle._C_ops.transpose(cast_3, [2, 3, 0, 1])
+        del cast_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_17 = paddle._C_ops.full(
+            [1], float("1e+30"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x22x22xf32) <- (1x1x22x22xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(transpose_4, full_17, float("0"), True)
+        del full_17, transpose_4
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_0 = paddle._C_ops.subtract(scale_3, scale_4)
+        del scale_3
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_0 = paddle._C_ops.softmax(subtract_0, 3)
+        del subtract_0
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_6 = [dropout_4, reshape_2]
+        del dropout_4, reshape_2
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_15, einsum_16, einsum_17 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_6, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_6
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_20,
+            split_21,
+        ) = einsum_16
+        del einsum_16
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_22,
+            split_23,
+        ) = einsum_17
+        del einsum_17
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_10 = [22, 1, 768]
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_6 = paddle._C_ops.reshape(einsum_15, full_int_array_10)
+        del einsum_15
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_7 = [reshape_6, parameter_200]
+        del parameter_200, reshape_6
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_18, einsum_19, einsum_20 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_7, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_7
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_24,
+            split_25,
+        ) = einsum_19
+        del einsum_19
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_26,
+            split_27,
+        ) = einsum_20
+        del einsum_20
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_18
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_6 = paddle._C_ops.add(dropout_6, dropout_0)
+        del dropout_0, dropout_6
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_6, parameter_194, parameter_193, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_6, parameter_193, parameter_194
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_4 = paddle._C_ops.matmul(layer_norm_0, parameter_190, False, False)
+        del parameter_190
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_7 = paddle._C_ops.add(matmul_4, parameter_189)
+        del matmul_4, parameter_189
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_0 = paddle._C_ops.gelu(add_7, False)
+        del add_7
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_0
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_8, parameter_188, False, False)
+        del dropout_8, parameter_188
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_8 = paddle._C_ops.add(matmul_5, parameter_187)
+        del matmul_5, parameter_187
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_8
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_9 = paddle._C_ops.add(dropout_10, layer_norm_0)
+        del dropout_10, layer_norm_0
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_9, parameter_192, parameter_191, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_9, parameter_191, parameter_192
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_6 = paddle._C_ops.matmul(layer_norm_3, parameter_186, False, False)
+        del parameter_186
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_7 = paddle._C_ops.reshape(matmul_6, full_int_array_5)
+        del matmul_6
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_185, False, False)
+        del parameter_185
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(matmul_7, full_int_array_5)
+        del matmul_7
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_8 = paddle._C_ops.matmul(layer_norm_3, parameter_184, False, False)
+        del parameter_184
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(matmul_8, full_int_array_5)
+        del matmul_8
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(dropout_2, parameter_182, False, False)
+        del parameter_182
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(matmul_9, full_int_array_6)
+        del matmul_9
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_10 = paddle._C_ops.add(reshape_7, parameter_179)
+        del parameter_179
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_8 = [add_10, reshape_8]
+        del add_10, reshape_8
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_21, einsum_22, einsum_23 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_8, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_8
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_28,
+            split_29,
+        ) = einsum_22
+        del einsum_22
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_30,
+            split_31,
+        ) = einsum_23
+        del einsum_23
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_11 = paddle._C_ops.add(reshape_7, parameter_181)
+        del parameter_181
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_9 = [add_11, reshape_10]
+        del add_11, reshape_10
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_24, einsum_25, einsum_26 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_9, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_9
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_32,
+            split_33,
+        ) = einsum_25
+        del einsum_25
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_34,
+            split_35,
+        ) = einsum_26
+        del einsum_26
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_11 = paddle._C_ops.reshape(einsum_24, full_int_array_7)
+        del einsum_24
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            reshape_11, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_11
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(slice_1, full_int_array_9)
+        del slice_1
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_1 = paddle._C_ops.index_select(reshape_12, arange_2, 3)
+        del reshape_12
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_12 = paddle._C_ops.add(reshape_7, parameter_180)
+        del parameter_180, reshape_7
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_10 = [add_12, parameter_178]
+        del add_12, parameter_178
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_27, einsum_28, einsum_29 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_10, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_10
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_36,
+            split_37,
+        ) = einsum_28
+        del einsum_28
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_38,
+            split_39,
+        ) = einsum_29
+        del einsum_29
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_11 = [cast_5, einsum_27]
+        del einsum_27
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_30, einsum_31, einsum_32 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_11, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_11
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_40,
+            split_41,
+        ) = einsum_31
+        del einsum_31
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_42,
+            split_43,
+        ) = einsum_32
+        del einsum_32
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_13 = paddle._C_ops.add(einsum_21, index_select_1)
+        del einsum_21, index_select_1
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_14 = paddle._C_ops.add(add_13, einsum_30)
+        del add_13, einsum_30
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(add_14, full_16, float("0"), True)
+        del add_14
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_1 = paddle._C_ops.subtract(scale_5, scale_4)
+        del scale_5
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_1 = paddle._C_ops.softmax(subtract_1, 3)
+        del subtract_1
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_12 = [dropout_12, reshape_9]
+        del dropout_12, reshape_9
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_33, einsum_34, einsum_35 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_12, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_12
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_44,
+            split_45,
+        ) = einsum_34
+        del einsum_34
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_46,
+            split_47,
+        ) = einsum_35
+        del einsum_35
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_13 = paddle._C_ops.reshape(einsum_33, full_int_array_10)
+        del einsum_33
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_13 = [reshape_13, parameter_183]
+        del parameter_183, reshape_13
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_36, einsum_37, einsum_38 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_13, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_13
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_48,
+            split_49,
+        ) = einsum_37
+        del einsum_37
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_50,
+            split_51,
+        ) = einsum_38
+        del einsum_38
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_36, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_36
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_15 = paddle._C_ops.add(dropout_14, layer_norm_3)
+        del dropout_14, layer_norm_3
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_15, parameter_177, parameter_176, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_15, parameter_176, parameter_177
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_173, False, False)
+        del parameter_173
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_16 = paddle._C_ops.add(matmul_10, parameter_172)
+        del matmul_10, parameter_172
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_1 = paddle._C_ops.gelu(add_16, False)
+        del add_16
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_1
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_11 = paddle._C_ops.matmul(dropout_16, parameter_171, False, False)
+        del dropout_16, parameter_171
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_17 = paddle._C_ops.add(matmul_11, parameter_170)
+        del matmul_11, parameter_170
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_17
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_18 = paddle._C_ops.add(dropout_18, layer_norm_6)
+        del dropout_18, layer_norm_6
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_175, parameter_174, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18, parameter_174, parameter_175
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_12 = paddle._C_ops.matmul(layer_norm_9, parameter_169, False, False)
+        del parameter_169
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(matmul_12, full_int_array_5)
+        del matmul_12
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_13 = paddle._C_ops.matmul(layer_norm_9, parameter_168, False, False)
+        del parameter_168
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_15 = paddle._C_ops.reshape(matmul_13, full_int_array_5)
+        del matmul_13
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_14 = paddle._C_ops.matmul(layer_norm_9, parameter_167, False, False)
+        del parameter_167
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(matmul_14, full_int_array_5)
+        del matmul_14
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_15 = paddle._C_ops.matmul(dropout_2, parameter_165, False, False)
+        del parameter_165
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(matmul_15, full_int_array_6)
+        del matmul_15
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_19 = paddle._C_ops.add(reshape_14, parameter_162)
+        del parameter_162
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_14 = [add_19, reshape_15]
+        del add_19, reshape_15
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_39, einsum_40, einsum_41 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_14, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_14
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_52,
+            split_53,
+        ) = einsum_40
+        del einsum_40
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_54,
+            split_55,
+        ) = einsum_41
+        del einsum_41
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_20 = paddle._C_ops.add(reshape_14, parameter_164)
+        del parameter_164
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_15 = [add_20, reshape_17]
+        del add_20, reshape_17
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_42, einsum_43, einsum_44 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_15, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_15
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_56,
+            split_57,
+        ) = einsum_43
+        del einsum_43
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_58,
+            split_59,
+        ) = einsum_44
+        del einsum_44
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(einsum_42, full_int_array_7)
+        del einsum_42
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_2 = paddle._C_ops.slice(
+            reshape_18, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_18
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_19 = paddle._C_ops.reshape(slice_2, full_int_array_9)
+        del slice_2
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_2 = paddle._C_ops.index_select(reshape_19, arange_2, 3)
+        del reshape_19
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_21 = paddle._C_ops.add(reshape_14, parameter_163)
+        del parameter_163, reshape_14
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_16 = [add_21, parameter_161]
+        del add_21, parameter_161
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_45, einsum_46, einsum_47 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_16, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_16
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_60,
+            split_61,
+        ) = einsum_46
+        del einsum_46
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_62,
+            split_63,
+        ) = einsum_47
+        del einsum_47
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_17 = [cast_5, einsum_45]
+        del einsum_45
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_48, einsum_49, einsum_50 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_17, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_17
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_64,
+            split_65,
+        ) = einsum_49
+        del einsum_49
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_66,
+            split_67,
+        ) = einsum_50
+        del einsum_50
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_22 = paddle._C_ops.add(einsum_39, index_select_2)
+        del einsum_39, index_select_2
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_23 = paddle._C_ops.add(add_22, einsum_48)
+        del add_22, einsum_48
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(add_23, full_16, float("0"), True)
+        del add_23
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_2 = paddle._C_ops.subtract(scale_6, scale_4)
+        del scale_6
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_2 = paddle._C_ops.softmax(subtract_2, 3)
+        del subtract_2
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_18 = [dropout_20, reshape_16]
+        del dropout_20, reshape_16
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_51, einsum_52, einsum_53 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_18, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_18
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_68,
+            split_69,
+        ) = einsum_52
+        del einsum_52
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_70,
+            split_71,
+        ) = einsum_53
+        del einsum_53
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_20 = paddle._C_ops.reshape(einsum_51, full_int_array_10)
+        del einsum_51
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_19 = [reshape_20, parameter_166]
+        del parameter_166, reshape_20
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_54, einsum_55, einsum_56 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_19, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_19
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_72,
+            split_73,
+        ) = einsum_55
+        del einsum_55
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_74,
+            split_75,
+        ) = einsum_56
+        del einsum_56
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_54, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_54
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_24 = paddle._C_ops.add(dropout_22, layer_norm_9)
+        del dropout_22, layer_norm_9
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_24, parameter_160, parameter_159, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_24, parameter_159, parameter_160
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_16 = paddle._C_ops.matmul(layer_norm_12, parameter_156, False, False)
+        del parameter_156
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_25 = paddle._C_ops.add(matmul_16, parameter_155)
+        del matmul_16, parameter_155
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_2 = paddle._C_ops.gelu(add_25, False)
+        del add_25
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_2
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_17 = paddle._C_ops.matmul(dropout_24, parameter_154, False, False)
+        del dropout_24, parameter_154
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_26 = paddle._C_ops.add(matmul_17, parameter_153)
+        del matmul_17, parameter_153
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_26, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_26
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_27 = paddle._C_ops.add(dropout_26, layer_norm_12)
+        del dropout_26, layer_norm_12
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_27, parameter_158, parameter_157, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_27, parameter_157, parameter_158
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_15, parameter_152, False, False)
+        del parameter_152
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(matmul_18, full_int_array_5)
+        del matmul_18
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_15, parameter_151, False, False)
+        del parameter_151
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(matmul_19, full_int_array_5)
+        del matmul_19
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_20 = paddle._C_ops.matmul(layer_norm_15, parameter_150, False, False)
+        del parameter_150
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_23 = paddle._C_ops.reshape(matmul_20, full_int_array_5)
+        del matmul_20
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_2, parameter_148, False, False)
+        del parameter_148
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(matmul_21, full_int_array_6)
+        del matmul_21
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_28 = paddle._C_ops.add(reshape_21, parameter_145)
+        del parameter_145
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_20 = [add_28, reshape_22]
+        del add_28, reshape_22
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_57, einsum_58, einsum_59 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_20, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_20
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_76,
+            split_77,
+        ) = einsum_58
+        del einsum_58
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_78,
+            split_79,
+        ) = einsum_59
+        del einsum_59
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_29 = paddle._C_ops.add(reshape_21, parameter_147)
+        del parameter_147
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_21 = [add_29, reshape_24]
+        del add_29, reshape_24
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_60, einsum_61, einsum_62 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_21, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_21
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_80,
+            split_81,
+        ) = einsum_61
+        del einsum_61
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_82,
+            split_83,
+        ) = einsum_62
+        del einsum_62
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(einsum_60, full_int_array_7)
+        del einsum_60
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_3 = paddle._C_ops.slice(
+            reshape_25, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_25
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(slice_3, full_int_array_9)
+        del slice_3
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_3 = paddle._C_ops.index_select(reshape_26, arange_2, 3)
+        del reshape_26
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_30 = paddle._C_ops.add(reshape_21, parameter_146)
+        del parameter_146, reshape_21
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_22 = [add_30, parameter_144]
+        del add_30, parameter_144
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_63, einsum_64, einsum_65 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_22, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_22
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_84,
+            split_85,
+        ) = einsum_64
+        del einsum_64
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_86,
+            split_87,
+        ) = einsum_65
+        del einsum_65
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_23 = [cast_5, einsum_63]
+        del einsum_63
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_66, einsum_67, einsum_68 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_23, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_23
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_88,
+            split_89,
+        ) = einsum_67
+        del einsum_67
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_90,
+            split_91,
+        ) = einsum_68
+        del einsum_68
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_31 = paddle._C_ops.add(einsum_57, index_select_3)
+        del einsum_57, index_select_3
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_32 = paddle._C_ops.add(add_31, einsum_66)
+        del add_31, einsum_66
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(add_32, full_16, float("0"), True)
+        del add_32
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_3 = paddle._C_ops.subtract(scale_7, scale_4)
+        del scale_7
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_3 = paddle._C_ops.softmax(subtract_3, 3)
+        del subtract_3
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_24 = [dropout_28, reshape_23]
+        del dropout_28, reshape_23
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_69, einsum_70, einsum_71 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_24, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_24
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_92,
+            split_93,
+        ) = einsum_70
+        del einsum_70
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_94,
+            split_95,
+        ) = einsum_71
+        del einsum_71
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(einsum_69, full_int_array_10)
+        del einsum_69
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_25 = [reshape_27, parameter_149]
+        del parameter_149, reshape_27
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_72, einsum_73, einsum_74 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_25, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_25
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_96,
+            split_97,
+        ) = einsum_73
+        del einsum_73
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_98,
+            split_99,
+        ) = einsum_74
+        del einsum_74
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_72, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_72
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_33 = paddle._C_ops.add(dropout_30, layer_norm_15)
+        del dropout_30, layer_norm_15
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_33, parameter_143, parameter_142, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_33, parameter_142, parameter_143
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_22 = paddle._C_ops.matmul(layer_norm_18, parameter_139, False, False)
+        del parameter_139
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_34 = paddle._C_ops.add(matmul_22, parameter_138)
+        del matmul_22, parameter_138
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_3 = paddle._C_ops.gelu(add_34, False)
+        del add_34
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_3
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_23 = paddle._C_ops.matmul(dropout_32, parameter_137, False, False)
+        del dropout_32, parameter_137
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_35 = paddle._C_ops.add(matmul_23, parameter_136)
+        del matmul_23, parameter_136
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_35, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_35
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_36 = paddle._C_ops.add(dropout_34, layer_norm_18)
+        del dropout_34, layer_norm_18
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_36, parameter_141, parameter_140, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_36, parameter_140, parameter_141
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_24 = paddle._C_ops.matmul(layer_norm_21, parameter_135, False, False)
+        del parameter_135
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(matmul_24, full_int_array_5)
+        del matmul_24
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_21, parameter_134, False, False)
+        del parameter_134
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(matmul_25, full_int_array_5)
+        del matmul_25
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_21, parameter_133, False, False)
+        del parameter_133
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(matmul_26, full_int_array_5)
+        del matmul_26
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_27 = paddle._C_ops.matmul(dropout_2, parameter_131, False, False)
+        del parameter_131
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_31 = paddle._C_ops.reshape(matmul_27, full_int_array_6)
+        del matmul_27
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_37 = paddle._C_ops.add(reshape_28, parameter_128)
+        del parameter_128
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_26 = [add_37, reshape_29]
+        del add_37, reshape_29
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_75, einsum_76, einsum_77 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_26, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_26
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_100,
+            split_101,
+        ) = einsum_76
+        del einsum_76
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_102,
+            split_103,
+        ) = einsum_77
+        del einsum_77
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_38 = paddle._C_ops.add(reshape_28, parameter_130)
+        del parameter_130
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_27 = [add_38, reshape_31]
+        del add_38, reshape_31
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_78, einsum_79, einsum_80 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_27, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_27
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_104,
+            split_105,
+        ) = einsum_79
+        del einsum_79
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_106,
+            split_107,
+        ) = einsum_80
+        del einsum_80
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(einsum_78, full_int_array_7)
+        del einsum_78
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_4 = paddle._C_ops.slice(
+            reshape_32, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_32
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(slice_4, full_int_array_9)
+        del slice_4
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_4 = paddle._C_ops.index_select(reshape_33, arange_2, 3)
+        del reshape_33
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_39 = paddle._C_ops.add(reshape_28, parameter_129)
+        del parameter_129, reshape_28
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_28 = [add_39, parameter_127]
+        del add_39, parameter_127
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_81, einsum_82, einsum_83 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_28, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_28
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_108,
+            split_109,
+        ) = einsum_82
+        del einsum_82
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_110,
+            split_111,
+        ) = einsum_83
+        del einsum_83
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_29 = [cast_5, einsum_81]
+        del einsum_81
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_84, einsum_85, einsum_86 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_29, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_29
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_112,
+            split_113,
+        ) = einsum_85
+        del einsum_85
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_114,
+            split_115,
+        ) = einsum_86
+        del einsum_86
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_40 = paddle._C_ops.add(einsum_75, index_select_4)
+        del einsum_75, index_select_4
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_41 = paddle._C_ops.add(add_40, einsum_84)
+        del add_40, einsum_84
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(add_41, full_16, float("0"), True)
+        del add_41
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_4 = paddle._C_ops.subtract(scale_8, scale_4)
+        del scale_8
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_4 = paddle._C_ops.softmax(subtract_4, 3)
+        del subtract_4
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_30 = [dropout_36, reshape_30]
+        del dropout_36, reshape_30
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_87, einsum_88, einsum_89 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_30, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_30
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_116,
+            split_117,
+        ) = einsum_88
+        del einsum_88
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_118,
+            split_119,
+        ) = einsum_89
+        del einsum_89
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_34 = paddle._C_ops.reshape(einsum_87, full_int_array_10)
+        del einsum_87
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_31 = [reshape_34, parameter_132]
+        del parameter_132, reshape_34
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_90, einsum_91, einsum_92 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_31, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_31
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_120,
+            split_121,
+        ) = einsum_91
+        del einsum_91
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_122,
+            split_123,
+        ) = einsum_92
+        del einsum_92
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_90, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_90
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_42 = paddle._C_ops.add(dropout_38, layer_norm_21)
+        del dropout_38, layer_norm_21
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_126, parameter_125, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42, parameter_125, parameter_126
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_28 = paddle._C_ops.matmul(layer_norm_24, parameter_122, False, False)
+        del parameter_122
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_43 = paddle._C_ops.add(matmul_28, parameter_121)
+        del matmul_28, parameter_121
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_4 = paddle._C_ops.gelu(add_43, False)
+        del add_43
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_4
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_40, parameter_120, False, False)
+        del dropout_40, parameter_120
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_44 = paddle._C_ops.add(matmul_29, parameter_119)
+        del matmul_29, parameter_119
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_44, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_44
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_45 = paddle._C_ops.add(dropout_42, layer_norm_24)
+        del dropout_42, layer_norm_24
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_45, parameter_124, parameter_123, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_45, parameter_123, parameter_124
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_30 = paddle._C_ops.matmul(layer_norm_27, parameter_118, False, False)
+        del parameter_118
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_35 = paddle._C_ops.reshape(matmul_30, full_int_array_5)
+        del matmul_30
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_27, parameter_117, False, False)
+        del parameter_117
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(matmul_31, full_int_array_5)
+        del matmul_31
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_32 = paddle._C_ops.matmul(layer_norm_27, parameter_116, False, False)
+        del parameter_116
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(matmul_32, full_int_array_5)
+        del matmul_32
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(dropout_2, parameter_114, False, False)
+        del parameter_114
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(matmul_33, full_int_array_6)
+        del matmul_33
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_46 = paddle._C_ops.add(reshape_35, parameter_111)
+        del parameter_111
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_32 = [add_46, reshape_36]
+        del add_46, reshape_36
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_93, einsum_94, einsum_95 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_32, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_32
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_124,
+            split_125,
+        ) = einsum_94
+        del einsum_94
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_126,
+            split_127,
+        ) = einsum_95
+        del einsum_95
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_47 = paddle._C_ops.add(reshape_35, parameter_113)
+        del parameter_113
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_33 = [add_47, reshape_38]
+        del add_47, reshape_38
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_96, einsum_97, einsum_98 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_33, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_33
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_128,
+            split_129,
+        ) = einsum_97
+        del einsum_97
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_130,
+            split_131,
+        ) = einsum_98
+        del einsum_98
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_39 = paddle._C_ops.reshape(einsum_96, full_int_array_7)
+        del einsum_96
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_5 = paddle._C_ops.slice(
+            reshape_39, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_39
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(slice_5, full_int_array_9)
+        del slice_5
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_5 = paddle._C_ops.index_select(reshape_40, arange_2, 3)
+        del reshape_40
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_48 = paddle._C_ops.add(reshape_35, parameter_112)
+        del parameter_112, reshape_35
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_34 = [add_48, parameter_110]
+        del add_48, parameter_110
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_99, einsum_100, einsum_101 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_34, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_34
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_132,
+            split_133,
+        ) = einsum_100
+        del einsum_100
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_134,
+            split_135,
+        ) = einsum_101
+        del einsum_101
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_35 = [cast_5, einsum_99]
+        del einsum_99
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_102, einsum_103, einsum_104 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_35, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_35
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_136,
+            split_137,
+        ) = einsum_103
+        del einsum_103
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_138,
+            split_139,
+        ) = einsum_104
+        del einsum_104
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_49 = paddle._C_ops.add(einsum_93, index_select_5)
+        del einsum_93, index_select_5
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_50 = paddle._C_ops.add(add_49, einsum_102)
+        del add_49, einsum_102
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(add_50, full_16, float("0"), True)
+        del add_50
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_5 = paddle._C_ops.subtract(scale_9, scale_4)
+        del scale_9
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_5 = paddle._C_ops.softmax(subtract_5, 3)
+        del subtract_5
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_36 = [dropout_44, reshape_37]
+        del dropout_44, reshape_37
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_105, einsum_106, einsum_107 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_36, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_36
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_140,
+            split_141,
+        ) = einsum_106
+        del einsum_106
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_142,
+            split_143,
+        ) = einsum_107
+        del einsum_107
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_41 = paddle._C_ops.reshape(einsum_105, full_int_array_10)
+        del einsum_105
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_37 = [reshape_41, parameter_115]
+        del parameter_115, reshape_41
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_108, einsum_109, einsum_110 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_37, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_37
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_144,
+            split_145,
+        ) = einsum_109
+        del einsum_109
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_146,
+            split_147,
+        ) = einsum_110
+        del einsum_110
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_108, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_108
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_51 = paddle._C_ops.add(dropout_46, layer_norm_27)
+        del dropout_46, layer_norm_27
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_51, parameter_109, parameter_108, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_51, parameter_108, parameter_109
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_30, parameter_105, False, False)
+        del parameter_105
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_52 = paddle._C_ops.add(matmul_34, parameter_104)
+        del matmul_34, parameter_104
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_5 = paddle._C_ops.gelu(add_52, False)
+        del add_52
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_5
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_35 = paddle._C_ops.matmul(dropout_48, parameter_103, False, False)
+        del dropout_48, parameter_103
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_53 = paddle._C_ops.add(matmul_35, parameter_102)
+        del matmul_35, parameter_102
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_53, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_53
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_54 = paddle._C_ops.add(dropout_50, layer_norm_30)
+        del dropout_50, layer_norm_30
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_54, parameter_107, parameter_106, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_54, parameter_106, parameter_107
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_36 = paddle._C_ops.matmul(layer_norm_33, parameter_101, False, False)
+        del parameter_101
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(matmul_36, full_int_array_5)
+        del matmul_36
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_37 = paddle._C_ops.matmul(layer_norm_33, parameter_100, False, False)
+        del parameter_100
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_43 = paddle._C_ops.reshape(matmul_37, full_int_array_5)
+        del matmul_37
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_38 = paddle._C_ops.matmul(layer_norm_33, parameter_99, False, False)
+        del parameter_99
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(matmul_38, full_int_array_5)
+        del matmul_38
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_39 = paddle._C_ops.matmul(dropout_2, parameter_97, False, False)
+        del parameter_97
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(matmul_39, full_int_array_6)
+        del matmul_39
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_55 = paddle._C_ops.add(reshape_42, parameter_94)
+        del parameter_94
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_38 = [add_55, reshape_43]
+        del add_55, reshape_43
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_111, einsum_112, einsum_113 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_38, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_38
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_148,
+            split_149,
+        ) = einsum_112
+        del einsum_112
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_150,
+            split_151,
+        ) = einsum_113
+        del einsum_113
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_56 = paddle._C_ops.add(reshape_42, parameter_96)
+        del parameter_96
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_39 = [add_56, reshape_45]
+        del add_56, reshape_45
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_114, einsum_115, einsum_116 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_39, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_39
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_152,
+            split_153,
+        ) = einsum_115
+        del einsum_115
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_154,
+            split_155,
+        ) = einsum_116
+        del einsum_116
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(einsum_114, full_int_array_7)
+        del einsum_114
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_6 = paddle._C_ops.slice(
+            reshape_46, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_46
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_47 = paddle._C_ops.reshape(slice_6, full_int_array_9)
+        del slice_6
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_6 = paddle._C_ops.index_select(reshape_47, arange_2, 3)
+        del reshape_47
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_57 = paddle._C_ops.add(reshape_42, parameter_95)
+        del parameter_95, reshape_42
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_40 = [add_57, parameter_93]
+        del add_57, parameter_93
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_117, einsum_118, einsum_119 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_40, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_40
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_156,
+            split_157,
+        ) = einsum_118
+        del einsum_118
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_158,
+            split_159,
+        ) = einsum_119
+        del einsum_119
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_41 = [cast_5, einsum_117]
+        del einsum_117
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_120, einsum_121, einsum_122 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_41, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_41
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_160,
+            split_161,
+        ) = einsum_121
+        del einsum_121
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_162,
+            split_163,
+        ) = einsum_122
+        del einsum_122
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_58 = paddle._C_ops.add(einsum_111, index_select_6)
+        del einsum_111, index_select_6
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_59 = paddle._C_ops.add(add_58, einsum_120)
+        del add_58, einsum_120
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(add_59, full_16, float("0"), True)
+        del add_59
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_6 = paddle._C_ops.subtract(scale_10, scale_4)
+        del scale_10
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_6 = paddle._C_ops.softmax(subtract_6, 3)
+        del subtract_6
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_42 = [dropout_52, reshape_44]
+        del dropout_52, reshape_44
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_123, einsum_124, einsum_125 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_42, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_42
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_164,
+            split_165,
+        ) = einsum_124
+        del einsum_124
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_166,
+            split_167,
+        ) = einsum_125
+        del einsum_125
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_48 = paddle._C_ops.reshape(einsum_123, full_int_array_10)
+        del einsum_123
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_43 = [reshape_48, parameter_98]
+        del parameter_98, reshape_48
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_126, einsum_127, einsum_128 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_43, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_43
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_168,
+            split_169,
+        ) = einsum_127
+        del einsum_127
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_170,
+            split_171,
+        ) = einsum_128
+        del einsum_128
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_126, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_126
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_60 = paddle._C_ops.add(dropout_54, layer_norm_33)
+        del dropout_54, layer_norm_33
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_60, parameter_92, parameter_91, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_60, parameter_91, parameter_92
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_40 = paddle._C_ops.matmul(layer_norm_36, parameter_88, False, False)
+        del parameter_88
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_61 = paddle._C_ops.add(matmul_40, parameter_87)
+        del matmul_40, parameter_87
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_6 = paddle._C_ops.gelu(add_61, False)
+        del add_61
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_6
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_41 = paddle._C_ops.matmul(dropout_56, parameter_86, False, False)
+        del dropout_56, parameter_86
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_62 = paddle._C_ops.add(matmul_41, parameter_85)
+        del matmul_41, parameter_85
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_62, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_62
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_63 = paddle._C_ops.add(dropout_58, layer_norm_36)
+        del dropout_58, layer_norm_36
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_63, parameter_90, parameter_89, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_63, parameter_89, parameter_90
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_39, parameter_84, False, False)
+        del parameter_84
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_49 = paddle._C_ops.reshape(matmul_42, full_int_array_5)
+        del matmul_42
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_39, parameter_83, False, False)
+        del parameter_83
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(matmul_43, full_int_array_5)
+        del matmul_43
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_44 = paddle._C_ops.matmul(layer_norm_39, parameter_82, False, False)
+        del parameter_82
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_51 = paddle._C_ops.reshape(matmul_44, full_int_array_5)
+        del matmul_44
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_2, parameter_80, False, False)
+        del parameter_80
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(matmul_45, full_int_array_6)
+        del matmul_45
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_64 = paddle._C_ops.add(reshape_49, parameter_77)
+        del parameter_77
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_44 = [add_64, reshape_50]
+        del add_64, reshape_50
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_129, einsum_130, einsum_131 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_44, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_44
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_172,
+            split_173,
+        ) = einsum_130
+        del einsum_130
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_174,
+            split_175,
+        ) = einsum_131
+        del einsum_131
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_65 = paddle._C_ops.add(reshape_49, parameter_79)
+        del parameter_79
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_45 = [add_65, reshape_52]
+        del add_65, reshape_52
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_132, einsum_133, einsum_134 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_45, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_45
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_176,
+            split_177,
+        ) = einsum_133
+        del einsum_133
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_178,
+            split_179,
+        ) = einsum_134
+        del einsum_134
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_53 = paddle._C_ops.reshape(einsum_132, full_int_array_7)
+        del einsum_132
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_7 = paddle._C_ops.slice(
+            reshape_53, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_53
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(slice_7, full_int_array_9)
+        del slice_7
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_7 = paddle._C_ops.index_select(reshape_54, arange_2, 3)
+        del reshape_54
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_66 = paddle._C_ops.add(reshape_49, parameter_78)
+        del parameter_78, reshape_49
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_46 = [add_66, parameter_76]
+        del add_66, parameter_76
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_135, einsum_136, einsum_137 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_46, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_46
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_180,
+            split_181,
+        ) = einsum_136
+        del einsum_136
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_182,
+            split_183,
+        ) = einsum_137
+        del einsum_137
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_47 = [cast_5, einsum_135]
+        del einsum_135
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_138, einsum_139, einsum_140 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_47, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_47
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_184,
+            split_185,
+        ) = einsum_139
+        del einsum_139
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_186,
+            split_187,
+        ) = einsum_140
+        del einsum_140
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_67 = paddle._C_ops.add(einsum_129, index_select_7)
+        del einsum_129, index_select_7
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_68 = paddle._C_ops.add(add_67, einsum_138)
+        del add_67, einsum_138
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(add_68, full_16, float("0"), True)
+        del add_68
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_7 = paddle._C_ops.subtract(scale_11, scale_4)
+        del scale_11
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_7 = paddle._C_ops.softmax(subtract_7, 3)
+        del subtract_7
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_48 = [dropout_60, reshape_51]
+        del dropout_60, reshape_51
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_141, einsum_142, einsum_143 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_48, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_48
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_188,
+            split_189,
+        ) = einsum_142
+        del einsum_142
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_190,
+            split_191,
+        ) = einsum_143
+        del einsum_143
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_55 = paddle._C_ops.reshape(einsum_141, full_int_array_10)
+        del einsum_141
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_49 = [reshape_55, parameter_81]
+        del parameter_81, reshape_55
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_144, einsum_145, einsum_146 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_49, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_49
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_192,
+            split_193,
+        ) = einsum_145
+        del einsum_145
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_194,
+            split_195,
+        ) = einsum_146
+        del einsum_146
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_144, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_144
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_69 = paddle._C_ops.add(dropout_62, layer_norm_39)
+        del dropout_62, layer_norm_39
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_69, parameter_75, parameter_74, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_69, parameter_74, parameter_75
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_46 = paddle._C_ops.matmul(layer_norm_42, parameter_71, False, False)
+        del parameter_71
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_70 = paddle._C_ops.add(matmul_46, parameter_70)
+        del matmul_46, parameter_70
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_7 = paddle._C_ops.gelu(add_70, False)
+        del add_70
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_7
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_47 = paddle._C_ops.matmul(dropout_64, parameter_69, False, False)
+        del dropout_64, parameter_69
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_71 = paddle._C_ops.add(matmul_47, parameter_68)
+        del matmul_47, parameter_68
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_71, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_71
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_72 = paddle._C_ops.add(dropout_66, layer_norm_42)
+        del dropout_66, layer_norm_42
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_72, parameter_73, parameter_72, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_72, parameter_72, parameter_73
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_48 = paddle._C_ops.matmul(layer_norm_45, parameter_67, False, False)
+        del parameter_67
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(matmul_48, full_int_array_5)
+        del matmul_48
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_45, parameter_66, False, False)
+        del parameter_66
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_57 = paddle._C_ops.reshape(matmul_49, full_int_array_5)
+        del matmul_49
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_45, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(matmul_50, full_int_array_5)
+        del matmul_50
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_51 = paddle._C_ops.matmul(dropout_2, parameter_63, False, False)
+        del parameter_63
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_59 = paddle._C_ops.reshape(matmul_51, full_int_array_6)
+        del matmul_51
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_73 = paddle._C_ops.add(reshape_56, parameter_60)
+        del parameter_60
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_50 = [add_73, reshape_57]
+        del add_73, reshape_57
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_147, einsum_148, einsum_149 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_50, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_50
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_196,
+            split_197,
+        ) = einsum_148
+        del einsum_148
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_198,
+            split_199,
+        ) = einsum_149
+        del einsum_149
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_74 = paddle._C_ops.add(reshape_56, parameter_62)
+        del parameter_62
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_51 = [add_74, reshape_59]
+        del add_74, reshape_59
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_150, einsum_151, einsum_152 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_51, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_51
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_200,
+            split_201,
+        ) = einsum_151
+        del einsum_151
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_202,
+            split_203,
+        ) = einsum_152
+        del einsum_152
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(einsum_150, full_int_array_7)
+        del einsum_150
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_8 = paddle._C_ops.slice(
+            reshape_60, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_60
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_61 = paddle._C_ops.reshape(slice_8, full_int_array_9)
+        del slice_8
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_8 = paddle._C_ops.index_select(reshape_61, arange_2, 3)
+        del reshape_61
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_75 = paddle._C_ops.add(reshape_56, parameter_61)
+        del parameter_61, reshape_56
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_52 = [add_75, parameter_59]
+        del add_75, parameter_59
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_153, einsum_154, einsum_155 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_52, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_52
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_204,
+            split_205,
+        ) = einsum_154
+        del einsum_154
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_206,
+            split_207,
+        ) = einsum_155
+        del einsum_155
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_53 = [cast_5, einsum_153]
+        del einsum_153
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_156, einsum_157, einsum_158 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_53, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_53
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_208,
+            split_209,
+        ) = einsum_157
+        del einsum_157
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_210,
+            split_211,
+        ) = einsum_158
+        del einsum_158
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_76 = paddle._C_ops.add(einsum_147, index_select_8)
+        del einsum_147, index_select_8
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_77 = paddle._C_ops.add(add_76, einsum_156)
+        del add_76, einsum_156
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(add_77, full_16, float("0"), True)
+        del add_77
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_8 = paddle._C_ops.subtract(scale_12, scale_4)
+        del scale_12
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_8 = paddle._C_ops.softmax(subtract_8, 3)
+        del subtract_8
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_54 = [dropout_68, reshape_58]
+        del dropout_68, reshape_58
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_159, einsum_160, einsum_161 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_54, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_54
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_212,
+            split_213,
+        ) = einsum_160
+        del einsum_160
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_214,
+            split_215,
+        ) = einsum_161
+        del einsum_161
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_62 = paddle._C_ops.reshape(einsum_159, full_int_array_10)
+        del einsum_159
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_55 = [reshape_62, parameter_64]
+        del parameter_64, reshape_62
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_162, einsum_163, einsum_164 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_55, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_55
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_216,
+            split_217,
+        ) = einsum_163
+        del einsum_163
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_218,
+            split_219,
+        ) = einsum_164
+        del einsum_164
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_162, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_162
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_78 = paddle._C_ops.add(dropout_70, layer_norm_45)
+        del dropout_70, layer_norm_45
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_78, parameter_58, parameter_57, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_78, parameter_57, parameter_58
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_52 = paddle._C_ops.matmul(layer_norm_48, parameter_54, False, False)
+        del parameter_54
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_79 = paddle._C_ops.add(matmul_52, parameter_53)
+        del matmul_52, parameter_53
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_8 = paddle._C_ops.gelu(add_79, False)
+        del add_79
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_8
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_72, parameter_52, False, False)
+        del dropout_72, parameter_52
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_80 = paddle._C_ops.add(matmul_53, parameter_51)
+        del matmul_53, parameter_51
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_80, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_80
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_81 = paddle._C_ops.add(dropout_74, layer_norm_48)
+        del dropout_74, layer_norm_48
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_81, parameter_56, parameter_55, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_81, parameter_55, parameter_56
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(layer_norm_51, parameter_50, False, False)
+        del parameter_50
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_63 = paddle._C_ops.reshape(matmul_54, full_int_array_5)
+        del matmul_54
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_51, parameter_49, False, False)
+        del parameter_49
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(matmul_55, full_int_array_5)
+        del matmul_55
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_56 = paddle._C_ops.matmul(layer_norm_51, parameter_48, False, False)
+        del parameter_48
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_65 = paddle._C_ops.reshape(matmul_56, full_int_array_5)
+        del matmul_56
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_57 = paddle._C_ops.matmul(dropout_2, parameter_46, False, False)
+        del parameter_46
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(matmul_57, full_int_array_6)
+        del matmul_57
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_82 = paddle._C_ops.add(reshape_63, parameter_43)
+        del parameter_43
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_56 = [add_82, reshape_64]
+        del add_82, reshape_64
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_165, einsum_166, einsum_167 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_56, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_56
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_220,
+            split_221,
+        ) = einsum_166
+        del einsum_166
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_222,
+            split_223,
+        ) = einsum_167
+        del einsum_167
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_83 = paddle._C_ops.add(reshape_63, parameter_45)
+        del parameter_45
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_57 = [add_83, reshape_66]
+        del add_83, reshape_66
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_168, einsum_169, einsum_170 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_57, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_57
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_224,
+            split_225,
+        ) = einsum_169
+        del einsum_169
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_226,
+            split_227,
+        ) = einsum_170
+        del einsum_170
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_67 = paddle._C_ops.reshape(einsum_168, full_int_array_7)
+        del einsum_168
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_9 = paddle._C_ops.slice(
+            reshape_67, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_67
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(slice_9, full_int_array_9)
+        del slice_9
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_9 = paddle._C_ops.index_select(reshape_68, arange_2, 3)
+        del reshape_68
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_84 = paddle._C_ops.add(reshape_63, parameter_44)
+        del parameter_44, reshape_63
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_58 = [add_84, parameter_42]
+        del add_84, parameter_42
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_171, einsum_172, einsum_173 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_58, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_58
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_228,
+            split_229,
+        ) = einsum_172
+        del einsum_172
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_230,
+            split_231,
+        ) = einsum_173
+        del einsum_173
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_59 = [cast_5, einsum_171]
+        del einsum_171
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_174, einsum_175, einsum_176 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_59, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_59
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_232,
+            split_233,
+        ) = einsum_175
+        del einsum_175
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_234,
+            split_235,
+        ) = einsum_176
+        del einsum_176
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_85 = paddle._C_ops.add(einsum_165, index_select_9)
+        del einsum_165, index_select_9
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_86 = paddle._C_ops.add(add_85, einsum_174)
+        del add_85, einsum_174
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(add_86, full_16, float("0"), True)
+        del add_86
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_9 = paddle._C_ops.subtract(scale_13, scale_4)
+        del scale_13
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_9 = paddle._C_ops.softmax(subtract_9, 3)
+        del subtract_9
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_60 = [dropout_76, reshape_65]
+        del dropout_76, reshape_65
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_177, einsum_178, einsum_179 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_60, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_60
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_236,
+            split_237,
+        ) = einsum_178
+        del einsum_178
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_238,
+            split_239,
+        ) = einsum_179
+        del einsum_179
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_69 = paddle._C_ops.reshape(einsum_177, full_int_array_10)
+        del einsum_177
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_61 = [reshape_69, parameter_47]
+        del parameter_47, reshape_69
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_180, einsum_181, einsum_182 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_61, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_61
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_240,
+            split_241,
+        ) = einsum_181
+        del einsum_181
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_242,
+            split_243,
+        ) = einsum_182
+        del einsum_182
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_180, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_180
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_87 = paddle._C_ops.add(dropout_78, layer_norm_51)
+        del dropout_78, layer_norm_51
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_87, parameter_41, parameter_40, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_87, parameter_40, parameter_41
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_54, parameter_37, False, False)
+        del parameter_37
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_88 = paddle._C_ops.add(matmul_58, parameter_36)
+        del matmul_58, parameter_36
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_9 = paddle._C_ops.gelu(add_88, False)
+        del add_88
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_9
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_59 = paddle._C_ops.matmul(dropout_80, parameter_35, False, False)
+        del dropout_80, parameter_35
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_89 = paddle._C_ops.add(matmul_59, parameter_34)
+        del matmul_59, parameter_34
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_89, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_89
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_90 = paddle._C_ops.add(dropout_82, layer_norm_54)
+        del dropout_82, layer_norm_54
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_90, parameter_39, parameter_38, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_90, parameter_38, parameter_39
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_60 = paddle._C_ops.matmul(layer_norm_57, parameter_33, False, False)
+        del parameter_33
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(matmul_60, full_int_array_5)
+        del matmul_60
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_61 = paddle._C_ops.matmul(layer_norm_57, parameter_32, False, False)
+        del parameter_32
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_71 = paddle._C_ops.reshape(matmul_61, full_int_array_5)
+        del matmul_61
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(layer_norm_57, parameter_31, False, False)
+        del parameter_31
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_72 = paddle._C_ops.reshape(matmul_62, full_int_array_5)
+        del matmul_62
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_63 = paddle._C_ops.matmul(dropout_2, parameter_29, False, False)
+        del parameter_29
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_73 = paddle._C_ops.reshape(matmul_63, full_int_array_6)
+        del matmul_63
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_91 = paddle._C_ops.add(reshape_70, parameter_26)
+        del parameter_26
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_62 = [add_91, reshape_71]
+        del add_91, reshape_71
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_183, einsum_184, einsum_185 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_62, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_62
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_244,
+            split_245,
+        ) = einsum_184
+        del einsum_184
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_246,
+            split_247,
+        ) = einsum_185
+        del einsum_185
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_92 = paddle._C_ops.add(reshape_70, parameter_28)
+        del parameter_28
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_63 = [add_92, reshape_73]
+        del add_92, reshape_73
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_186, einsum_187, einsum_188 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_63, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_63
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_248,
+            split_249,
+        ) = einsum_187
+        del einsum_187
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_250,
+            split_251,
+        ) = einsum_188
+        del einsum_188
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_74 = paddle._C_ops.reshape(einsum_186, full_int_array_7)
+        del einsum_186
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_10 = paddle._C_ops.slice(
+            reshape_74, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_74
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_75 = paddle._C_ops.reshape(slice_10, full_int_array_9)
+        del slice_10
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_10 = paddle._C_ops.index_select(reshape_75, arange_2, 3)
+        del reshape_75
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_93 = paddle._C_ops.add(reshape_70, parameter_27)
+        del parameter_27, reshape_70
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_64 = [add_93, parameter_25]
+        del add_93, parameter_25
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_189, einsum_190, einsum_191 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_64, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_64
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_252,
+            split_253,
+        ) = einsum_190
+        del einsum_190
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_254,
+            split_255,
+        ) = einsum_191
+        del einsum_191
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_65 = [cast_5, einsum_189]
+        del einsum_189
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_192, einsum_193, einsum_194 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_65, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_65
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_256,
+            split_257,
+        ) = einsum_193
+        del einsum_193
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_258,
+            split_259,
+        ) = einsum_194
+        del einsum_194
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_94 = paddle._C_ops.add(einsum_183, index_select_10)
+        del einsum_183, index_select_10
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_95 = paddle._C_ops.add(add_94, einsum_192)
+        del add_94, einsum_192
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(add_95, full_16, float("0"), True)
+        del add_95
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_10 = paddle._C_ops.subtract(scale_14, scale_4)
+        del scale_14
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_10 = paddle._C_ops.softmax(subtract_10, 3)
+        del subtract_10
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_66 = [dropout_84, reshape_72]
+        del dropout_84, reshape_72
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_195, einsum_196, einsum_197 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_66, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_66
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_260,
+            split_261,
+        ) = einsum_196
+        del einsum_196
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_262,
+            split_263,
+        ) = einsum_197
+        del einsum_197
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_76 = paddle._C_ops.reshape(einsum_195, full_int_array_10)
+        del einsum_195
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_67 = [reshape_76, parameter_30]
+        del parameter_30, reshape_76
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_198, einsum_199, einsum_200 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_67, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_67
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_264,
+            split_265,
+        ) = einsum_199
+        del einsum_199
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_266,
+            split_267,
+        ) = einsum_200
+        del einsum_200
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_198, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_198
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_96 = paddle._C_ops.add(dropout_86, layer_norm_57)
+        del dropout_86, layer_norm_57
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_96, parameter_24, parameter_23, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_96, parameter_23, parameter_24
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_64 = paddle._C_ops.matmul(layer_norm_60, parameter_20, False, False)
+        del parameter_20
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_97 = paddle._C_ops.add(matmul_64, parameter_19)
+        del matmul_64, parameter_19
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_10 = paddle._C_ops.gelu(add_97, False)
+        del add_97
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_10
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_65 = paddle._C_ops.matmul(dropout_88, parameter_18, False, False)
+        del dropout_88, parameter_18
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_98 = paddle._C_ops.add(matmul_65, parameter_17)
+        del matmul_65, parameter_17
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_98, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_98
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_99 = paddle._C_ops.add(dropout_90, layer_norm_60)
+        del dropout_90, layer_norm_60
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_99, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_99, parameter_21, parameter_22
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_63, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_77 = paddle._C_ops.reshape(matmul_66, full_int_array_5)
+        del matmul_66
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_63, parameter_15, False, False)
+        del parameter_15
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_78 = paddle._C_ops.reshape(matmul_67, full_int_array_5)
+        del matmul_67
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x768xf32, 768x768xf32)
+        matmul_68 = paddle._C_ops.matmul(layer_norm_63, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.reshape: (22x1x12x64xf32) <- (22x1x768xf32, 4xi64)
+        reshape_79 = paddle._C_ops.reshape(matmul_68, full_int_array_5)
+        del full_int_array_5, matmul_68
+
+        # pd_op.matmul: (44x1x768xf32) <- (44x1x768xf32, 768x768xf32)
+        matmul_69 = paddle._C_ops.matmul(dropout_2, parameter_12, False, False)
+        del dropout_2, parameter_12
+
+        # pd_op.reshape: (44x1x12x64xf32) <- (44x1x768xf32, 4xi64)
+        reshape_80 = paddle._C_ops.reshape(matmul_69, full_int_array_6)
+        del full_int_array_6, matmul_69
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_100 = paddle._C_ops.add(reshape_77, parameter_9)
+        del parameter_9
+
+        # builtin.combine: ([22x1x12x64xf32, 22x1x12x64xf32]) <- (22x1x12x64xf32, 22x1x12x64xf32)
+        combine_68 = [add_100, reshape_78]
+        del add_100, reshape_78
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x1x12x64xf32, 22x1x12x64xf32]) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        einsum_201, einsum_202, einsum_203 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_68, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_68
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_268,
+            split_269,
+        ) = einsum_202
+        del einsum_202
+
+        # builtin.split: (22x1x12x64xf32, 22x1x12x64xf32) <- ([22x1x12x64xf32, 22x1x12x64xf32])
+        (
+            split_270,
+            split_271,
+        ) = einsum_203
+        del einsum_203
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_101 = paddle._C_ops.add(reshape_77, parameter_11)
+        del parameter_11
+
+        # builtin.combine: ([22x1x12x64xf32, 44x1x12x64xf32]) <- (22x1x12x64xf32, 44x1x12x64xf32)
+        combine_69 = [add_101, reshape_80]
+        del add_101, reshape_80
+
+        # pd_op.einsum: (1x12x22x44xf32, [0xf32, 0xf32], [22x1x12x64xf32, 44x1x12x64xf32]) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        einsum_204, einsum_205, einsum_206 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_69, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_69
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_272,
+            split_273,
+        ) = einsum_205
+        del einsum_205
+
+        # builtin.split: (22x1x12x64xf32, 44x1x12x64xf32) <- ([22x1x12x64xf32, 44x1x12x64xf32])
+        (
+            split_274,
+            split_275,
+        ) = einsum_206
+        del einsum_206
+
+        # pd_op.reshape: (1x12x44x22xf32) <- (1x12x22x44xf32, 4xi64)
+        reshape_81 = paddle._C_ops.reshape(einsum_204, full_int_array_7)
+        del einsum_204, full_int_array_7
+
+        # pd_op.slice: (1x12x43x22xf32) <- (1x12x44x22xf32, 1xi64, 1xi64)
+        slice_11 = paddle._C_ops.slice(
+            reshape_81, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del full_int_array_3, full_int_array_8, reshape_81
+
+        # pd_op.reshape: (1x12x22x43xf32) <- (1x12x43x22xf32, 4xi64)
+        reshape_82 = paddle._C_ops.reshape(slice_11, full_int_array_9)
+        del full_int_array_9, slice_11
+
+        # pd_op.index_select: (1x12x22x22xf32) <- (1x12x22x43xf32, 22xi64)
+        index_select_11 = paddle._C_ops.index_select(reshape_82, arange_2, 3)
+        del arange_2, reshape_82
+
+        # pd_op.add: (22x1x12x64xf32) <- (22x1x12x64xf32, 12x64xf32)
+        add_102 = paddle._C_ops.add(reshape_77, parameter_10)
+        del parameter_10, reshape_77
+
+        # builtin.combine: ([22x1x12x64xf32, 2x12x64xf32]) <- (22x1x12x64xf32, 2x12x64xf32)
+        combine_70 = [add_102, parameter_8]
+        del add_102, parameter_8
+
+        # pd_op.einsum: (22x1x12x2xf32, [0xf32, 0xf32], [22x1x12x64xf32, 2x12x64xf32]) <- ([22x1x12x64xf32, 2x12x64xf32])
+        einsum_207, einsum_208, einsum_209 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_70, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_70
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_276,
+            split_277,
+        ) = einsum_208
+        del einsum_208
+
+        # builtin.split: (22x1x12x64xf32, 2x12x64xf32) <- ([22x1x12x64xf32, 2x12x64xf32])
+        (
+            split_278,
+            split_279,
+        ) = einsum_209
+        del einsum_209
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x12x2xf32]) <- (22x22x1x2xf32, 22x1x12x2xf32)
+        combine_71 = [cast_5, einsum_207]
+        del cast_5, einsum_207
+
+        # pd_op.einsum: (1x12x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x12x2xf32]) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        einsum_210, einsum_211, einsum_212 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_71, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_71
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_280,
+            split_281,
+        ) = einsum_211
+        del einsum_211
+
+        # builtin.split: (22x22x1x2xf32, 22x1x12x2xf32) <- ([22x22x1x2xf32, 22x1x12x2xf32])
+        (
+            split_282,
+            split_283,
+        ) = einsum_212
+        del einsum_212
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_103 = paddle._C_ops.add(einsum_201, index_select_11)
+        del einsum_201, index_select_11
+
+        # pd_op.add: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x12x22x22xf32)
+        add_104 = paddle._C_ops.add(add_103, einsum_210)
+        del add_103, einsum_210
+
+        # pd_op.scale: (1x12x22x22xf32) <- (1x12x22x22xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(add_104, full_16, float("0"), True)
+        del add_104, full_16
+
+        # pd_op.subtract: (1x12x22x22xf32) <- (1x12x22x22xf32, 1x1x22x22xf32)
+        subtract_11 = paddle._C_ops.subtract(scale_15, scale_4)
+        del scale_15, scale_4
+
+        # pd_op.softmax: (1x12x22x22xf32) <- (1x12x22x22xf32)
+        softmax_11 = paddle._C_ops.softmax(subtract_11, 3)
+        del subtract_11
+
+        # pd_op.dropout: (1x12x22x22xf32, 1x12x22x22xui8) <- (1x12x22x22xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # builtin.combine: ([1x12x22x22xf32, 22x1x12x64xf32]) <- (1x12x22x22xf32, 22x1x12x64xf32)
+        combine_72 = [dropout_92, reshape_79]
+        del dropout_92, reshape_79
+
+        # pd_op.einsum: (22x1x12x64xf32, [0xf32, 0xf32], [1x12x22x22xf32, 22x1x12x64xf32]) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        einsum_213, einsum_214, einsum_215 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_72, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_72
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_284,
+            split_285,
+        ) = einsum_214
+        del einsum_214
+
+        # builtin.split: (1x12x22x22xf32, 22x1x12x64xf32) <- ([1x12x22x22xf32, 22x1x12x64xf32])
+        (
+            split_286,
+            split_287,
+        ) = einsum_215
+        del einsum_215
+
+        # pd_op.reshape: (22x1x768xf32) <- (22x1x12x64xf32, 3xi64)
+        reshape_83 = paddle._C_ops.reshape(einsum_213, full_int_array_10)
+        del einsum_213, full_int_array_10
+
+        # builtin.combine: ([22x1x768xf32, 768x768xf32]) <- (22x1x768xf32, 768x768xf32)
+        combine_73 = [reshape_83, parameter_13]
+        del parameter_13, reshape_83
+
+        # pd_op.einsum: (22x1x768xf32, [0xf32, 0xf32], [22x1x768xf32, 768x768xf32]) <- ([22x1x768xf32, 768x768xf32])
+        einsum_216, einsum_217, einsum_218 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_73, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_73
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_288,
+            split_289,
+        ) = einsum_217
+        del einsum_217
+
+        # builtin.split: (22x1x768xf32, 768x768xf32) <- ([22x1x768xf32, 768x768xf32])
+        (
+            split_290,
+            split_291,
+        ) = einsum_218
+        del einsum_218
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_216, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_216
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_105 = paddle._C_ops.add(dropout_94, layer_norm_63)
+        del dropout_94, layer_norm_63
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_105, parameter_7, parameter_6, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_105, parameter_6, parameter_7
+
+        # pd_op.matmul: (22x1x3072xf32) <- (22x1x768xf32, 768x3072xf32)
+        matmul_70 = paddle._C_ops.matmul(layer_norm_66, parameter_3, False, False)
+        del parameter_3
+
+        # pd_op.add: (22x1x3072xf32) <- (22x1x3072xf32, 3072xf32)
+        add_106 = paddle._C_ops.add(matmul_70, parameter_2)
+        del matmul_70, parameter_2
+
+        # pd_op.gelu: (22x1x3072xf32) <- (22x1x3072xf32)
+        gelu_11 = paddle._C_ops.gelu(add_106, False)
+        del add_106
+
+        # pd_op.dropout: (22x1x3072xf32, 22x1x3072xui8) <- (22x1x3072xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_11
+
+        # pd_op.matmul: (22x1x768xf32) <- (22x1x3072xf32, 3072x768xf32)
+        matmul_71 = paddle._C_ops.matmul(dropout_96, parameter_1, False, False)
+        del dropout_96, parameter_1
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 768xf32)
+        add_107 = paddle._C_ops.add(matmul_71, parameter_0)
+        del matmul_71, parameter_0
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_107, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_107
+
+        # pd_op.add: (22x1x768xf32) <- (22x1x768xf32, 22x1x768xf32)
+        add_108 = paddle._C_ops.add(dropout_98, layer_norm_66)
+        del dropout_98, layer_norm_66
+
+        # pd_op.layer_norm: (22x1x768xf32, 22x1xf32, 22x1xf32) <- (22x1x768xf32, 768xf32, 768xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_108, parameter_5, parameter_4, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_108, parameter_4, parameter_5
+
+        # pd_op.dropout: (22x1x768xf32, 22x1x768xui8) <- (22x1x768xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_69, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del full_3, layer_norm_69
+
+        # pd_op.transpose: (1x22x768xf32) <- (22x1x768xf32)
+        transpose_0 = paddle._C_ops.transpose(dropout_100, [1, 0, 2])
+        del dropout_100
+
+        return transpose_0
diff --git a/paddle_samples/PaddleNLP/xlnet-base-cased/weight_meta.py b/paddle_samples/PaddleNLP/xlnet-base-cased/weight_meta.py
new file mode 100644
index 000000000..cd5f57ddb
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-base-cased/weight_meta.py
@@ -0,0 +1,2048 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.107443")
+    max_val = float("0.0964704")
+    mean = float("1.56014e-05")
+    std = float("0.0199908")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.101696")
+    max_val = float("0.0937056")
+    mean = float("-5.01245e-06")
+    std = float("0.020003")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0700854")
+    max_val = float("0.0698896")
+    mean = float("0.000221964")
+    std = float("0.0206341")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0657833")
+    max_val = float("0.0554211")
+    mean = float("-0.000644022")
+    std = float("0.0201863")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0583059")
+    max_val = float("0.0721462")
+    mean = float("-0.000222992")
+    std = float("0.02004")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0540794")
+    max_val = float("0.0620946")
+    mean = float("0.000534411")
+    std = float("0.0202118")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100007")
+    max_val = float("0.0982943")
+    mean = float("-5.16948e-05")
+    std = float("0.0200102")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102099")
+    max_val = float("0.0956306")
+    mean = float("-9.37182e-06")
+    std = float("0.0199744")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.097491")
+    max_val = float("0.0942033")
+    mean = float("-1.36115e-06")
+    std = float("0.0199895")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0922146")
+    max_val = float("0.0997002")
+    mean = float("-2.24828e-05")
+    std = float("0.0200103")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0922014")
+    max_val = float("0.0950936")
+    mean = float("-1.00351e-05")
+    std = float("0.0200077")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.114515")
+    max_val = float("0.102741")
+    mean = float("1.28552e-05")
+    std = float("0.0199976")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.103737")
+    max_val = float("0.0964233")
+    mean = float("-3.63885e-05")
+    std = float("0.0199944")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0650684")
+    max_val = float("0.0546684")
+    mean = float("0.000543553")
+    std = float("0.0197757")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0639014")
+    max_val = float("0.0613068")
+    mean = float("0.000632318")
+    std = float("0.0199362")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0533739")
+    max_val = float("0.0662696")
+    mean = float("0.000393619")
+    std = float("0.0197884")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0806417")
+    max_val = float("0.0568907")
+    mean = float("1.81441e-05")
+    std = float("0.0199952")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0925241")
+    max_val = float("0.0936963")
+    mean = float("1.56955e-05")
+    std = float("0.0199829")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0896455")
+    max_val = float("0.0957685")
+    mean = float("-2.93209e-05")
+    std = float("0.0200303")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100002")
+    max_val = float("0.0897565")
+    mean = float("-3.3829e-05")
+    std = float("0.0199949")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.087256")
+    max_val = float("0.0921711")
+    mean = float("-8.63745e-06")
+    std = float("0.0199731")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0901312")
+    max_val = float("0.0977029")
+    mean = float("1.4295e-05")
+    std = float("0.0199949")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0948151")
+    max_val = float("0.0944873")
+    mean = float("-1.0718e-06")
+    std = float("0.0200009")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0973458")
+    max_val = float("0.101163")
+    mean = float("-7.36939e-08")
+    std = float("0.0199702")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.06053")
+    max_val = float("0.0605476")
+    mean = float("0.000147032")
+    std = float("0.0192807")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0577485")
+    max_val = float("0.0631016")
+    mean = float("0.000978126")
+    std = float("0.0200149")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0632318")
+    max_val = float("0.0590489")
+    mean = float("-0.000117242")
+    std = float("0.0197921")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.060267")
+    max_val = float("0.0617522")
+    mean = float("-0.000553793")
+    std = float("0.0198732")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949403")
+    max_val = float("0.0921171")
+    mean = float("3.01454e-05")
+    std = float("0.0200084")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0943445")
+    max_val = float("0.0932839")
+    mean = float("-1.06599e-05")
+    std = float("0.0199709")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0926475")
+    max_val = float("0.0971318")
+    mean = float("-2.8691e-05")
+    std = float("0.0199919")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0970141")
+    max_val = float("0.0993287")
+    mean = float("2.19616e-05")
+    std = float("0.0200119")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.104719")
+    max_val = float("0.0961093")
+    mean = float("4.22517e-05")
+    std = float("0.0199673")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.100468")
+    max_val = float("0.0950249")
+    mean = float("-4.33319e-07")
+    std = float("0.0199887")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0980237")
+    max_val = float("0.0971189")
+    mean = float("3.46093e-05")
+    std = float("0.0200024")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0592654")
+    max_val = float("0.067405")
+    mean = float("4.66489e-05")
+    std = float("0.0200963")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0716498")
+    max_val = float("0.064698")
+    mean = float("0.000369003")
+    std = float("0.0198631")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0665162")
+    max_val = float("0.0689733")
+    mean = float("0.00104902")
+    std = float("0.0202639")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0621085")
+    max_val = float("0.0697879")
+    mean = float("0.000479245")
+    std = float("0.0200544")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.103067")
+    max_val = float("0.095989")
+    mean = float("1.96233e-05")
+    std = float("0.0199981")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0913213")
+    max_val = float("0.0886865")
+    mean = float("-1.91935e-05")
+    std = float("0.0200027")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.105297")
+    max_val = float("0.0904919")
+    mean = float("-4.00913e-05")
+    std = float("0.020032")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0922652")
+    max_val = float("0.10427")
+    mean = float("-9.49519e-06")
+    std = float("0.0200073")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0972453")
+    max_val = float("0.0973256")
+    mean = float("-9.09023e-06")
+    std = float("0.0199783")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.106309")
+    max_val = float("0.105607")
+    mean = float("1.10291e-05")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0975573")
+    max_val = float("0.0966081")
+    mean = float("-1.16121e-05")
+    std = float("0.0200024")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0568817")
+    max_val = float("0.0863073")
+    mean = float("-1.90975e-05")
+    std = float("0.0193294")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0652397")
+    max_val = float("0.0550323")
+    mean = float("-0.000326906")
+    std = float("0.0202795")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0611418")
+    max_val = float("0.0689744")
+    mean = float("0.000107098")
+    std = float("0.0206267")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0615758")
+    max_val = float("0.056214")
+    mean = float("-0.000222315")
+    std = float("0.0196349")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.103743")
+    max_val = float("0.0985834")
+    mean = float("7.308e-06")
+    std = float("0.0199702")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0918515")
+    max_val = float("0.0900846")
+    mean = float("-5.87464e-05")
+    std = float("0.0199767")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0912084")
+    max_val = float("0.0985826")
+    mean = float("-3.34139e-06")
+    std = float("0.0200063")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0894189")
+    max_val = float("0.0908116")
+    mean = float("2.44833e-05")
+    std = float("0.0199879")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0945442")
+    max_val = float("0.094868")
+    mean = float("-1.32895e-05")
+    std = float("0.019989")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.100435")
+    max_val = float("0.0962955")
+    mean = float("9.07234e-06")
+    std = float("0.0199853")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0965716")
+    max_val = float("0.0993747")
+    mean = float("-1.30958e-05")
+    std = float("0.0200061")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0606395")
+    max_val = float("0.0643096")
+    mean = float("0.000744228")
+    std = float("0.0200752")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0645655")
+    max_val = float("0.060837")
+    mean = float("-0.00195379")
+    std = float("0.0194921")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0606664")
+    max_val = float("0.057889")
+    mean = float("0.00184684")
+    std = float("0.0198158")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.065119")
+    max_val = float("0.062255")
+    mean = float("0.000553101")
+    std = float("0.0194296")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0937386")
+    max_val = float("0.100026")
+    mean = float("2.03894e-05")
+    std = float("0.0199983")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0962021")
+    max_val = float("0.0924932")
+    mean = float("-2.08112e-05")
+    std = float("0.0199941")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100484")
+    max_val = float("0.0937384")
+    mean = float("7.2557e-06")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0947122")
+    max_val = float("0.0929743")
+    mean = float("-4.40165e-06")
+    std = float("0.0200234")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0993096")
+    max_val = float("0.100981")
+    mean = float("-1.43578e-05")
+    std = float("0.0199956")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.105355")
+    max_val = float("0.0954512")
+    mean = float("-7.89531e-06")
+    std = float("0.019986")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0976812")
+    max_val = float("0.0994407")
+    mean = float("-2.10747e-05")
+    std = float("0.0200068")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0729831")
+    max_val = float("0.0634138")
+    mean = float("-3.56534e-05")
+    std = float("0.0201453")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0880961")
+    max_val = float("0.0590779")
+    mean = float("0.00160808")
+    std = float("0.02059")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0665995")
+    max_val = float("0.0610122")
+    mean = float("-0.000392904")
+    std = float("0.0201161")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0674377")
+    max_val = float("0.0548331")
+    mean = float("9.5121e-05")
+    std = float("0.0202969")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.106774")
+    max_val = float("0.0950007")
+    mean = float("-3.00636e-05")
+    std = float("0.0200442")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.103048")
+    max_val = float("0.0974949")
+    mean = float("-6.42821e-06")
+    std = float("0.0199987")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101272")
+    max_val = float("0.0955136")
+    mean = float("1.79861e-05")
+    std = float("0.0200075")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0916424")
+    max_val = float("0.101734")
+    mean = float("4.76916e-07")
+    std = float("0.0199606")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0961003")
+    max_val = float("0.0940374")
+    mean = float("-6.02494e-06")
+    std = float("0.0199904")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0973096")
+    max_val = float("0.0955032")
+    mean = float("6.05686e-06")
+    std = float("0.0200104")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.104251")
+    max_val = float("0.0985613")
+    mean = float("-4.24987e-06")
+    std = float("0.019993")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0781129")
+    max_val = float("0.0686083")
+    mean = float("-0.000277737")
+    std = float("0.019889")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0549032")
+    max_val = float("0.0549252")
+    mean = float("0.000373617")
+    std = float("0.0193584")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0671851")
+    max_val = float("0.0564571")
+    mean = float("-0.000553835")
+    std = float("0.0207093")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0684723")
+    max_val = float("0.0828474")
+    mean = float("0.000446164")
+    std = float("0.020192")
+    data = None
+
+
+class Program_weight_tensor_parameter_131:
+    name = "parameter_131"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.093358")
+    max_val = float("0.0928076")
+    mean = float("-2.09587e-05")
+    std = float("0.0199891")
+    data = None
+
+
+class Program_weight_tensor_parameter_132:
+    name = "parameter_132"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0949628")
+    max_val = float("0.0958047")
+    mean = float("-2.08746e-05")
+    std = float("0.0200089")
+    data = None
+
+
+class Program_weight_tensor_parameter_133:
+    name = "parameter_133"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.094868")
+    max_val = float("0.0951138")
+    mean = float("1.27738e-05")
+    std = float("0.0199868")
+    data = None
+
+
+class Program_weight_tensor_parameter_134:
+    name = "parameter_134"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0953018")
+    max_val = float("0.0948122")
+    mean = float("5.91242e-06")
+    std = float("0.0199676")
+    data = None
+
+
+class Program_weight_tensor_parameter_135:
+    name = "parameter_135"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0918979")
+    max_val = float("0.0891179")
+    mean = float("-3.84901e-05")
+    std = float("0.019996")
+    data = None
+
+
+class Program_weight_tensor_parameter_136:
+    name = "parameter_136"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_137:
+    name = "parameter_137"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0960438")
+    max_val = float("0.102767")
+    mean = float("-6.15459e-06")
+    std = float("0.0200163")
+    data = None
+
+
+class Program_weight_tensor_parameter_138:
+    name = "parameter_138"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_139:
+    name = "parameter_139"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0998533")
+    max_val = float("0.100147")
+    mean = float("1.70663e-06")
+    std = float("0.0200142")
+    data = None
+
+
+class Program_weight_tensor_parameter_140:
+    name = "parameter_140"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_141:
+    name = "parameter_141"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_142:
+    name = "parameter_142"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_143:
+    name = "parameter_143"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_144:
+    name = "parameter_144"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0589635")
+    max_val = float("0.0621262")
+    mean = float("0.00104112")
+    std = float("0.0198215")
+    data = None
+
+
+class Program_weight_tensor_parameter_145:
+    name = "parameter_145"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0600285")
+    max_val = float("0.0777009")
+    mean = float("-0.000173019")
+    std = float("0.0207801")
+    data = None
+
+
+class Program_weight_tensor_parameter_146:
+    name = "parameter_146"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0604432")
+    max_val = float("0.0553187")
+    mean = float("-0.000111711")
+    std = float("0.0195423")
+    data = None
+
+
+class Program_weight_tensor_parameter_147:
+    name = "parameter_147"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.056655")
+    max_val = float("0.0710528")
+    mean = float("-0.000181629")
+    std = float("0.0202749")
+    data = None
+
+
+class Program_weight_tensor_parameter_148:
+    name = "parameter_148"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0920498")
+    max_val = float("0.0956225")
+    mean = float("-3.74604e-05")
+    std = float("0.0199711")
+    data = None
+
+
+class Program_weight_tensor_parameter_149:
+    name = "parameter_149"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0965613")
+    max_val = float("0.0990465")
+    mean = float("2.11627e-05")
+    std = float("0.0199962")
+    data = None
+
+
+class Program_weight_tensor_parameter_150:
+    name = "parameter_150"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0878856")
+    max_val = float("0.089544")
+    mean = float("-3.45358e-05")
+    std = float("0.0200235")
+    data = None
+
+
+class Program_weight_tensor_parameter_151:
+    name = "parameter_151"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0932969")
+    max_val = float("0.0873382")
+    mean = float("1.34506e-05")
+    std = float("0.019995")
+    data = None
+
+
+class Program_weight_tensor_parameter_152:
+    name = "parameter_152"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0937826")
+    max_val = float("0.0977121")
+    mean = float("-3.7703e-05")
+    std = float("0.0200068")
+    data = None
+
+
+class Program_weight_tensor_parameter_153:
+    name = "parameter_153"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_154:
+    name = "parameter_154"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0980782")
+    max_val = float("0.0965137")
+    mean = float("2.04605e-06")
+    std = float("0.0200081")
+    data = None
+
+
+class Program_weight_tensor_parameter_155:
+    name = "parameter_155"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_156:
+    name = "parameter_156"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0943608")
+    max_val = float("0.0971133")
+    mean = float("-6.8108e-06")
+    std = float("0.0200112")
+    data = None
+
+
+class Program_weight_tensor_parameter_157:
+    name = "parameter_157"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_158:
+    name = "parameter_158"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_159:
+    name = "parameter_159"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_160:
+    name = "parameter_160"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_161:
+    name = "parameter_161"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0611639")
+    max_val = float("0.0706575")
+    mean = float("-0.000616581")
+    std = float("0.0195134")
+    data = None
+
+
+class Program_weight_tensor_parameter_162:
+    name = "parameter_162"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0601015")
+    max_val = float("0.0596957")
+    mean = float("-0.00045851")
+    std = float("0.0206736")
+    data = None
+
+
+class Program_weight_tensor_parameter_163:
+    name = "parameter_163"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0710343")
+    max_val = float("0.0623504")
+    mean = float("0.000134263")
+    std = float("0.020837")
+    data = None
+
+
+class Program_weight_tensor_parameter_164:
+    name = "parameter_164"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0560268")
+    max_val = float("0.0666249")
+    mean = float("0.000222676")
+    std = float("0.0204334")
+    data = None
+
+
+class Program_weight_tensor_parameter_165:
+    name = "parameter_165"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.102647")
+    max_val = float("0.0960256")
+    mean = float("-1.14217e-05")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_166:
+    name = "parameter_166"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0977201")
+    max_val = float("0.0934928")
+    mean = float("-7.01381e-05")
+    std = float("0.0199996")
+    data = None
+
+
+class Program_weight_tensor_parameter_167:
+    name = "parameter_167"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0908481")
+    max_val = float("0.0940214")
+    mean = float("2.29566e-05")
+    std = float("0.0199939")
+    data = None
+
+
+class Program_weight_tensor_parameter_168:
+    name = "parameter_168"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0902858")
+    max_val = float("0.102385")
+    mean = float("-1.86941e-06")
+    std = float("0.0200058")
+    data = None
+
+
+class Program_weight_tensor_parameter_169:
+    name = "parameter_169"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0961338")
+    max_val = float("0.104653")
+    mean = float("2.49637e-05")
+    std = float("0.0200507")
+    data = None
+
+
+class Program_weight_tensor_parameter_170:
+    name = "parameter_170"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_171:
+    name = "parameter_171"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.101178")
+    max_val = float("0.0997435")
+    mean = float("-1.84101e-06")
+    std = float("0.0199933")
+    data = None
+
+
+class Program_weight_tensor_parameter_172:
+    name = "parameter_172"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_173:
+    name = "parameter_173"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0986081")
+    max_val = float("0.0935717")
+    mean = float("-7.97306e-06")
+    std = float("0.0199965")
+    data = None
+
+
+class Program_weight_tensor_parameter_174:
+    name = "parameter_174"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_175:
+    name = "parameter_175"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_176:
+    name = "parameter_176"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_177:
+    name = "parameter_177"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_178:
+    name = "parameter_178"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0756449")
+    max_val = float("0.0717684")
+    mean = float("0.000543476")
+    std = float("0.0199753")
+    data = None
+
+
+class Program_weight_tensor_parameter_179:
+    name = "parameter_179"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0529804")
+    max_val = float("0.0627784")
+    mean = float("0.000439584")
+    std = float("0.0196506")
+    data = None
+
+
+class Program_weight_tensor_parameter_180:
+    name = "parameter_180"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0617306")
+    max_val = float("0.0535067")
+    mean = float("-0.000375348")
+    std = float("0.0199388")
+    data = None
+
+
+class Program_weight_tensor_parameter_181:
+    name = "parameter_181"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0633299")
+    max_val = float("0.0582609")
+    mean = float("0.000747421")
+    std = float("0.0194937")
+    data = None
+
+
+class Program_weight_tensor_parameter_182:
+    name = "parameter_182"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.106269")
+    max_val = float("0.0909546")
+    mean = float("2.20117e-05")
+    std = float("0.0199926")
+    data = None
+
+
+class Program_weight_tensor_parameter_183:
+    name = "parameter_183"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.101218")
+    max_val = float("0.0890308")
+    mean = float("-3.23903e-05")
+    std = float("0.0199803")
+    data = None
+
+
+class Program_weight_tensor_parameter_184:
+    name = "parameter_184"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0932448")
+    max_val = float("0.0936962")
+    mean = float("1.98542e-05")
+    std = float("0.0200273")
+    data = None
+
+
+class Program_weight_tensor_parameter_185:
+    name = "parameter_185"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0983727")
+    max_val = float("0.0921061")
+    mean = float("3.60788e-06")
+    std = float("0.020025")
+    data = None
+
+
+class Program_weight_tensor_parameter_186:
+    name = "parameter_186"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0954569")
+    max_val = float("0.0943231")
+    mean = float("-1.62991e-05")
+    std = float("0.0200094")
+    data = None
+
+
+class Program_weight_tensor_parameter_187:
+    name = "parameter_187"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_188:
+    name = "parameter_188"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0996068")
+    max_val = float("0.102469")
+    mean = float("1.54505e-05")
+    std = float("0.0200034")
+    data = None
+
+
+class Program_weight_tensor_parameter_189:
+    name = "parameter_189"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_190:
+    name = "parameter_190"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.0980855")
+    max_val = float("0.104687")
+    mean = float("-1.86281e-05")
+    std = float("0.0199991")
+    data = None
+
+
+class Program_weight_tensor_parameter_191:
+    name = "parameter_191"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_192:
+    name = "parameter_192"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_193:
+    name = "parameter_193"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_194:
+    name = "parameter_194"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_195:
+    name = "parameter_195"
+    shape = [2, 12, 64]
+    dtype = "float32"
+    min_val = float("-0.0611005")
+    max_val = float("0.0627168")
+    mean = float("0.000420174")
+    std = float("0.0202319")
+    data = None
+
+
+class Program_weight_tensor_parameter_196:
+    name = "parameter_196"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0615742")
+    max_val = float("0.0505877")
+    mean = float("-0.000225133")
+    std = float("0.0195975")
+    data = None
+
+
+class Program_weight_tensor_parameter_197:
+    name = "parameter_197"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0751916")
+    max_val = float("0.0657023")
+    mean = float("0.00057308")
+    std = float("0.0197271")
+    data = None
+
+
+class Program_weight_tensor_parameter_198:
+    name = "parameter_198"
+    shape = [12, 64]
+    dtype = "float32"
+    min_val = float("-0.0620252")
+    max_val = float("0.0779363")
+    mean = float("-0.000147361")
+    std = float("0.0197645")
+    data = None
+
+
+class Program_weight_tensor_parameter_199:
+    name = "parameter_199"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.10821")
+    max_val = float("0.0981826")
+    mean = float("-2.55125e-05")
+    std = float("0.0199975")
+    data = None
+
+
+class Program_weight_tensor_parameter_200:
+    name = "parameter_200"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0960096")
+    max_val = float("0.107631")
+    mean = float("-2.79243e-06")
+    std = float("0.0200059")
+    data = None
+
+
+class Program_weight_tensor_parameter_201:
+    name = "parameter_201"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.100199")
+    max_val = float("0.0976366")
+    mean = float("4.94114e-05")
+    std = float("0.0200341")
+    data = None
+
+
+class Program_weight_tensor_parameter_202:
+    name = "parameter_202"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0939321")
+    max_val = float("0.0938979")
+    mean = float("3.61011e-06")
+    std = float("0.0200137")
+    data = None
+
+
+class Program_weight_tensor_parameter_203:
+    name = "parameter_203"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.093686")
+    max_val = float("0.0974261")
+    mean = float("-2.64717e-05")
+    std = float("0.020049")
+    data = None
+
+
+class Program_weight_tensor_parameter_204:
+    name = "parameter_204"
+    shape = [32000, 768]
+    dtype = "float32"
+    min_val = float("-0.104007")
+    max_val = float("0.108997")
+    mean = float("-6.62682e-06")
+    std = float("0.02")
+    data = None
+
+
+class Program_weight_tensor_parameter_205:
+    name = "parameter_205"
+    shape = [1, 1, 768]
+    dtype = "float32"
+    min_val = float("-0.0602759")
+    max_val = float("0.0636703")
+    mean = float("0.00026458")
+    std = float("0.0204492")
+    data = None
diff --git a/paddle_samples/PaddleNLP/xlnet-large-cased/graph_net.json b/paddle_samples/PaddleNLP/xlnet-large-cased/graph_net.json
new file mode 100644
index 000000000..04fb8d8f3
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-large-cased/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "xlnet-large-cased",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/xlnet-large-cased/input_meta.py b/paddle_samples/PaddleNLP/xlnet-large-cased/input_meta.py
new file mode 100644
index 000000000..feae33c5c
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-large-cased/input_meta.py
@@ -0,0 +1,42 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 22]
+    dtype = "int64"
+    data = [
+        17,
+        11368,
+        19,
+        94,
+        304,
+        27,
+        2656,
+        9,
+        35,
+        569,
+        1899,
+        75,
+        392,
+        1243,
+        2626,
+        21,
+        58,
+        4797,
+        23,
+        9,
+        4,
+        3,
+    ]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 22]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 22]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
diff --git a/paddle_samples/PaddleNLP/xlnet-large-cased/model.py b/paddle_samples/PaddleNLP/xlnet-large-cased/model.py
new file mode 100644
index 000000000..35160f7f8
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-large-cased/model.py
@@ -0,0 +1,8389 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        parameter_131,
+        parameter_132,
+        parameter_133,
+        parameter_134,
+        parameter_135,
+        parameter_136,
+        parameter_137,
+        parameter_138,
+        parameter_139,
+        parameter_140,
+        parameter_141,
+        parameter_142,
+        parameter_143,
+        parameter_144,
+        parameter_145,
+        parameter_146,
+        parameter_147,
+        parameter_148,
+        parameter_149,
+        parameter_150,
+        parameter_151,
+        parameter_152,
+        parameter_153,
+        parameter_154,
+        parameter_155,
+        parameter_156,
+        parameter_157,
+        parameter_158,
+        parameter_159,
+        parameter_160,
+        parameter_161,
+        parameter_162,
+        parameter_163,
+        parameter_164,
+        parameter_165,
+        parameter_166,
+        parameter_167,
+        parameter_168,
+        parameter_169,
+        parameter_170,
+        parameter_171,
+        parameter_172,
+        parameter_173,
+        parameter_174,
+        parameter_175,
+        parameter_176,
+        parameter_177,
+        parameter_178,
+        parameter_179,
+        parameter_180,
+        parameter_181,
+        parameter_182,
+        parameter_183,
+        parameter_184,
+        parameter_185,
+        parameter_186,
+        parameter_187,
+        parameter_188,
+        parameter_189,
+        parameter_190,
+        parameter_191,
+        parameter_192,
+        parameter_193,
+        parameter_194,
+        parameter_195,
+        parameter_196,
+        parameter_197,
+        parameter_198,
+        parameter_199,
+        parameter_200,
+        parameter_201,
+        parameter_202,
+        parameter_203,
+        parameter_204,
+        parameter_205,
+        parameter_206,
+        parameter_207,
+        parameter_208,
+        parameter_209,
+        parameter_210,
+        parameter_211,
+        parameter_212,
+        parameter_213,
+        parameter_214,
+        parameter_215,
+        parameter_216,
+        parameter_217,
+        parameter_218,
+        parameter_219,
+        parameter_220,
+        parameter_221,
+        parameter_222,
+        parameter_223,
+        parameter_224,
+        parameter_225,
+        parameter_226,
+        parameter_227,
+        parameter_228,
+        parameter_229,
+        parameter_230,
+        parameter_231,
+        parameter_232,
+        parameter_233,
+        parameter_234,
+        parameter_235,
+        parameter_236,
+        parameter_237,
+        parameter_238,
+        parameter_239,
+        parameter_240,
+        parameter_241,
+        parameter_242,
+        parameter_243,
+        parameter_244,
+        parameter_245,
+        parameter_246,
+        parameter_247,
+        parameter_248,
+        parameter_249,
+        parameter_250,
+        parameter_251,
+        parameter_252,
+        parameter_253,
+        parameter_254,
+        parameter_255,
+        parameter_256,
+        parameter_257,
+        parameter_258,
+        parameter_259,
+        parameter_260,
+        parameter_261,
+        parameter_262,
+        parameter_263,
+        parameter_264,
+        parameter_265,
+        parameter_266,
+        parameter_267,
+        parameter_268,
+        parameter_269,
+        parameter_270,
+        parameter_271,
+        parameter_272,
+        parameter_273,
+        parameter_274,
+        parameter_275,
+        parameter_276,
+        parameter_277,
+        parameter_278,
+        parameter_279,
+        parameter_280,
+        parameter_281,
+        parameter_282,
+        parameter_283,
+        parameter_284,
+        parameter_285,
+        parameter_286,
+        parameter_287,
+        parameter_288,
+        parameter_289,
+        parameter_290,
+        parameter_291,
+        parameter_292,
+        parameter_293,
+        parameter_294,
+        parameter_295,
+        parameter_296,
+        parameter_297,
+        parameter_298,
+        parameter_299,
+        parameter_300,
+        parameter_301,
+        parameter_302,
+        parameter_303,
+        parameter_304,
+        parameter_305,
+        parameter_306,
+        parameter_307,
+        parameter_308,
+        parameter_309,
+        parameter_310,
+        parameter_311,
+        parameter_312,
+        parameter_313,
+        parameter_314,
+        parameter_315,
+        parameter_316,
+        parameter_317,
+        parameter_318,
+        parameter_319,
+        parameter_320,
+        parameter_321,
+        parameter_322,
+        parameter_323,
+        parameter_324,
+        parameter_325,
+        parameter_326,
+        parameter_327,
+        parameter_328,
+        parameter_329,
+        parameter_330,
+        parameter_331,
+        parameter_332,
+        parameter_333,
+        parameter_334,
+        parameter_335,
+        parameter_336,
+        parameter_337,
+        parameter_338,
+        parameter_339,
+        parameter_340,
+        parameter_341,
+        parameter_342,
+        parameter_343,
+        parameter_344,
+        parameter_345,
+        parameter_346,
+        parameter_347,
+        parameter_348,
+        parameter_349,
+        parameter_350,
+        parameter_351,
+        parameter_352,
+        parameter_353,
+        parameter_354,
+        parameter_355,
+        parameter_356,
+        parameter_357,
+        parameter_358,
+        parameter_359,
+        parameter_360,
+        parameter_361,
+        parameter_362,
+        parameter_363,
+        parameter_364,
+        parameter_365,
+        parameter_366,
+        parameter_367,
+        parameter_368,
+        parameter_369,
+        parameter_370,
+        parameter_371,
+        parameter_372,
+        parameter_373,
+        parameter_374,
+        parameter_375,
+        parameter_376,
+        parameter_377,
+        parameter_378,
+        parameter_379,
+        parameter_380,
+        parameter_381,
+        parameter_382,
+        parameter_383,
+        parameter_384,
+        parameter_385,
+        parameter_386,
+        parameter_387,
+        parameter_388,
+        parameter_389,
+        parameter_390,
+        parameter_391,
+        parameter_392,
+        parameter_393,
+        parameter_394,
+        parameter_395,
+        parameter_396,
+        parameter_397,
+        parameter_398,
+        parameter_399,
+        parameter_400,
+        parameter_401,
+        parameter_402,
+        parameter_403,
+        parameter_404,
+        parameter_405,
+        parameter_406,
+        parameter_407,
+        parameter_408,
+        parameter_409,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.transpose: (22x1xi64) <- (1x22xi64)
+        transpose_1 = paddle._C_ops.transpose(data_0, [1, 0])
+        del data_0
+
+        # pd_op.transpose: (22x1xi64) <- (1x22xi64)
+        transpose_2 = paddle._C_ops.transpose(data_1, [1, 0])
+        del data_1
+
+        # pd_op.transpose: (22x1xi64) <- (1x22xi64)
+        transpose_3 = paddle._C_ops.transpose(data_2, [1, 0])
+        del data_2
+
+        # pd_op.cast: (22x1xf32) <- (22x1xi64)
+        cast_0 = paddle._C_ops.cast(transpose_3, paddle.float32)
+        del transpose_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (22x1xf32) <- (22x1xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [0]
+
+        # pd_op.unsqueeze: (1x22x1xf32) <- (22x1xf32, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(scale_0, full_int_array_0)
+        del scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.unsqueeze: (1x22x1x1xf32) <- (1x22x1xf32, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.full: (xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [], float("0"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.greater_than: (1x22x1x1xb) <- (1x22x1x1xf32, xf32)
+        greater_than_0 = paddle._C_ops.greater_than(unsqueeze_1, full_1)
+        del unsqueeze_1
+
+        # pd_op.cast: (1x22x1x1xf32) <- (1x22x1x1xb)
+        cast_1 = paddle._C_ops.cast(greater_than_0, paddle.float32)
+        del greater_than_0
+
+        # pd_op.full: (22xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [22], float("1"), paddle.float32, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.diag: (22x22xf32) <- (22xf32)
+        diag_0 = paddle._C_ops.diag(full_2, 0, float("0"))
+        del full_2
+
+        # pd_op.scale: (22x22xf32) <- (22x22xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(diag_0, full_0, float("0"), True)
+        del diag_0, full_0
+
+        # pd_op.cast: (22x22xf32) <- (22x22xf32)
+        cast_2 = paddle._C_ops.cast(scale_1, paddle.float32)
+        del scale_1
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_2 = [2, 3]
+
+        # pd_op.unsqueeze: (22x22x1x1xf32) <- (22x22xf32, 2xi64)
+        unsqueeze_2 = paddle._C_ops.unsqueeze(cast_2, full_int_array_2)
+        del cast_2, full_int_array_2
+
+        # pd_op.add: (22x22x1x1xf32) <- (1x22x1x1xf32, 22x22x1x1xf32)
+        add_0 = paddle._C_ops.add(cast_1, unsqueeze_2)
+        del cast_1, unsqueeze_2
+
+        # pd_op.greater_than: (22x22x1x1xb) <- (22x22x1x1xf32, xf32)
+        greater_than_1 = paddle._C_ops.greater_than(add_0, full_1)
+        del add_0, full_1
+
+        # pd_op.cast: (22x22x1x1xf32) <- (22x22x1x1xb)
+        cast_3 = paddle._C_ops.cast(greater_than_1, paddle.float32)
+        del greater_than_1
+
+        # pd_op.embedding: (22x1x1024xf32) <- (22x1xi64, 32000x1024xf32)
+        embedding_0 = paddle._C_ops.embedding(transpose_1, parameter_408, -1, False)
+        del parameter_408, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [1]
+
+        # pd_op.unsqueeze: (22x1x1xi64) <- (22x1xi64, 1xi64)
+        unsqueeze_3 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_3)
+
+        # pd_op.unsqueeze: (1x22x1xi64) <- (22x1xi64, 1xi64)
+        unsqueeze_4 = paddle._C_ops.unsqueeze(transpose_2, full_int_array_0)
+        del full_int_array_0, transpose_2
+
+        # pd_op.not_equal: (22x22x1xb) <- (22x1x1xi64, 1x22x1xi64)
+        not_equal_0 = paddle._C_ops.not_equal(unsqueeze_3, unsqueeze_4)
+        del unsqueeze_3, unsqueeze_4
+
+        # pd_op.cast: (22x22x1xi64) <- (22x22x1xb)
+        cast_4 = paddle._C_ops.cast(not_equal_0, paddle.int64)
+        del not_equal_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("2"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.one_hot: (22x22x1x2xf32) <- (22x22x1xi64, 1xi32)
+        one_hot_0 = paddle._C_ops.one_hot(
+            cast_4 % paddle.cast(full_4, cast_4.dtype), full_4
+        )
+        del cast_4, full_4
+
+        # pd_op.cast: (22x22x1x2xf32) <- (22x22x1x2xf32)
+        cast_5 = paddle._C_ops.cast(one_hot_0, paddle.float32)
+        del one_hot_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("1024"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_7 = paddle._C_ops.full(
+            [1], float("2"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (512xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_5, full_6, full_7, dtype="float32")
+        del full_6, full_7
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("0.000976562"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (512xf32) <- (512xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(arange_0, full_8, float("0"), True)
+        del arange_0, full_8
+
+        # pd_op.full: (512xf32) <- ()
+        full_9 = paddle._C_ops.full(
+            [512],
+            float("10000"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.elementwise_pow: (512xf32) <- (512xf32, 512xf32)
+        elementwise_pow_0 = paddle._C_ops.elementwise_pow(full_9, scale_2)
+        del full_9, scale_2
+
+        # pd_op.full: (512xf32) <- ()
+        full_10 = paddle._C_ops.full(
+            [512],
+            float("1"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.divide: (512xf32) <- (512xf32, 512xf32)
+        divide_0 = paddle._C_ops.divide(full_10, elementwise_pow_0)
+        del elementwise_pow_0, full_10
+
+        # pd_op.full: (1xf64) <- ()
+        full_11 = paddle._C_ops.full(
+            [1], float("22"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_12 = paddle._C_ops.full(
+            [1], float("-22"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_13 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (44xf32) <- (1xf64, 1xf64, 1xf64)
+        arange_1 = paddle.arange(full_11, full_12, full_13, dtype="float32")
+        del full_12, full_13
+
+        # builtin.combine: ([44xf32, 512xf32]) <- (44xf32, 512xf32)
+        combine_0 = [arange_1, divide_0]
+        del arange_1, divide_0
+
+        # pd_op.einsum: (44x512xf32, [0xf32, 0xf32], [44xf32, 512xf32]) <- ([44xf32, 512xf32])
+        einsum_0, einsum_1, einsum_2 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_0, "i,d->id"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_0
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_0,
+            split_1,
+        ) = einsum_1
+        del einsum_1
+
+        # builtin.split: (44xf32, 512xf32) <- ([44xf32, 512xf32])
+        (
+            split_2,
+            split_3,
+        ) = einsum_2
+        del einsum_2
+
+        # pd_op.sin: (44x512xf32) <- (44x512xf32)
+        sin_0 = paddle._C_ops.sin(einsum_0)
+
+        # pd_op.cos: (44x512xf32) <- (44x512xf32)
+        cos_0 = paddle._C_ops.cos(einsum_0)
+        del einsum_0
+
+        # pd_op.full: (1xi32) <- ()
+        full_14 = paddle._C_ops.full(
+            [1], float("-1"), paddle.int32, paddle.core.CPUPlace()
+        )
+
+        # builtin.combine: ([44x512xf32, 44x512xf32]) <- (44x512xf32, 44x512xf32)
+        combine_1 = [sin_0, cos_0]
+        del cos_0, sin_0
+
+        # pd_op.concat: (44x1024xf32) <- ([44x512xf32, 44x512xf32], 1xi32)
+        concat_0 = paddle._C_ops.concat(combine_1, full_14)
+        del combine_1, full_14
+
+        # pd_op.unsqueeze: (44x1x1024xf32) <- (44x1024xf32, 1xi64)
+        unsqueeze_5 = paddle._C_ops.unsqueeze(concat_0, full_int_array_3)
+        del concat_0
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_4 = [-1, 1, -1]
+
+        # pd_op.expand: (44x1x1024xf32) <- (44x1x1024xf32, 3xi64)
+        expand_0 = paddle._C_ops.expand(unsqueeze_5, full_int_array_4)
+        del full_int_array_4, unsqueeze_5
+
+        # pd_op.dropout: (44x1x1024xf32, 44x1x1024xui8) <- (44x1x1024xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                expand_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del expand_0
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_407, False, False)
+        del parameter_407
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_5 = [22, 1, 16, 64]
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(matmul_0, full_int_array_5)
+        del matmul_0
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_1 = paddle._C_ops.matmul(dropout_0, parameter_406, False, False)
+        del parameter_406
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(matmul_1, full_int_array_5)
+        del matmul_1
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_2 = paddle._C_ops.matmul(dropout_0, parameter_405, False, False)
+        del parameter_405
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(matmul_2, full_int_array_5)
+        del matmul_2
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_3 = paddle._C_ops.matmul(dropout_2, parameter_403, False, False)
+        del parameter_403
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_6 = [44, -1, 16, 64]
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_3 = paddle._C_ops.reshape(matmul_3, full_int_array_6)
+        del matmul_3
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_1 = paddle._C_ops.add(reshape_0, parameter_400)
+        del parameter_400
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_2 = [add_1, reshape_1]
+        del add_1, reshape_1
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_3, einsum_4, einsum_5 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_2, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_2
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_4,
+            split_5,
+        ) = einsum_4
+        del einsum_4
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_6,
+            split_7,
+        ) = einsum_5
+        del einsum_5
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_2 = paddle._C_ops.add(reshape_0, parameter_402)
+        del parameter_402
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_3 = [add_2, reshape_3]
+        del add_2, reshape_3
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_6, einsum_7, einsum_8 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_3, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_3
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_8,
+            split_9,
+        ) = einsum_7
+        del einsum_7
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_10,
+            split_11,
+        ) = einsum_8
+        del einsum_8
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_7 = [1, 16, 44, 22]
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(einsum_6, full_int_array_7)
+        del einsum_6
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_8 = [2147483647]
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            reshape_4, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_4
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_9 = [1, 16, 22, 43]
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(slice_0, full_int_array_9)
+        del slice_0
+
+        # pd_op.full: (1xf64) <- ()
+        full_15 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (22xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_2 = paddle.arange(full_5, full_11, full_15, dtype="int64")
+        del full_11, full_15, full_5
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_0 = paddle._C_ops.index_select(reshape_5, arange_2, 3)
+        del reshape_5
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_3 = paddle._C_ops.add(reshape_0, parameter_401)
+        del parameter_401, reshape_0
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_4 = [add_3, parameter_399]
+        del add_3, parameter_399
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_9, einsum_10, einsum_11 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_4, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_4
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_12,
+            split_13,
+        ) = einsum_10
+        del einsum_10
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_14,
+            split_15,
+        ) = einsum_11
+        del einsum_11
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_5 = [cast_5, einsum_9]
+        del einsum_9
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_12, einsum_13, einsum_14 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_5, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_5
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_16,
+            split_17,
+        ) = einsum_13
+        del einsum_13
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_18,
+            split_19,
+        ) = einsum_14
+        del einsum_14
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_4 = paddle._C_ops.add(einsum_3, index_select_0)
+        del einsum_3, index_select_0
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_5 = paddle._C_ops.add(add_4, einsum_12)
+        del add_4, einsum_12
+
+        # pd_op.full: (1xf32) <- ()
+        full_16 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(add_5, full_16, float("0"), True)
+        del add_5
+
+        # pd_op.transpose: (1x1x22x22xf32) <- (22x22x1x1xf32)
+        transpose_4 = paddle._C_ops.transpose(cast_3, [2, 3, 0, 1])
+        del cast_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_17 = paddle._C_ops.full(
+            [1], float("1e+30"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x22x22xf32) <- (1x1x22x22xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(transpose_4, full_17, float("0"), True)
+        del full_17, transpose_4
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_0 = paddle._C_ops.subtract(scale_3, scale_4)
+        del scale_3
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_0 = paddle._C_ops.softmax(subtract_0, 3)
+        del subtract_0
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_6 = [dropout_4, reshape_2]
+        del dropout_4, reshape_2
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_15, einsum_16, einsum_17 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_6, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_6
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_20,
+            split_21,
+        ) = einsum_16
+        del einsum_16
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_22,
+            split_23,
+        ) = einsum_17
+        del einsum_17
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_10 = [22, 1, 1024]
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_6 = paddle._C_ops.reshape(einsum_15, full_int_array_10)
+        del einsum_15
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_7 = [reshape_6, parameter_404]
+        del parameter_404, reshape_6
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_18, einsum_19, einsum_20 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_7, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_7
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_24,
+            split_25,
+        ) = einsum_19
+        del einsum_19
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_26,
+            split_27,
+        ) = einsum_20
+        del einsum_20
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_18
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_6 = paddle._C_ops.add(dropout_6, dropout_0)
+        del dropout_0, dropout_6
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_6, parameter_398, parameter_397, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_6, parameter_397, parameter_398
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_4 = paddle._C_ops.matmul(layer_norm_0, parameter_394, False, False)
+        del parameter_394
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_7 = paddle._C_ops.add(matmul_4, parameter_393)
+        del matmul_4, parameter_393
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_0 = paddle._C_ops.gelu(add_7, False)
+        del add_7
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_0, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_0
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_8, parameter_392, False, False)
+        del dropout_8, parameter_392
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_8 = paddle._C_ops.add(matmul_5, parameter_391)
+        del matmul_5, parameter_391
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_8
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_9 = paddle._C_ops.add(dropout_10, layer_norm_0)
+        del dropout_10, layer_norm_0
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_9, parameter_396, parameter_395, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_9, parameter_395, parameter_396
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_6 = paddle._C_ops.matmul(layer_norm_3, parameter_390, False, False)
+        del parameter_390
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_7 = paddle._C_ops.reshape(matmul_6, full_int_array_5)
+        del matmul_6
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_389, False, False)
+        del parameter_389
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(matmul_7, full_int_array_5)
+        del matmul_7
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_8 = paddle._C_ops.matmul(layer_norm_3, parameter_388, False, False)
+        del parameter_388
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(matmul_8, full_int_array_5)
+        del matmul_8
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_9 = paddle._C_ops.matmul(dropout_2, parameter_386, False, False)
+        del parameter_386
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(matmul_9, full_int_array_6)
+        del matmul_9
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_10 = paddle._C_ops.add(reshape_7, parameter_383)
+        del parameter_383
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_8 = [add_10, reshape_8]
+        del add_10, reshape_8
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_21, einsum_22, einsum_23 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_8, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_8
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_28,
+            split_29,
+        ) = einsum_22
+        del einsum_22
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_30,
+            split_31,
+        ) = einsum_23
+        del einsum_23
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_11 = paddle._C_ops.add(reshape_7, parameter_385)
+        del parameter_385
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_9 = [add_11, reshape_10]
+        del add_11, reshape_10
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_24, einsum_25, einsum_26 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_9, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_9
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_32,
+            split_33,
+        ) = einsum_25
+        del einsum_25
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_34,
+            split_35,
+        ) = einsum_26
+        del einsum_26
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_11 = paddle._C_ops.reshape(einsum_24, full_int_array_7)
+        del einsum_24
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            reshape_11, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_11
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(slice_1, full_int_array_9)
+        del slice_1
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_1 = paddle._C_ops.index_select(reshape_12, arange_2, 3)
+        del reshape_12
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_12 = paddle._C_ops.add(reshape_7, parameter_384)
+        del parameter_384, reshape_7
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_10 = [add_12, parameter_382]
+        del add_12, parameter_382
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_27, einsum_28, einsum_29 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_10, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_10
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_36,
+            split_37,
+        ) = einsum_28
+        del einsum_28
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_38,
+            split_39,
+        ) = einsum_29
+        del einsum_29
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_11 = [cast_5, einsum_27]
+        del einsum_27
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_30, einsum_31, einsum_32 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_11, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_11
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_40,
+            split_41,
+        ) = einsum_31
+        del einsum_31
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_42,
+            split_43,
+        ) = einsum_32
+        del einsum_32
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_13 = paddle._C_ops.add(einsum_21, index_select_1)
+        del einsum_21, index_select_1
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_14 = paddle._C_ops.add(add_13, einsum_30)
+        del add_13, einsum_30
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(add_14, full_16, float("0"), True)
+        del add_14
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_1 = paddle._C_ops.subtract(scale_5, scale_4)
+        del scale_5
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_1 = paddle._C_ops.softmax(subtract_1, 3)
+        del subtract_1
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_12 = [dropout_12, reshape_9]
+        del dropout_12, reshape_9
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_33, einsum_34, einsum_35 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_12, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_12
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_44,
+            split_45,
+        ) = einsum_34
+        del einsum_34
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_46,
+            split_47,
+        ) = einsum_35
+        del einsum_35
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_13 = paddle._C_ops.reshape(einsum_33, full_int_array_10)
+        del einsum_33
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_13 = [reshape_13, parameter_387]
+        del parameter_387, reshape_13
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_36, einsum_37, einsum_38 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_13, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_13
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_48,
+            split_49,
+        ) = einsum_37
+        del einsum_37
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_50,
+            split_51,
+        ) = einsum_38
+        del einsum_38
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_36, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_36
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_15 = paddle._C_ops.add(dropout_14, layer_norm_3)
+        del dropout_14, layer_norm_3
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_15, parameter_381, parameter_380, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_15, parameter_380, parameter_381
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_377, False, False)
+        del parameter_377
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_16 = paddle._C_ops.add(matmul_10, parameter_376)
+        del matmul_10, parameter_376
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_1 = paddle._C_ops.gelu(add_16, False)
+        del add_16
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_1, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_1
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_11 = paddle._C_ops.matmul(dropout_16, parameter_375, False, False)
+        del dropout_16, parameter_375
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_17 = paddle._C_ops.add(matmul_11, parameter_374)
+        del matmul_11, parameter_374
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_17
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_18 = paddle._C_ops.add(dropout_18, layer_norm_6)
+        del dropout_18, layer_norm_6
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_379, parameter_378, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18, parameter_378, parameter_379
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_12 = paddle._C_ops.matmul(layer_norm_9, parameter_373, False, False)
+        del parameter_373
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(matmul_12, full_int_array_5)
+        del matmul_12
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_13 = paddle._C_ops.matmul(layer_norm_9, parameter_372, False, False)
+        del parameter_372
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_15 = paddle._C_ops.reshape(matmul_13, full_int_array_5)
+        del matmul_13
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_14 = paddle._C_ops.matmul(layer_norm_9, parameter_371, False, False)
+        del parameter_371
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(matmul_14, full_int_array_5)
+        del matmul_14
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_15 = paddle._C_ops.matmul(dropout_2, parameter_369, False, False)
+        del parameter_369
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(matmul_15, full_int_array_6)
+        del matmul_15
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_19 = paddle._C_ops.add(reshape_14, parameter_366)
+        del parameter_366
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_14 = [add_19, reshape_15]
+        del add_19, reshape_15
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_39, einsum_40, einsum_41 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_14, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_14
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_52,
+            split_53,
+        ) = einsum_40
+        del einsum_40
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_54,
+            split_55,
+        ) = einsum_41
+        del einsum_41
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_20 = paddle._C_ops.add(reshape_14, parameter_368)
+        del parameter_368
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_15 = [add_20, reshape_17]
+        del add_20, reshape_17
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_42, einsum_43, einsum_44 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_15, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_15
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_56,
+            split_57,
+        ) = einsum_43
+        del einsum_43
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_58,
+            split_59,
+        ) = einsum_44
+        del einsum_44
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(einsum_42, full_int_array_7)
+        del einsum_42
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_2 = paddle._C_ops.slice(
+            reshape_18, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_18
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_19 = paddle._C_ops.reshape(slice_2, full_int_array_9)
+        del slice_2
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_2 = paddle._C_ops.index_select(reshape_19, arange_2, 3)
+        del reshape_19
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_21 = paddle._C_ops.add(reshape_14, parameter_367)
+        del parameter_367, reshape_14
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_16 = [add_21, parameter_365]
+        del add_21, parameter_365
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_45, einsum_46, einsum_47 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_16, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_16
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_60,
+            split_61,
+        ) = einsum_46
+        del einsum_46
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_62,
+            split_63,
+        ) = einsum_47
+        del einsum_47
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_17 = [cast_5, einsum_45]
+        del einsum_45
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_48, einsum_49, einsum_50 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_17, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_17
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_64,
+            split_65,
+        ) = einsum_49
+        del einsum_49
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_66,
+            split_67,
+        ) = einsum_50
+        del einsum_50
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_22 = paddle._C_ops.add(einsum_39, index_select_2)
+        del einsum_39, index_select_2
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_23 = paddle._C_ops.add(add_22, einsum_48)
+        del add_22, einsum_48
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(add_23, full_16, float("0"), True)
+        del add_23
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_2 = paddle._C_ops.subtract(scale_6, scale_4)
+        del scale_6
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_2 = paddle._C_ops.softmax(subtract_2, 3)
+        del subtract_2
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_18 = [dropout_20, reshape_16]
+        del dropout_20, reshape_16
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_51, einsum_52, einsum_53 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_18, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_18
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_68,
+            split_69,
+        ) = einsum_52
+        del einsum_52
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_70,
+            split_71,
+        ) = einsum_53
+        del einsum_53
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_20 = paddle._C_ops.reshape(einsum_51, full_int_array_10)
+        del einsum_51
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_19 = [reshape_20, parameter_370]
+        del parameter_370, reshape_20
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_54, einsum_55, einsum_56 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_19, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_19
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_72,
+            split_73,
+        ) = einsum_55
+        del einsum_55
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_74,
+            split_75,
+        ) = einsum_56
+        del einsum_56
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_54, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_54
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_24 = paddle._C_ops.add(dropout_22, layer_norm_9)
+        del dropout_22, layer_norm_9
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_24, parameter_364, parameter_363, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_24, parameter_363, parameter_364
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_16 = paddle._C_ops.matmul(layer_norm_12, parameter_360, False, False)
+        del parameter_360
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_25 = paddle._C_ops.add(matmul_16, parameter_359)
+        del matmul_16, parameter_359
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_2 = paddle._C_ops.gelu(add_25, False)
+        del add_25
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_2, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_2
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_17 = paddle._C_ops.matmul(dropout_24, parameter_358, False, False)
+        del dropout_24, parameter_358
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_26 = paddle._C_ops.add(matmul_17, parameter_357)
+        del matmul_17, parameter_357
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_26, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_26
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_27 = paddle._C_ops.add(dropout_26, layer_norm_12)
+        del dropout_26, layer_norm_12
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_27, parameter_362, parameter_361, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_27, parameter_361, parameter_362
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_15, parameter_356, False, False)
+        del parameter_356
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(matmul_18, full_int_array_5)
+        del matmul_18
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_15, parameter_355, False, False)
+        del parameter_355
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(matmul_19, full_int_array_5)
+        del matmul_19
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_20 = paddle._C_ops.matmul(layer_norm_15, parameter_354, False, False)
+        del parameter_354
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_23 = paddle._C_ops.reshape(matmul_20, full_int_array_5)
+        del matmul_20
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_2, parameter_352, False, False)
+        del parameter_352
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(matmul_21, full_int_array_6)
+        del matmul_21
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_28 = paddle._C_ops.add(reshape_21, parameter_349)
+        del parameter_349
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_20 = [add_28, reshape_22]
+        del add_28, reshape_22
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_57, einsum_58, einsum_59 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_20, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_20
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_76,
+            split_77,
+        ) = einsum_58
+        del einsum_58
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_78,
+            split_79,
+        ) = einsum_59
+        del einsum_59
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_29 = paddle._C_ops.add(reshape_21, parameter_351)
+        del parameter_351
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_21 = [add_29, reshape_24]
+        del add_29, reshape_24
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_60, einsum_61, einsum_62 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_21, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_21
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_80,
+            split_81,
+        ) = einsum_61
+        del einsum_61
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_82,
+            split_83,
+        ) = einsum_62
+        del einsum_62
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(einsum_60, full_int_array_7)
+        del einsum_60
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_3 = paddle._C_ops.slice(
+            reshape_25, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_25
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(slice_3, full_int_array_9)
+        del slice_3
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_3 = paddle._C_ops.index_select(reshape_26, arange_2, 3)
+        del reshape_26
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_30 = paddle._C_ops.add(reshape_21, parameter_350)
+        del parameter_350, reshape_21
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_22 = [add_30, parameter_348]
+        del add_30, parameter_348
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_63, einsum_64, einsum_65 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_22, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_22
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_84,
+            split_85,
+        ) = einsum_64
+        del einsum_64
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_86,
+            split_87,
+        ) = einsum_65
+        del einsum_65
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_23 = [cast_5, einsum_63]
+        del einsum_63
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_66, einsum_67, einsum_68 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_23, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_23
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_88,
+            split_89,
+        ) = einsum_67
+        del einsum_67
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_90,
+            split_91,
+        ) = einsum_68
+        del einsum_68
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_31 = paddle._C_ops.add(einsum_57, index_select_3)
+        del einsum_57, index_select_3
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_32 = paddle._C_ops.add(add_31, einsum_66)
+        del add_31, einsum_66
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(add_32, full_16, float("0"), True)
+        del add_32
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_3 = paddle._C_ops.subtract(scale_7, scale_4)
+        del scale_7
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_3 = paddle._C_ops.softmax(subtract_3, 3)
+        del subtract_3
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_24 = [dropout_28, reshape_23]
+        del dropout_28, reshape_23
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_69, einsum_70, einsum_71 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_24, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_24
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_92,
+            split_93,
+        ) = einsum_70
+        del einsum_70
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_94,
+            split_95,
+        ) = einsum_71
+        del einsum_71
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(einsum_69, full_int_array_10)
+        del einsum_69
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_25 = [reshape_27, parameter_353]
+        del parameter_353, reshape_27
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_72, einsum_73, einsum_74 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_25, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_25
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_96,
+            split_97,
+        ) = einsum_73
+        del einsum_73
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_98,
+            split_99,
+        ) = einsum_74
+        del einsum_74
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_72, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_72
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_33 = paddle._C_ops.add(dropout_30, layer_norm_15)
+        del dropout_30, layer_norm_15
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_33, parameter_347, parameter_346, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_33, parameter_346, parameter_347
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_22 = paddle._C_ops.matmul(layer_norm_18, parameter_343, False, False)
+        del parameter_343
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_34 = paddle._C_ops.add(matmul_22, parameter_342)
+        del matmul_22, parameter_342
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_3 = paddle._C_ops.gelu(add_34, False)
+        del add_34
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_3, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_3
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_23 = paddle._C_ops.matmul(dropout_32, parameter_341, False, False)
+        del dropout_32, parameter_341
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_35 = paddle._C_ops.add(matmul_23, parameter_340)
+        del matmul_23, parameter_340
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_35, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_35
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_36 = paddle._C_ops.add(dropout_34, layer_norm_18)
+        del dropout_34, layer_norm_18
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_36, parameter_345, parameter_344, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_36, parameter_344, parameter_345
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_24 = paddle._C_ops.matmul(layer_norm_21, parameter_339, False, False)
+        del parameter_339
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(matmul_24, full_int_array_5)
+        del matmul_24
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_21, parameter_338, False, False)
+        del parameter_338
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(matmul_25, full_int_array_5)
+        del matmul_25
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_21, parameter_337, False, False)
+        del parameter_337
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(matmul_26, full_int_array_5)
+        del matmul_26
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_27 = paddle._C_ops.matmul(dropout_2, parameter_335, False, False)
+        del parameter_335
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_31 = paddle._C_ops.reshape(matmul_27, full_int_array_6)
+        del matmul_27
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_37 = paddle._C_ops.add(reshape_28, parameter_332)
+        del parameter_332
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_26 = [add_37, reshape_29]
+        del add_37, reshape_29
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_75, einsum_76, einsum_77 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_26, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_26
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_100,
+            split_101,
+        ) = einsum_76
+        del einsum_76
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_102,
+            split_103,
+        ) = einsum_77
+        del einsum_77
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_38 = paddle._C_ops.add(reshape_28, parameter_334)
+        del parameter_334
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_27 = [add_38, reshape_31]
+        del add_38, reshape_31
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_78, einsum_79, einsum_80 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_27, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_27
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_104,
+            split_105,
+        ) = einsum_79
+        del einsum_79
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_106,
+            split_107,
+        ) = einsum_80
+        del einsum_80
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(einsum_78, full_int_array_7)
+        del einsum_78
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_4 = paddle._C_ops.slice(
+            reshape_32, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_32
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(slice_4, full_int_array_9)
+        del slice_4
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_4 = paddle._C_ops.index_select(reshape_33, arange_2, 3)
+        del reshape_33
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_39 = paddle._C_ops.add(reshape_28, parameter_333)
+        del parameter_333, reshape_28
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_28 = [add_39, parameter_331]
+        del add_39, parameter_331
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_81, einsum_82, einsum_83 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_28, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_28
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_108,
+            split_109,
+        ) = einsum_82
+        del einsum_82
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_110,
+            split_111,
+        ) = einsum_83
+        del einsum_83
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_29 = [cast_5, einsum_81]
+        del einsum_81
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_84, einsum_85, einsum_86 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_29, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_29
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_112,
+            split_113,
+        ) = einsum_85
+        del einsum_85
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_114,
+            split_115,
+        ) = einsum_86
+        del einsum_86
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_40 = paddle._C_ops.add(einsum_75, index_select_4)
+        del einsum_75, index_select_4
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_41 = paddle._C_ops.add(add_40, einsum_84)
+        del add_40, einsum_84
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(add_41, full_16, float("0"), True)
+        del add_41
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_4 = paddle._C_ops.subtract(scale_8, scale_4)
+        del scale_8
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_4 = paddle._C_ops.softmax(subtract_4, 3)
+        del subtract_4
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_30 = [dropout_36, reshape_30]
+        del dropout_36, reshape_30
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_87, einsum_88, einsum_89 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_30, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_30
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_116,
+            split_117,
+        ) = einsum_88
+        del einsum_88
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_118,
+            split_119,
+        ) = einsum_89
+        del einsum_89
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_34 = paddle._C_ops.reshape(einsum_87, full_int_array_10)
+        del einsum_87
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_31 = [reshape_34, parameter_336]
+        del parameter_336, reshape_34
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_90, einsum_91, einsum_92 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_31, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_31
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_120,
+            split_121,
+        ) = einsum_91
+        del einsum_91
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_122,
+            split_123,
+        ) = einsum_92
+        del einsum_92
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_90, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_90
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_42 = paddle._C_ops.add(dropout_38, layer_norm_21)
+        del dropout_38, layer_norm_21
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_330, parameter_329, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42, parameter_329, parameter_330
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_28 = paddle._C_ops.matmul(layer_norm_24, parameter_326, False, False)
+        del parameter_326
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_43 = paddle._C_ops.add(matmul_28, parameter_325)
+        del matmul_28, parameter_325
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_4 = paddle._C_ops.gelu(add_43, False)
+        del add_43
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_4, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_4
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_40, parameter_324, False, False)
+        del dropout_40, parameter_324
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_44 = paddle._C_ops.add(matmul_29, parameter_323)
+        del matmul_29, parameter_323
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_44, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_44
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_45 = paddle._C_ops.add(dropout_42, layer_norm_24)
+        del dropout_42, layer_norm_24
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_45, parameter_328, parameter_327, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_45, parameter_327, parameter_328
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_30 = paddle._C_ops.matmul(layer_norm_27, parameter_322, False, False)
+        del parameter_322
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_35 = paddle._C_ops.reshape(matmul_30, full_int_array_5)
+        del matmul_30
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_27, parameter_321, False, False)
+        del parameter_321
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(matmul_31, full_int_array_5)
+        del matmul_31
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_32 = paddle._C_ops.matmul(layer_norm_27, parameter_320, False, False)
+        del parameter_320
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(matmul_32, full_int_array_5)
+        del matmul_32
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_33 = paddle._C_ops.matmul(dropout_2, parameter_318, False, False)
+        del parameter_318
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(matmul_33, full_int_array_6)
+        del matmul_33
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_46 = paddle._C_ops.add(reshape_35, parameter_315)
+        del parameter_315
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_32 = [add_46, reshape_36]
+        del add_46, reshape_36
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_93, einsum_94, einsum_95 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_32, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_32
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_124,
+            split_125,
+        ) = einsum_94
+        del einsum_94
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_126,
+            split_127,
+        ) = einsum_95
+        del einsum_95
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_47 = paddle._C_ops.add(reshape_35, parameter_317)
+        del parameter_317
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_33 = [add_47, reshape_38]
+        del add_47, reshape_38
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_96, einsum_97, einsum_98 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_33, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_33
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_128,
+            split_129,
+        ) = einsum_97
+        del einsum_97
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_130,
+            split_131,
+        ) = einsum_98
+        del einsum_98
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_39 = paddle._C_ops.reshape(einsum_96, full_int_array_7)
+        del einsum_96
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_5 = paddle._C_ops.slice(
+            reshape_39, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_39
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(slice_5, full_int_array_9)
+        del slice_5
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_5 = paddle._C_ops.index_select(reshape_40, arange_2, 3)
+        del reshape_40
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_48 = paddle._C_ops.add(reshape_35, parameter_316)
+        del parameter_316, reshape_35
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_34 = [add_48, parameter_314]
+        del add_48, parameter_314
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_99, einsum_100, einsum_101 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_34, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_34
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_132,
+            split_133,
+        ) = einsum_100
+        del einsum_100
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_134,
+            split_135,
+        ) = einsum_101
+        del einsum_101
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_35 = [cast_5, einsum_99]
+        del einsum_99
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_102, einsum_103, einsum_104 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_35, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_35
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_136,
+            split_137,
+        ) = einsum_103
+        del einsum_103
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_138,
+            split_139,
+        ) = einsum_104
+        del einsum_104
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_49 = paddle._C_ops.add(einsum_93, index_select_5)
+        del einsum_93, index_select_5
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_50 = paddle._C_ops.add(add_49, einsum_102)
+        del add_49, einsum_102
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(add_50, full_16, float("0"), True)
+        del add_50
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_5 = paddle._C_ops.subtract(scale_9, scale_4)
+        del scale_9
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_5 = paddle._C_ops.softmax(subtract_5, 3)
+        del subtract_5
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_36 = [dropout_44, reshape_37]
+        del dropout_44, reshape_37
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_105, einsum_106, einsum_107 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_36, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_36
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_140,
+            split_141,
+        ) = einsum_106
+        del einsum_106
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_142,
+            split_143,
+        ) = einsum_107
+        del einsum_107
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_41 = paddle._C_ops.reshape(einsum_105, full_int_array_10)
+        del einsum_105
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_37 = [reshape_41, parameter_319]
+        del parameter_319, reshape_41
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_108, einsum_109, einsum_110 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_37, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_37
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_144,
+            split_145,
+        ) = einsum_109
+        del einsum_109
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_146,
+            split_147,
+        ) = einsum_110
+        del einsum_110
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_108, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_108
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_51 = paddle._C_ops.add(dropout_46, layer_norm_27)
+        del dropout_46, layer_norm_27
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_51, parameter_313, parameter_312, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_51, parameter_312, parameter_313
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_30, parameter_309, False, False)
+        del parameter_309
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_52 = paddle._C_ops.add(matmul_34, parameter_308)
+        del matmul_34, parameter_308
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_5 = paddle._C_ops.gelu(add_52, False)
+        del add_52
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_5, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_5
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_35 = paddle._C_ops.matmul(dropout_48, parameter_307, False, False)
+        del dropout_48, parameter_307
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_53 = paddle._C_ops.add(matmul_35, parameter_306)
+        del matmul_35, parameter_306
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_53, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_53
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_54 = paddle._C_ops.add(dropout_50, layer_norm_30)
+        del dropout_50, layer_norm_30
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_54, parameter_311, parameter_310, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_54, parameter_310, parameter_311
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_36 = paddle._C_ops.matmul(layer_norm_33, parameter_305, False, False)
+        del parameter_305
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(matmul_36, full_int_array_5)
+        del matmul_36
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_37 = paddle._C_ops.matmul(layer_norm_33, parameter_304, False, False)
+        del parameter_304
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_43 = paddle._C_ops.reshape(matmul_37, full_int_array_5)
+        del matmul_37
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_38 = paddle._C_ops.matmul(layer_norm_33, parameter_303, False, False)
+        del parameter_303
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(matmul_38, full_int_array_5)
+        del matmul_38
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_39 = paddle._C_ops.matmul(dropout_2, parameter_301, False, False)
+        del parameter_301
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(matmul_39, full_int_array_6)
+        del matmul_39
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_55 = paddle._C_ops.add(reshape_42, parameter_298)
+        del parameter_298
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_38 = [add_55, reshape_43]
+        del add_55, reshape_43
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_111, einsum_112, einsum_113 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_38, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_38
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_148,
+            split_149,
+        ) = einsum_112
+        del einsum_112
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_150,
+            split_151,
+        ) = einsum_113
+        del einsum_113
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_56 = paddle._C_ops.add(reshape_42, parameter_300)
+        del parameter_300
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_39 = [add_56, reshape_45]
+        del add_56, reshape_45
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_114, einsum_115, einsum_116 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_39, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_39
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_152,
+            split_153,
+        ) = einsum_115
+        del einsum_115
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_154,
+            split_155,
+        ) = einsum_116
+        del einsum_116
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(einsum_114, full_int_array_7)
+        del einsum_114
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_6 = paddle._C_ops.slice(
+            reshape_46, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_46
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_47 = paddle._C_ops.reshape(slice_6, full_int_array_9)
+        del slice_6
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_6 = paddle._C_ops.index_select(reshape_47, arange_2, 3)
+        del reshape_47
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_57 = paddle._C_ops.add(reshape_42, parameter_299)
+        del parameter_299, reshape_42
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_40 = [add_57, parameter_297]
+        del add_57, parameter_297
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_117, einsum_118, einsum_119 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_40, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_40
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_156,
+            split_157,
+        ) = einsum_118
+        del einsum_118
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_158,
+            split_159,
+        ) = einsum_119
+        del einsum_119
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_41 = [cast_5, einsum_117]
+        del einsum_117
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_120, einsum_121, einsum_122 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_41, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_41
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_160,
+            split_161,
+        ) = einsum_121
+        del einsum_121
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_162,
+            split_163,
+        ) = einsum_122
+        del einsum_122
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_58 = paddle._C_ops.add(einsum_111, index_select_6)
+        del einsum_111, index_select_6
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_59 = paddle._C_ops.add(add_58, einsum_120)
+        del add_58, einsum_120
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(add_59, full_16, float("0"), True)
+        del add_59
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_6 = paddle._C_ops.subtract(scale_10, scale_4)
+        del scale_10
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_6 = paddle._C_ops.softmax(subtract_6, 3)
+        del subtract_6
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_42 = [dropout_52, reshape_44]
+        del dropout_52, reshape_44
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_123, einsum_124, einsum_125 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_42, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_42
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_164,
+            split_165,
+        ) = einsum_124
+        del einsum_124
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_166,
+            split_167,
+        ) = einsum_125
+        del einsum_125
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_48 = paddle._C_ops.reshape(einsum_123, full_int_array_10)
+        del einsum_123
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_43 = [reshape_48, parameter_302]
+        del parameter_302, reshape_48
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_126, einsum_127, einsum_128 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_43, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_43
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_168,
+            split_169,
+        ) = einsum_127
+        del einsum_127
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_170,
+            split_171,
+        ) = einsum_128
+        del einsum_128
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_126, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_126
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_60 = paddle._C_ops.add(dropout_54, layer_norm_33)
+        del dropout_54, layer_norm_33
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_60, parameter_296, parameter_295, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_60, parameter_295, parameter_296
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_40 = paddle._C_ops.matmul(layer_norm_36, parameter_292, False, False)
+        del parameter_292
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_61 = paddle._C_ops.add(matmul_40, parameter_291)
+        del matmul_40, parameter_291
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_6 = paddle._C_ops.gelu(add_61, False)
+        del add_61
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_6, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_6
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_41 = paddle._C_ops.matmul(dropout_56, parameter_290, False, False)
+        del dropout_56, parameter_290
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_62 = paddle._C_ops.add(matmul_41, parameter_289)
+        del matmul_41, parameter_289
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_62, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_62
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_63 = paddle._C_ops.add(dropout_58, layer_norm_36)
+        del dropout_58, layer_norm_36
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_63, parameter_294, parameter_293, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_63, parameter_293, parameter_294
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_39, parameter_288, False, False)
+        del parameter_288
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_49 = paddle._C_ops.reshape(matmul_42, full_int_array_5)
+        del matmul_42
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_39, parameter_287, False, False)
+        del parameter_287
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(matmul_43, full_int_array_5)
+        del matmul_43
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_44 = paddle._C_ops.matmul(layer_norm_39, parameter_286, False, False)
+        del parameter_286
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_51 = paddle._C_ops.reshape(matmul_44, full_int_array_5)
+        del matmul_44
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_2, parameter_284, False, False)
+        del parameter_284
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(matmul_45, full_int_array_6)
+        del matmul_45
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_64 = paddle._C_ops.add(reshape_49, parameter_281)
+        del parameter_281
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_44 = [add_64, reshape_50]
+        del add_64, reshape_50
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_129, einsum_130, einsum_131 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_44, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_44
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_172,
+            split_173,
+        ) = einsum_130
+        del einsum_130
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_174,
+            split_175,
+        ) = einsum_131
+        del einsum_131
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_65 = paddle._C_ops.add(reshape_49, parameter_283)
+        del parameter_283
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_45 = [add_65, reshape_52]
+        del add_65, reshape_52
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_132, einsum_133, einsum_134 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_45, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_45
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_176,
+            split_177,
+        ) = einsum_133
+        del einsum_133
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_178,
+            split_179,
+        ) = einsum_134
+        del einsum_134
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_53 = paddle._C_ops.reshape(einsum_132, full_int_array_7)
+        del einsum_132
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_7 = paddle._C_ops.slice(
+            reshape_53, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_53
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(slice_7, full_int_array_9)
+        del slice_7
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_7 = paddle._C_ops.index_select(reshape_54, arange_2, 3)
+        del reshape_54
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_66 = paddle._C_ops.add(reshape_49, parameter_282)
+        del parameter_282, reshape_49
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_46 = [add_66, parameter_280]
+        del add_66, parameter_280
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_135, einsum_136, einsum_137 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_46, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_46
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_180,
+            split_181,
+        ) = einsum_136
+        del einsum_136
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_182,
+            split_183,
+        ) = einsum_137
+        del einsum_137
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_47 = [cast_5, einsum_135]
+        del einsum_135
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_138, einsum_139, einsum_140 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_47, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_47
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_184,
+            split_185,
+        ) = einsum_139
+        del einsum_139
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_186,
+            split_187,
+        ) = einsum_140
+        del einsum_140
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_67 = paddle._C_ops.add(einsum_129, index_select_7)
+        del einsum_129, index_select_7
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_68 = paddle._C_ops.add(add_67, einsum_138)
+        del add_67, einsum_138
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(add_68, full_16, float("0"), True)
+        del add_68
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_7 = paddle._C_ops.subtract(scale_11, scale_4)
+        del scale_11
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_7 = paddle._C_ops.softmax(subtract_7, 3)
+        del subtract_7
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_48 = [dropout_60, reshape_51]
+        del dropout_60, reshape_51
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_141, einsum_142, einsum_143 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_48, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_48
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_188,
+            split_189,
+        ) = einsum_142
+        del einsum_142
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_190,
+            split_191,
+        ) = einsum_143
+        del einsum_143
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_55 = paddle._C_ops.reshape(einsum_141, full_int_array_10)
+        del einsum_141
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_49 = [reshape_55, parameter_285]
+        del parameter_285, reshape_55
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_144, einsum_145, einsum_146 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_49, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_49
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_192,
+            split_193,
+        ) = einsum_145
+        del einsum_145
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_194,
+            split_195,
+        ) = einsum_146
+        del einsum_146
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_144, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_144
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_69 = paddle._C_ops.add(dropout_62, layer_norm_39)
+        del dropout_62, layer_norm_39
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_69, parameter_279, parameter_278, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_69, parameter_278, parameter_279
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_46 = paddle._C_ops.matmul(layer_norm_42, parameter_275, False, False)
+        del parameter_275
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_70 = paddle._C_ops.add(matmul_46, parameter_274)
+        del matmul_46, parameter_274
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_7 = paddle._C_ops.gelu(add_70, False)
+        del add_70
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_7, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_7
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_47 = paddle._C_ops.matmul(dropout_64, parameter_273, False, False)
+        del dropout_64, parameter_273
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_71 = paddle._C_ops.add(matmul_47, parameter_272)
+        del matmul_47, parameter_272
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_71, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_71
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_72 = paddle._C_ops.add(dropout_66, layer_norm_42)
+        del dropout_66, layer_norm_42
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_72, parameter_277, parameter_276, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_72, parameter_276, parameter_277
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_48 = paddle._C_ops.matmul(layer_norm_45, parameter_271, False, False)
+        del parameter_271
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(matmul_48, full_int_array_5)
+        del matmul_48
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_45, parameter_270, False, False)
+        del parameter_270
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_57 = paddle._C_ops.reshape(matmul_49, full_int_array_5)
+        del matmul_49
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_45, parameter_269, False, False)
+        del parameter_269
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(matmul_50, full_int_array_5)
+        del matmul_50
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_51 = paddle._C_ops.matmul(dropout_2, parameter_267, False, False)
+        del parameter_267
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_59 = paddle._C_ops.reshape(matmul_51, full_int_array_6)
+        del matmul_51
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_73 = paddle._C_ops.add(reshape_56, parameter_264)
+        del parameter_264
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_50 = [add_73, reshape_57]
+        del add_73, reshape_57
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_147, einsum_148, einsum_149 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_50, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_50
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_196,
+            split_197,
+        ) = einsum_148
+        del einsum_148
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_198,
+            split_199,
+        ) = einsum_149
+        del einsum_149
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_74 = paddle._C_ops.add(reshape_56, parameter_266)
+        del parameter_266
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_51 = [add_74, reshape_59]
+        del add_74, reshape_59
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_150, einsum_151, einsum_152 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_51, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_51
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_200,
+            split_201,
+        ) = einsum_151
+        del einsum_151
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_202,
+            split_203,
+        ) = einsum_152
+        del einsum_152
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(einsum_150, full_int_array_7)
+        del einsum_150
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_8 = paddle._C_ops.slice(
+            reshape_60, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_60
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_61 = paddle._C_ops.reshape(slice_8, full_int_array_9)
+        del slice_8
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_8 = paddle._C_ops.index_select(reshape_61, arange_2, 3)
+        del reshape_61
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_75 = paddle._C_ops.add(reshape_56, parameter_265)
+        del parameter_265, reshape_56
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_52 = [add_75, parameter_263]
+        del add_75, parameter_263
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_153, einsum_154, einsum_155 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_52, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_52
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_204,
+            split_205,
+        ) = einsum_154
+        del einsum_154
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_206,
+            split_207,
+        ) = einsum_155
+        del einsum_155
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_53 = [cast_5, einsum_153]
+        del einsum_153
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_156, einsum_157, einsum_158 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_53, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_53
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_208,
+            split_209,
+        ) = einsum_157
+        del einsum_157
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_210,
+            split_211,
+        ) = einsum_158
+        del einsum_158
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_76 = paddle._C_ops.add(einsum_147, index_select_8)
+        del einsum_147, index_select_8
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_77 = paddle._C_ops.add(add_76, einsum_156)
+        del add_76, einsum_156
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(add_77, full_16, float("0"), True)
+        del add_77
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_8 = paddle._C_ops.subtract(scale_12, scale_4)
+        del scale_12
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_8 = paddle._C_ops.softmax(subtract_8, 3)
+        del subtract_8
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_54 = [dropout_68, reshape_58]
+        del dropout_68, reshape_58
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_159, einsum_160, einsum_161 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_54, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_54
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_212,
+            split_213,
+        ) = einsum_160
+        del einsum_160
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_214,
+            split_215,
+        ) = einsum_161
+        del einsum_161
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_62 = paddle._C_ops.reshape(einsum_159, full_int_array_10)
+        del einsum_159
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_55 = [reshape_62, parameter_268]
+        del parameter_268, reshape_62
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_162, einsum_163, einsum_164 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_55, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_55
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_216,
+            split_217,
+        ) = einsum_163
+        del einsum_163
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_218,
+            split_219,
+        ) = einsum_164
+        del einsum_164
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_162, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_162
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_78 = paddle._C_ops.add(dropout_70, layer_norm_45)
+        del dropout_70, layer_norm_45
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_78, parameter_262, parameter_261, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_78, parameter_261, parameter_262
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_52 = paddle._C_ops.matmul(layer_norm_48, parameter_258, False, False)
+        del parameter_258
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_79 = paddle._C_ops.add(matmul_52, parameter_257)
+        del matmul_52, parameter_257
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_8 = paddle._C_ops.gelu(add_79, False)
+        del add_79
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_8, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_8
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_72, parameter_256, False, False)
+        del dropout_72, parameter_256
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_80 = paddle._C_ops.add(matmul_53, parameter_255)
+        del matmul_53, parameter_255
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_80, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_80
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_81 = paddle._C_ops.add(dropout_74, layer_norm_48)
+        del dropout_74, layer_norm_48
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_81, parameter_260, parameter_259, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_81, parameter_259, parameter_260
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_54 = paddle._C_ops.matmul(layer_norm_51, parameter_254, False, False)
+        del parameter_254
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_63 = paddle._C_ops.reshape(matmul_54, full_int_array_5)
+        del matmul_54
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_51, parameter_253, False, False)
+        del parameter_253
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(matmul_55, full_int_array_5)
+        del matmul_55
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_56 = paddle._C_ops.matmul(layer_norm_51, parameter_252, False, False)
+        del parameter_252
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_65 = paddle._C_ops.reshape(matmul_56, full_int_array_5)
+        del matmul_56
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_57 = paddle._C_ops.matmul(dropout_2, parameter_250, False, False)
+        del parameter_250
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(matmul_57, full_int_array_6)
+        del matmul_57
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_82 = paddle._C_ops.add(reshape_63, parameter_247)
+        del parameter_247
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_56 = [add_82, reshape_64]
+        del add_82, reshape_64
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_165, einsum_166, einsum_167 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_56, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_56
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_220,
+            split_221,
+        ) = einsum_166
+        del einsum_166
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_222,
+            split_223,
+        ) = einsum_167
+        del einsum_167
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_83 = paddle._C_ops.add(reshape_63, parameter_249)
+        del parameter_249
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_57 = [add_83, reshape_66]
+        del add_83, reshape_66
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_168, einsum_169, einsum_170 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_57, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_57
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_224,
+            split_225,
+        ) = einsum_169
+        del einsum_169
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_226,
+            split_227,
+        ) = einsum_170
+        del einsum_170
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_67 = paddle._C_ops.reshape(einsum_168, full_int_array_7)
+        del einsum_168
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_9 = paddle._C_ops.slice(
+            reshape_67, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_67
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(slice_9, full_int_array_9)
+        del slice_9
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_9 = paddle._C_ops.index_select(reshape_68, arange_2, 3)
+        del reshape_68
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_84 = paddle._C_ops.add(reshape_63, parameter_248)
+        del parameter_248, reshape_63
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_58 = [add_84, parameter_246]
+        del add_84, parameter_246
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_171, einsum_172, einsum_173 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_58, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_58
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_228,
+            split_229,
+        ) = einsum_172
+        del einsum_172
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_230,
+            split_231,
+        ) = einsum_173
+        del einsum_173
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_59 = [cast_5, einsum_171]
+        del einsum_171
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_174, einsum_175, einsum_176 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_59, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_59
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_232,
+            split_233,
+        ) = einsum_175
+        del einsum_175
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_234,
+            split_235,
+        ) = einsum_176
+        del einsum_176
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_85 = paddle._C_ops.add(einsum_165, index_select_9)
+        del einsum_165, index_select_9
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_86 = paddle._C_ops.add(add_85, einsum_174)
+        del add_85, einsum_174
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(add_86, full_16, float("0"), True)
+        del add_86
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_9 = paddle._C_ops.subtract(scale_13, scale_4)
+        del scale_13
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_9 = paddle._C_ops.softmax(subtract_9, 3)
+        del subtract_9
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_60 = [dropout_76, reshape_65]
+        del dropout_76, reshape_65
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_177, einsum_178, einsum_179 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_60, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_60
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_236,
+            split_237,
+        ) = einsum_178
+        del einsum_178
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_238,
+            split_239,
+        ) = einsum_179
+        del einsum_179
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_69 = paddle._C_ops.reshape(einsum_177, full_int_array_10)
+        del einsum_177
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_61 = [reshape_69, parameter_251]
+        del parameter_251, reshape_69
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_180, einsum_181, einsum_182 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_61, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_61
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_240,
+            split_241,
+        ) = einsum_181
+        del einsum_181
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_242,
+            split_243,
+        ) = einsum_182
+        del einsum_182
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_180, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_180
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_87 = paddle._C_ops.add(dropout_78, layer_norm_51)
+        del dropout_78, layer_norm_51
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_87, parameter_245, parameter_244, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_87, parameter_244, parameter_245
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_54, parameter_241, False, False)
+        del parameter_241
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_88 = paddle._C_ops.add(matmul_58, parameter_240)
+        del matmul_58, parameter_240
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_9 = paddle._C_ops.gelu(add_88, False)
+        del add_88
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_9, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_9
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_59 = paddle._C_ops.matmul(dropout_80, parameter_239, False, False)
+        del dropout_80, parameter_239
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_89 = paddle._C_ops.add(matmul_59, parameter_238)
+        del matmul_59, parameter_238
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_89, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_89
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_90 = paddle._C_ops.add(dropout_82, layer_norm_54)
+        del dropout_82, layer_norm_54
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_90, parameter_243, parameter_242, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_90, parameter_242, parameter_243
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_60 = paddle._C_ops.matmul(layer_norm_57, parameter_237, False, False)
+        del parameter_237
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(matmul_60, full_int_array_5)
+        del matmul_60
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_61 = paddle._C_ops.matmul(layer_norm_57, parameter_236, False, False)
+        del parameter_236
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_71 = paddle._C_ops.reshape(matmul_61, full_int_array_5)
+        del matmul_61
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_62 = paddle._C_ops.matmul(layer_norm_57, parameter_235, False, False)
+        del parameter_235
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_72 = paddle._C_ops.reshape(matmul_62, full_int_array_5)
+        del matmul_62
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_63 = paddle._C_ops.matmul(dropout_2, parameter_233, False, False)
+        del parameter_233
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_73 = paddle._C_ops.reshape(matmul_63, full_int_array_6)
+        del matmul_63
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_91 = paddle._C_ops.add(reshape_70, parameter_230)
+        del parameter_230
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_62 = [add_91, reshape_71]
+        del add_91, reshape_71
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_183, einsum_184, einsum_185 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_62, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_62
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_244,
+            split_245,
+        ) = einsum_184
+        del einsum_184
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_246,
+            split_247,
+        ) = einsum_185
+        del einsum_185
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_92 = paddle._C_ops.add(reshape_70, parameter_232)
+        del parameter_232
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_63 = [add_92, reshape_73]
+        del add_92, reshape_73
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_186, einsum_187, einsum_188 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_63, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_63
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_248,
+            split_249,
+        ) = einsum_187
+        del einsum_187
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_250,
+            split_251,
+        ) = einsum_188
+        del einsum_188
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_74 = paddle._C_ops.reshape(einsum_186, full_int_array_7)
+        del einsum_186
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_10 = paddle._C_ops.slice(
+            reshape_74, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_74
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_75 = paddle._C_ops.reshape(slice_10, full_int_array_9)
+        del slice_10
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_10 = paddle._C_ops.index_select(reshape_75, arange_2, 3)
+        del reshape_75
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_93 = paddle._C_ops.add(reshape_70, parameter_231)
+        del parameter_231, reshape_70
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_64 = [add_93, parameter_229]
+        del add_93, parameter_229
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_189, einsum_190, einsum_191 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_64, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_64
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_252,
+            split_253,
+        ) = einsum_190
+        del einsum_190
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_254,
+            split_255,
+        ) = einsum_191
+        del einsum_191
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_65 = [cast_5, einsum_189]
+        del einsum_189
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_192, einsum_193, einsum_194 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_65, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_65
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_256,
+            split_257,
+        ) = einsum_193
+        del einsum_193
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_258,
+            split_259,
+        ) = einsum_194
+        del einsum_194
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_94 = paddle._C_ops.add(einsum_183, index_select_10)
+        del einsum_183, index_select_10
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_95 = paddle._C_ops.add(add_94, einsum_192)
+        del add_94, einsum_192
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(add_95, full_16, float("0"), True)
+        del add_95
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_10 = paddle._C_ops.subtract(scale_14, scale_4)
+        del scale_14
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_10 = paddle._C_ops.softmax(subtract_10, 3)
+        del subtract_10
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_66 = [dropout_84, reshape_72]
+        del dropout_84, reshape_72
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_195, einsum_196, einsum_197 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_66, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_66
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_260,
+            split_261,
+        ) = einsum_196
+        del einsum_196
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_262,
+            split_263,
+        ) = einsum_197
+        del einsum_197
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_76 = paddle._C_ops.reshape(einsum_195, full_int_array_10)
+        del einsum_195
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_67 = [reshape_76, parameter_234]
+        del parameter_234, reshape_76
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_198, einsum_199, einsum_200 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_67, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_67
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_264,
+            split_265,
+        ) = einsum_199
+        del einsum_199
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_266,
+            split_267,
+        ) = einsum_200
+        del einsum_200
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_198, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_198
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_96 = paddle._C_ops.add(dropout_86, layer_norm_57)
+        del dropout_86, layer_norm_57
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_96, parameter_228, parameter_227, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_96, parameter_227, parameter_228
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_64 = paddle._C_ops.matmul(layer_norm_60, parameter_224, False, False)
+        del parameter_224
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_97 = paddle._C_ops.add(matmul_64, parameter_223)
+        del matmul_64, parameter_223
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_10 = paddle._C_ops.gelu(add_97, False)
+        del add_97
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_10, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_10
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_65 = paddle._C_ops.matmul(dropout_88, parameter_222, False, False)
+        del dropout_88, parameter_222
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_98 = paddle._C_ops.add(matmul_65, parameter_221)
+        del matmul_65, parameter_221
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_98, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_98
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_99 = paddle._C_ops.add(dropout_90, layer_norm_60)
+        del dropout_90, layer_norm_60
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_99, parameter_226, parameter_225, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_99, parameter_225, parameter_226
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_63, parameter_220, False, False)
+        del parameter_220
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_77 = paddle._C_ops.reshape(matmul_66, full_int_array_5)
+        del matmul_66
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_63, parameter_219, False, False)
+        del parameter_219
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_78 = paddle._C_ops.reshape(matmul_67, full_int_array_5)
+        del matmul_67
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_68 = paddle._C_ops.matmul(layer_norm_63, parameter_218, False, False)
+        del parameter_218
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_79 = paddle._C_ops.reshape(matmul_68, full_int_array_5)
+        del matmul_68
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_69 = paddle._C_ops.matmul(dropout_2, parameter_216, False, False)
+        del parameter_216
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_80 = paddle._C_ops.reshape(matmul_69, full_int_array_6)
+        del matmul_69
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_100 = paddle._C_ops.add(reshape_77, parameter_213)
+        del parameter_213
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_68 = [add_100, reshape_78]
+        del add_100, reshape_78
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_201, einsum_202, einsum_203 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_68, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_68
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_268,
+            split_269,
+        ) = einsum_202
+        del einsum_202
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_270,
+            split_271,
+        ) = einsum_203
+        del einsum_203
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_101 = paddle._C_ops.add(reshape_77, parameter_215)
+        del parameter_215
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_69 = [add_101, reshape_80]
+        del add_101, reshape_80
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_204, einsum_205, einsum_206 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_69, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_69
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_272,
+            split_273,
+        ) = einsum_205
+        del einsum_205
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_274,
+            split_275,
+        ) = einsum_206
+        del einsum_206
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_81 = paddle._C_ops.reshape(einsum_204, full_int_array_7)
+        del einsum_204
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_11 = paddle._C_ops.slice(
+            reshape_81, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_81
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_82 = paddle._C_ops.reshape(slice_11, full_int_array_9)
+        del slice_11
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_11 = paddle._C_ops.index_select(reshape_82, arange_2, 3)
+        del reshape_82
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_102 = paddle._C_ops.add(reshape_77, parameter_214)
+        del parameter_214, reshape_77
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_70 = [add_102, parameter_212]
+        del add_102, parameter_212
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_207, einsum_208, einsum_209 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_70, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_70
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_276,
+            split_277,
+        ) = einsum_208
+        del einsum_208
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_278,
+            split_279,
+        ) = einsum_209
+        del einsum_209
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_71 = [cast_5, einsum_207]
+        del einsum_207
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_210, einsum_211, einsum_212 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_71, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_71
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_280,
+            split_281,
+        ) = einsum_211
+        del einsum_211
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_282,
+            split_283,
+        ) = einsum_212
+        del einsum_212
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_103 = paddle._C_ops.add(einsum_201, index_select_11)
+        del einsum_201, index_select_11
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_104 = paddle._C_ops.add(add_103, einsum_210)
+        del add_103, einsum_210
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(add_104, full_16, float("0"), True)
+        del add_104
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_11 = paddle._C_ops.subtract(scale_15, scale_4)
+        del scale_15
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_11 = paddle._C_ops.softmax(subtract_11, 3)
+        del subtract_11
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_72 = [dropout_92, reshape_79]
+        del dropout_92, reshape_79
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_213, einsum_214, einsum_215 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_72, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_72
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_284,
+            split_285,
+        ) = einsum_214
+        del einsum_214
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_286,
+            split_287,
+        ) = einsum_215
+        del einsum_215
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_83 = paddle._C_ops.reshape(einsum_213, full_int_array_10)
+        del einsum_213
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_73 = [reshape_83, parameter_217]
+        del parameter_217, reshape_83
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_216, einsum_217, einsum_218 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_73, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_73
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_288,
+            split_289,
+        ) = einsum_217
+        del einsum_217
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_290,
+            split_291,
+        ) = einsum_218
+        del einsum_218
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_216, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_216
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_105 = paddle._C_ops.add(dropout_94, layer_norm_63)
+        del dropout_94, layer_norm_63
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_105, parameter_211, parameter_210, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_105, parameter_210, parameter_211
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_70 = paddle._C_ops.matmul(layer_norm_66, parameter_207, False, False)
+        del parameter_207
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_106 = paddle._C_ops.add(matmul_70, parameter_206)
+        del matmul_70, parameter_206
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_11 = paddle._C_ops.gelu(add_106, False)
+        del add_106
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_11, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_11
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_71 = paddle._C_ops.matmul(dropout_96, parameter_205, False, False)
+        del dropout_96, parameter_205
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_107 = paddle._C_ops.add(matmul_71, parameter_204)
+        del matmul_71, parameter_204
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_107, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_107
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_108 = paddle._C_ops.add(dropout_98, layer_norm_66)
+        del dropout_98, layer_norm_66
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_108, parameter_209, parameter_208, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_108, parameter_208, parameter_209
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_72 = paddle._C_ops.matmul(layer_norm_69, parameter_203, False, False)
+        del parameter_203
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_84 = paddle._C_ops.reshape(matmul_72, full_int_array_5)
+        del matmul_72
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_73 = paddle._C_ops.matmul(layer_norm_69, parameter_202, False, False)
+        del parameter_202
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_85 = paddle._C_ops.reshape(matmul_73, full_int_array_5)
+        del matmul_73
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_69, parameter_201, False, False)
+        del parameter_201
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_86 = paddle._C_ops.reshape(matmul_74, full_int_array_5)
+        del matmul_74
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_75 = paddle._C_ops.matmul(dropout_2, parameter_199, False, False)
+        del parameter_199
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_87 = paddle._C_ops.reshape(matmul_75, full_int_array_6)
+        del matmul_75
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_109 = paddle._C_ops.add(reshape_84, parameter_196)
+        del parameter_196
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_74 = [add_109, reshape_85]
+        del add_109, reshape_85
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_219, einsum_220, einsum_221 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_74, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_74
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_292,
+            split_293,
+        ) = einsum_220
+        del einsum_220
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_294,
+            split_295,
+        ) = einsum_221
+        del einsum_221
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_110 = paddle._C_ops.add(reshape_84, parameter_198)
+        del parameter_198
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_75 = [add_110, reshape_87]
+        del add_110, reshape_87
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_222, einsum_223, einsum_224 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_75, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_75
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_296,
+            split_297,
+        ) = einsum_223
+        del einsum_223
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_298,
+            split_299,
+        ) = einsum_224
+        del einsum_224
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_88 = paddle._C_ops.reshape(einsum_222, full_int_array_7)
+        del einsum_222
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_12 = paddle._C_ops.slice(
+            reshape_88, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_88
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_89 = paddle._C_ops.reshape(slice_12, full_int_array_9)
+        del slice_12
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_12 = paddle._C_ops.index_select(reshape_89, arange_2, 3)
+        del reshape_89
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_111 = paddle._C_ops.add(reshape_84, parameter_197)
+        del parameter_197, reshape_84
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_76 = [add_111, parameter_195]
+        del add_111, parameter_195
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_225, einsum_226, einsum_227 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_76, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_76
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_300,
+            split_301,
+        ) = einsum_226
+        del einsum_226
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_302,
+            split_303,
+        ) = einsum_227
+        del einsum_227
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_77 = [cast_5, einsum_225]
+        del einsum_225
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_228, einsum_229, einsum_230 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_77, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_77
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_304,
+            split_305,
+        ) = einsum_229
+        del einsum_229
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_306,
+            split_307,
+        ) = einsum_230
+        del einsum_230
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_112 = paddle._C_ops.add(einsum_219, index_select_12)
+        del einsum_219, index_select_12
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_113 = paddle._C_ops.add(add_112, einsum_228)
+        del add_112, einsum_228
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_16 = paddle._C_ops.scale(add_113, full_16, float("0"), True)
+        del add_113
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_12 = paddle._C_ops.subtract(scale_16, scale_4)
+        del scale_16
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_12 = paddle._C_ops.softmax(subtract_12, 3)
+        del subtract_12
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_12, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_12
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_78 = [dropout_100, reshape_86]
+        del dropout_100, reshape_86
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_231, einsum_232, einsum_233 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_78, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_78
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_308,
+            split_309,
+        ) = einsum_232
+        del einsum_232
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_310,
+            split_311,
+        ) = einsum_233
+        del einsum_233
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_90 = paddle._C_ops.reshape(einsum_231, full_int_array_10)
+        del einsum_231
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_79 = [reshape_90, parameter_200]
+        del parameter_200, reshape_90
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_234, einsum_235, einsum_236 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_79, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_79
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_312,
+            split_313,
+        ) = einsum_235
+        del einsum_235
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_314,
+            split_315,
+        ) = einsum_236
+        del einsum_236
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_102, dropout_103 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_234, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_234
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_114 = paddle._C_ops.add(dropout_102, layer_norm_69)
+        del dropout_102, layer_norm_69
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_72, layer_norm_73, layer_norm_74 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_114, parameter_194, parameter_193, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_114, parameter_193, parameter_194
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_76 = paddle._C_ops.matmul(layer_norm_72, parameter_190, False, False)
+        del parameter_190
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_115 = paddle._C_ops.add(matmul_76, parameter_189)
+        del matmul_76, parameter_189
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_12 = paddle._C_ops.gelu(add_115, False)
+        del add_115
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_104, dropout_105 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_12, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_12
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_77 = paddle._C_ops.matmul(dropout_104, parameter_188, False, False)
+        del dropout_104, parameter_188
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_116 = paddle._C_ops.add(matmul_77, parameter_187)
+        del matmul_77, parameter_187
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_106, dropout_107 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_116, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_116
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_117 = paddle._C_ops.add(dropout_106, layer_norm_72)
+        del dropout_106, layer_norm_72
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_75, layer_norm_76, layer_norm_77 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_117, parameter_192, parameter_191, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_117, parameter_191, parameter_192
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_78 = paddle._C_ops.matmul(layer_norm_75, parameter_186, False, False)
+        del parameter_186
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_91 = paddle._C_ops.reshape(matmul_78, full_int_array_5)
+        del matmul_78
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_79 = paddle._C_ops.matmul(layer_norm_75, parameter_185, False, False)
+        del parameter_185
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_92 = paddle._C_ops.reshape(matmul_79, full_int_array_5)
+        del matmul_79
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_80 = paddle._C_ops.matmul(layer_norm_75, parameter_184, False, False)
+        del parameter_184
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_93 = paddle._C_ops.reshape(matmul_80, full_int_array_5)
+        del matmul_80
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_81 = paddle._C_ops.matmul(dropout_2, parameter_182, False, False)
+        del parameter_182
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_94 = paddle._C_ops.reshape(matmul_81, full_int_array_6)
+        del matmul_81
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_118 = paddle._C_ops.add(reshape_91, parameter_179)
+        del parameter_179
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_80 = [add_118, reshape_92]
+        del add_118, reshape_92
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_237, einsum_238, einsum_239 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_80, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_80
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_316,
+            split_317,
+        ) = einsum_238
+        del einsum_238
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_318,
+            split_319,
+        ) = einsum_239
+        del einsum_239
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_119 = paddle._C_ops.add(reshape_91, parameter_181)
+        del parameter_181
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_81 = [add_119, reshape_94]
+        del add_119, reshape_94
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_240, einsum_241, einsum_242 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_81, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_81
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_320,
+            split_321,
+        ) = einsum_241
+        del einsum_241
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_322,
+            split_323,
+        ) = einsum_242
+        del einsum_242
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_95 = paddle._C_ops.reshape(einsum_240, full_int_array_7)
+        del einsum_240
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_13 = paddle._C_ops.slice(
+            reshape_95, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_95
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_96 = paddle._C_ops.reshape(slice_13, full_int_array_9)
+        del slice_13
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_13 = paddle._C_ops.index_select(reshape_96, arange_2, 3)
+        del reshape_96
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_120 = paddle._C_ops.add(reshape_91, parameter_180)
+        del parameter_180, reshape_91
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_82 = [add_120, parameter_178]
+        del add_120, parameter_178
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_243, einsum_244, einsum_245 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_82, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_82
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_324,
+            split_325,
+        ) = einsum_244
+        del einsum_244
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_326,
+            split_327,
+        ) = einsum_245
+        del einsum_245
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_83 = [cast_5, einsum_243]
+        del einsum_243
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_246, einsum_247, einsum_248 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_83, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_83
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_328,
+            split_329,
+        ) = einsum_247
+        del einsum_247
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_330,
+            split_331,
+        ) = einsum_248
+        del einsum_248
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_121 = paddle._C_ops.add(einsum_237, index_select_13)
+        del einsum_237, index_select_13
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_122 = paddle._C_ops.add(add_121, einsum_246)
+        del add_121, einsum_246
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_17 = paddle._C_ops.scale(add_122, full_16, float("0"), True)
+        del add_122
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_13 = paddle._C_ops.subtract(scale_17, scale_4)
+        del scale_17
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_13 = paddle._C_ops.softmax(subtract_13, 3)
+        del subtract_13
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_108, dropout_109 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_13, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_13
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_84 = [dropout_108, reshape_93]
+        del dropout_108, reshape_93
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_249, einsum_250, einsum_251 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_84, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_84
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_332,
+            split_333,
+        ) = einsum_250
+        del einsum_250
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_334,
+            split_335,
+        ) = einsum_251
+        del einsum_251
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_97 = paddle._C_ops.reshape(einsum_249, full_int_array_10)
+        del einsum_249
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_85 = [reshape_97, parameter_183]
+        del parameter_183, reshape_97
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_252, einsum_253, einsum_254 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_85, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_85
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_336,
+            split_337,
+        ) = einsum_253
+        del einsum_253
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_338,
+            split_339,
+        ) = einsum_254
+        del einsum_254
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_110, dropout_111 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_252, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_252
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_123 = paddle._C_ops.add(dropout_110, layer_norm_75)
+        del dropout_110, layer_norm_75
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_78, layer_norm_79, layer_norm_80 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_123, parameter_177, parameter_176, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_123, parameter_176, parameter_177
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_78, parameter_173, False, False)
+        del parameter_173
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_124 = paddle._C_ops.add(matmul_82, parameter_172)
+        del matmul_82, parameter_172
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_13 = paddle._C_ops.gelu(add_124, False)
+        del add_124
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_112, dropout_113 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_13, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_13
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_83 = paddle._C_ops.matmul(dropout_112, parameter_171, False, False)
+        del dropout_112, parameter_171
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_125 = paddle._C_ops.add(matmul_83, parameter_170)
+        del matmul_83, parameter_170
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_114, dropout_115 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_125, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_125
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_126 = paddle._C_ops.add(dropout_114, layer_norm_78)
+        del dropout_114, layer_norm_78
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_81, layer_norm_82, layer_norm_83 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_126, parameter_175, parameter_174, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_126, parameter_174, parameter_175
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_84 = paddle._C_ops.matmul(layer_norm_81, parameter_169, False, False)
+        del parameter_169
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_98 = paddle._C_ops.reshape(matmul_84, full_int_array_5)
+        del matmul_84
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_85 = paddle._C_ops.matmul(layer_norm_81, parameter_168, False, False)
+        del parameter_168
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_99 = paddle._C_ops.reshape(matmul_85, full_int_array_5)
+        del matmul_85
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_86 = paddle._C_ops.matmul(layer_norm_81, parameter_167, False, False)
+        del parameter_167
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_100 = paddle._C_ops.reshape(matmul_86, full_int_array_5)
+        del matmul_86
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_87 = paddle._C_ops.matmul(dropout_2, parameter_165, False, False)
+        del parameter_165
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_101 = paddle._C_ops.reshape(matmul_87, full_int_array_6)
+        del matmul_87
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_127 = paddle._C_ops.add(reshape_98, parameter_162)
+        del parameter_162
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_86 = [add_127, reshape_99]
+        del add_127, reshape_99
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_255, einsum_256, einsum_257 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_86, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_86
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_340,
+            split_341,
+        ) = einsum_256
+        del einsum_256
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_342,
+            split_343,
+        ) = einsum_257
+        del einsum_257
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_128 = paddle._C_ops.add(reshape_98, parameter_164)
+        del parameter_164
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_87 = [add_128, reshape_101]
+        del add_128, reshape_101
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_258, einsum_259, einsum_260 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_87, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_87
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_344,
+            split_345,
+        ) = einsum_259
+        del einsum_259
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_346,
+            split_347,
+        ) = einsum_260
+        del einsum_260
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_102 = paddle._C_ops.reshape(einsum_258, full_int_array_7)
+        del einsum_258
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_14 = paddle._C_ops.slice(
+            reshape_102, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_102
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_103 = paddle._C_ops.reshape(slice_14, full_int_array_9)
+        del slice_14
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_14 = paddle._C_ops.index_select(reshape_103, arange_2, 3)
+        del reshape_103
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_129 = paddle._C_ops.add(reshape_98, parameter_163)
+        del parameter_163, reshape_98
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_88 = [add_129, parameter_161]
+        del add_129, parameter_161
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_261, einsum_262, einsum_263 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_88, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_88
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_348,
+            split_349,
+        ) = einsum_262
+        del einsum_262
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_350,
+            split_351,
+        ) = einsum_263
+        del einsum_263
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_89 = [cast_5, einsum_261]
+        del einsum_261
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_264, einsum_265, einsum_266 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_89, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_89
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_352,
+            split_353,
+        ) = einsum_265
+        del einsum_265
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_354,
+            split_355,
+        ) = einsum_266
+        del einsum_266
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_130 = paddle._C_ops.add(einsum_255, index_select_14)
+        del einsum_255, index_select_14
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_131 = paddle._C_ops.add(add_130, einsum_264)
+        del add_130, einsum_264
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_18 = paddle._C_ops.scale(add_131, full_16, float("0"), True)
+        del add_131
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_14 = paddle._C_ops.subtract(scale_18, scale_4)
+        del scale_18
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_14 = paddle._C_ops.softmax(subtract_14, 3)
+        del subtract_14
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_116, dropout_117 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_14, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_14
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_90 = [dropout_116, reshape_100]
+        del dropout_116, reshape_100
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_267, einsum_268, einsum_269 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_90, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_90
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_356,
+            split_357,
+        ) = einsum_268
+        del einsum_268
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_358,
+            split_359,
+        ) = einsum_269
+        del einsum_269
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_104 = paddle._C_ops.reshape(einsum_267, full_int_array_10)
+        del einsum_267
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_91 = [reshape_104, parameter_166]
+        del parameter_166, reshape_104
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_270, einsum_271, einsum_272 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_91, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_91
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_360,
+            split_361,
+        ) = einsum_271
+        del einsum_271
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_362,
+            split_363,
+        ) = einsum_272
+        del einsum_272
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_118, dropout_119 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_270, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_270
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_132 = paddle._C_ops.add(dropout_118, layer_norm_81)
+        del dropout_118, layer_norm_81
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_84, layer_norm_85, layer_norm_86 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_132, parameter_160, parameter_159, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_132, parameter_159, parameter_160
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_88 = paddle._C_ops.matmul(layer_norm_84, parameter_156, False, False)
+        del parameter_156
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_133 = paddle._C_ops.add(matmul_88, parameter_155)
+        del matmul_88, parameter_155
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_14 = paddle._C_ops.gelu(add_133, False)
+        del add_133
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_120, dropout_121 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_14, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_14
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_89 = paddle._C_ops.matmul(dropout_120, parameter_154, False, False)
+        del dropout_120, parameter_154
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_134 = paddle._C_ops.add(matmul_89, parameter_153)
+        del matmul_89, parameter_153
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_122, dropout_123 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_134, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_134
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_135 = paddle._C_ops.add(dropout_122, layer_norm_84)
+        del dropout_122, layer_norm_84
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_87, layer_norm_88, layer_norm_89 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_135, parameter_158, parameter_157, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_135, parameter_157, parameter_158
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_87, parameter_152, False, False)
+        del parameter_152
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_105 = paddle._C_ops.reshape(matmul_90, full_int_array_5)
+        del matmul_90
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_87, parameter_151, False, False)
+        del parameter_151
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_106 = paddle._C_ops.reshape(matmul_91, full_int_array_5)
+        del matmul_91
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_92 = paddle._C_ops.matmul(layer_norm_87, parameter_150, False, False)
+        del parameter_150
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_107 = paddle._C_ops.reshape(matmul_92, full_int_array_5)
+        del matmul_92
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_93 = paddle._C_ops.matmul(dropout_2, parameter_148, False, False)
+        del parameter_148
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_108 = paddle._C_ops.reshape(matmul_93, full_int_array_6)
+        del matmul_93
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_136 = paddle._C_ops.add(reshape_105, parameter_145)
+        del parameter_145
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_92 = [add_136, reshape_106]
+        del add_136, reshape_106
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_273, einsum_274, einsum_275 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_92, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_92
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_364,
+            split_365,
+        ) = einsum_274
+        del einsum_274
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_366,
+            split_367,
+        ) = einsum_275
+        del einsum_275
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_137 = paddle._C_ops.add(reshape_105, parameter_147)
+        del parameter_147
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_93 = [add_137, reshape_108]
+        del add_137, reshape_108
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_276, einsum_277, einsum_278 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_93, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_93
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_368,
+            split_369,
+        ) = einsum_277
+        del einsum_277
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_370,
+            split_371,
+        ) = einsum_278
+        del einsum_278
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_109 = paddle._C_ops.reshape(einsum_276, full_int_array_7)
+        del einsum_276
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_15 = paddle._C_ops.slice(
+            reshape_109, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_109
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_110 = paddle._C_ops.reshape(slice_15, full_int_array_9)
+        del slice_15
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_15 = paddle._C_ops.index_select(reshape_110, arange_2, 3)
+        del reshape_110
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_138 = paddle._C_ops.add(reshape_105, parameter_146)
+        del parameter_146, reshape_105
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_94 = [add_138, parameter_144]
+        del add_138, parameter_144
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_279, einsum_280, einsum_281 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_94, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_94
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_372,
+            split_373,
+        ) = einsum_280
+        del einsum_280
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_374,
+            split_375,
+        ) = einsum_281
+        del einsum_281
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_95 = [cast_5, einsum_279]
+        del einsum_279
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_282, einsum_283, einsum_284 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_95, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_95
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_376,
+            split_377,
+        ) = einsum_283
+        del einsum_283
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_378,
+            split_379,
+        ) = einsum_284
+        del einsum_284
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_139 = paddle._C_ops.add(einsum_273, index_select_15)
+        del einsum_273, index_select_15
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_140 = paddle._C_ops.add(add_139, einsum_282)
+        del add_139, einsum_282
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_19 = paddle._C_ops.scale(add_140, full_16, float("0"), True)
+        del add_140
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_15 = paddle._C_ops.subtract(scale_19, scale_4)
+        del scale_19
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_15 = paddle._C_ops.softmax(subtract_15, 3)
+        del subtract_15
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_124, dropout_125 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_15, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_15
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_96 = [dropout_124, reshape_107]
+        del dropout_124, reshape_107
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_285, einsum_286, einsum_287 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_96, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_96
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_380,
+            split_381,
+        ) = einsum_286
+        del einsum_286
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_382,
+            split_383,
+        ) = einsum_287
+        del einsum_287
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_111 = paddle._C_ops.reshape(einsum_285, full_int_array_10)
+        del einsum_285
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_97 = [reshape_111, parameter_149]
+        del parameter_149, reshape_111
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_288, einsum_289, einsum_290 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_97, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_97
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_384,
+            split_385,
+        ) = einsum_289
+        del einsum_289
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_386,
+            split_387,
+        ) = einsum_290
+        del einsum_290
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_126, dropout_127 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_288, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_288
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_141 = paddle._C_ops.add(dropout_126, layer_norm_87)
+        del dropout_126, layer_norm_87
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_90, layer_norm_91, layer_norm_92 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_141, parameter_143, parameter_142, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_141, parameter_142, parameter_143
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_94 = paddle._C_ops.matmul(layer_norm_90, parameter_139, False, False)
+        del parameter_139
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_142 = paddle._C_ops.add(matmul_94, parameter_138)
+        del matmul_94, parameter_138
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_15 = paddle._C_ops.gelu(add_142, False)
+        del add_142
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_128, dropout_129 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_15, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_15
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_95 = paddle._C_ops.matmul(dropout_128, parameter_137, False, False)
+        del dropout_128, parameter_137
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_143 = paddle._C_ops.add(matmul_95, parameter_136)
+        del matmul_95, parameter_136
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_130, dropout_131 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_143, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_143
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_144 = paddle._C_ops.add(dropout_130, layer_norm_90)
+        del dropout_130, layer_norm_90
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_93, layer_norm_94, layer_norm_95 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_144, parameter_141, parameter_140, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_144, parameter_140, parameter_141
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_96 = paddle._C_ops.matmul(layer_norm_93, parameter_135, False, False)
+        del parameter_135
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_112 = paddle._C_ops.reshape(matmul_96, full_int_array_5)
+        del matmul_96
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_97 = paddle._C_ops.matmul(layer_norm_93, parameter_134, False, False)
+        del parameter_134
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_113 = paddle._C_ops.reshape(matmul_97, full_int_array_5)
+        del matmul_97
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_98 = paddle._C_ops.matmul(layer_norm_93, parameter_133, False, False)
+        del parameter_133
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_114 = paddle._C_ops.reshape(matmul_98, full_int_array_5)
+        del matmul_98
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_99 = paddle._C_ops.matmul(dropout_2, parameter_131, False, False)
+        del parameter_131
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_115 = paddle._C_ops.reshape(matmul_99, full_int_array_6)
+        del matmul_99
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_145 = paddle._C_ops.add(reshape_112, parameter_128)
+        del parameter_128
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_98 = [add_145, reshape_113]
+        del add_145, reshape_113
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_291, einsum_292, einsum_293 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_98, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_98
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_388,
+            split_389,
+        ) = einsum_292
+        del einsum_292
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_390,
+            split_391,
+        ) = einsum_293
+        del einsum_293
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_146 = paddle._C_ops.add(reshape_112, parameter_130)
+        del parameter_130
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_99 = [add_146, reshape_115]
+        del add_146, reshape_115
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_294, einsum_295, einsum_296 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_99, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_99
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_392,
+            split_393,
+        ) = einsum_295
+        del einsum_295
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_394,
+            split_395,
+        ) = einsum_296
+        del einsum_296
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_116 = paddle._C_ops.reshape(einsum_294, full_int_array_7)
+        del einsum_294
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_16 = paddle._C_ops.slice(
+            reshape_116, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_116
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_117 = paddle._C_ops.reshape(slice_16, full_int_array_9)
+        del slice_16
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_16 = paddle._C_ops.index_select(reshape_117, arange_2, 3)
+        del reshape_117
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_147 = paddle._C_ops.add(reshape_112, parameter_129)
+        del parameter_129, reshape_112
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_100 = [add_147, parameter_127]
+        del add_147, parameter_127
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_297, einsum_298, einsum_299 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_100, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_100
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_396,
+            split_397,
+        ) = einsum_298
+        del einsum_298
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_398,
+            split_399,
+        ) = einsum_299
+        del einsum_299
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_101 = [cast_5, einsum_297]
+        del einsum_297
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_300, einsum_301, einsum_302 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_101, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_101
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_400,
+            split_401,
+        ) = einsum_301
+        del einsum_301
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_402,
+            split_403,
+        ) = einsum_302
+        del einsum_302
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_148 = paddle._C_ops.add(einsum_291, index_select_16)
+        del einsum_291, index_select_16
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_149 = paddle._C_ops.add(add_148, einsum_300)
+        del add_148, einsum_300
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_20 = paddle._C_ops.scale(add_149, full_16, float("0"), True)
+        del add_149
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_16 = paddle._C_ops.subtract(scale_20, scale_4)
+        del scale_20
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_16 = paddle._C_ops.softmax(subtract_16, 3)
+        del subtract_16
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_132, dropout_133 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_16, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_16
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_102 = [dropout_132, reshape_114]
+        del dropout_132, reshape_114
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_303, einsum_304, einsum_305 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_102, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_102
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_404,
+            split_405,
+        ) = einsum_304
+        del einsum_304
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_406,
+            split_407,
+        ) = einsum_305
+        del einsum_305
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_118 = paddle._C_ops.reshape(einsum_303, full_int_array_10)
+        del einsum_303
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_103 = [reshape_118, parameter_132]
+        del parameter_132, reshape_118
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_306, einsum_307, einsum_308 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_103, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_103
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_408,
+            split_409,
+        ) = einsum_307
+        del einsum_307
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_410,
+            split_411,
+        ) = einsum_308
+        del einsum_308
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_134, dropout_135 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_306, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_306
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_150 = paddle._C_ops.add(dropout_134, layer_norm_93)
+        del dropout_134, layer_norm_93
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_96, layer_norm_97, layer_norm_98 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_150, parameter_126, parameter_125, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_150, parameter_125, parameter_126
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_100 = paddle._C_ops.matmul(layer_norm_96, parameter_122, False, False)
+        del parameter_122
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_151 = paddle._C_ops.add(matmul_100, parameter_121)
+        del matmul_100, parameter_121
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_16 = paddle._C_ops.gelu(add_151, False)
+        del add_151
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_136, dropout_137 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_16, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_16
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_101 = paddle._C_ops.matmul(dropout_136, parameter_120, False, False)
+        del dropout_136, parameter_120
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_152 = paddle._C_ops.add(matmul_101, parameter_119)
+        del matmul_101, parameter_119
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_138, dropout_139 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_152, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_152
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_153 = paddle._C_ops.add(dropout_138, layer_norm_96)
+        del dropout_138, layer_norm_96
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_99, layer_norm_100, layer_norm_101 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_153, parameter_124, parameter_123, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_153, parameter_123, parameter_124
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_102 = paddle._C_ops.matmul(layer_norm_99, parameter_118, False, False)
+        del parameter_118
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_119 = paddle._C_ops.reshape(matmul_102, full_int_array_5)
+        del matmul_102
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_103 = paddle._C_ops.matmul(layer_norm_99, parameter_117, False, False)
+        del parameter_117
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_120 = paddle._C_ops.reshape(matmul_103, full_int_array_5)
+        del matmul_103
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_104 = paddle._C_ops.matmul(layer_norm_99, parameter_116, False, False)
+        del parameter_116
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_121 = paddle._C_ops.reshape(matmul_104, full_int_array_5)
+        del matmul_104
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_105 = paddle._C_ops.matmul(dropout_2, parameter_114, False, False)
+        del parameter_114
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_122 = paddle._C_ops.reshape(matmul_105, full_int_array_6)
+        del matmul_105
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_154 = paddle._C_ops.add(reshape_119, parameter_111)
+        del parameter_111
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_104 = [add_154, reshape_120]
+        del add_154, reshape_120
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_309, einsum_310, einsum_311 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_104, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_104
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_412,
+            split_413,
+        ) = einsum_310
+        del einsum_310
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_414,
+            split_415,
+        ) = einsum_311
+        del einsum_311
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_155 = paddle._C_ops.add(reshape_119, parameter_113)
+        del parameter_113
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_105 = [add_155, reshape_122]
+        del add_155, reshape_122
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_312, einsum_313, einsum_314 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_105, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_105
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_416,
+            split_417,
+        ) = einsum_313
+        del einsum_313
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_418,
+            split_419,
+        ) = einsum_314
+        del einsum_314
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_123 = paddle._C_ops.reshape(einsum_312, full_int_array_7)
+        del einsum_312
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_17 = paddle._C_ops.slice(
+            reshape_123, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_123
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_124 = paddle._C_ops.reshape(slice_17, full_int_array_9)
+        del slice_17
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_17 = paddle._C_ops.index_select(reshape_124, arange_2, 3)
+        del reshape_124
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_156 = paddle._C_ops.add(reshape_119, parameter_112)
+        del parameter_112, reshape_119
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_106 = [add_156, parameter_110]
+        del add_156, parameter_110
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_315, einsum_316, einsum_317 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_106, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_106
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_420,
+            split_421,
+        ) = einsum_316
+        del einsum_316
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_422,
+            split_423,
+        ) = einsum_317
+        del einsum_317
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_107 = [cast_5, einsum_315]
+        del einsum_315
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_318, einsum_319, einsum_320 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_107, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_107
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_424,
+            split_425,
+        ) = einsum_319
+        del einsum_319
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_426,
+            split_427,
+        ) = einsum_320
+        del einsum_320
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_157 = paddle._C_ops.add(einsum_309, index_select_17)
+        del einsum_309, index_select_17
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_158 = paddle._C_ops.add(add_157, einsum_318)
+        del add_157, einsum_318
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_21 = paddle._C_ops.scale(add_158, full_16, float("0"), True)
+        del add_158
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_17 = paddle._C_ops.subtract(scale_21, scale_4)
+        del scale_21
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_17 = paddle._C_ops.softmax(subtract_17, 3)
+        del subtract_17
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_140, dropout_141 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_17
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_108 = [dropout_140, reshape_121]
+        del dropout_140, reshape_121
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_321, einsum_322, einsum_323 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_108, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_108
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_428,
+            split_429,
+        ) = einsum_322
+        del einsum_322
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_430,
+            split_431,
+        ) = einsum_323
+        del einsum_323
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_125 = paddle._C_ops.reshape(einsum_321, full_int_array_10)
+        del einsum_321
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_109 = [reshape_125, parameter_115]
+        del parameter_115, reshape_125
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_324, einsum_325, einsum_326 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_109, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_109
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_432,
+            split_433,
+        ) = einsum_325
+        del einsum_325
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_434,
+            split_435,
+        ) = einsum_326
+        del einsum_326
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_142, dropout_143 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_324, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_324
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_159 = paddle._C_ops.add(dropout_142, layer_norm_99)
+        del dropout_142, layer_norm_99
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_102, layer_norm_103, layer_norm_104 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_159, parameter_109, parameter_108, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_159, parameter_108, parameter_109
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_106 = paddle._C_ops.matmul(layer_norm_102, parameter_105, False, False)
+        del parameter_105
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_160 = paddle._C_ops.add(matmul_106, parameter_104)
+        del matmul_106, parameter_104
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_17 = paddle._C_ops.gelu(add_160, False)
+        del add_160
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_144, dropout_145 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_17, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_17
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_107 = paddle._C_ops.matmul(dropout_144, parameter_103, False, False)
+        del dropout_144, parameter_103
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_161 = paddle._C_ops.add(matmul_107, parameter_102)
+        del matmul_107, parameter_102
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_146, dropout_147 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_161, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_161
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_162 = paddle._C_ops.add(dropout_146, layer_norm_102)
+        del dropout_146, layer_norm_102
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_105, layer_norm_106, layer_norm_107 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_162, parameter_107, parameter_106, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_162, parameter_106, parameter_107
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_108 = paddle._C_ops.matmul(layer_norm_105, parameter_101, False, False)
+        del parameter_101
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_126 = paddle._C_ops.reshape(matmul_108, full_int_array_5)
+        del matmul_108
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_109 = paddle._C_ops.matmul(layer_norm_105, parameter_100, False, False)
+        del parameter_100
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_127 = paddle._C_ops.reshape(matmul_109, full_int_array_5)
+        del matmul_109
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_110 = paddle._C_ops.matmul(layer_norm_105, parameter_99, False, False)
+        del parameter_99
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_128 = paddle._C_ops.reshape(matmul_110, full_int_array_5)
+        del matmul_110
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_111 = paddle._C_ops.matmul(dropout_2, parameter_97, False, False)
+        del parameter_97
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_129 = paddle._C_ops.reshape(matmul_111, full_int_array_6)
+        del matmul_111
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_163 = paddle._C_ops.add(reshape_126, parameter_94)
+        del parameter_94
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_110 = [add_163, reshape_127]
+        del add_163, reshape_127
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_327, einsum_328, einsum_329 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_110, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_110
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_436,
+            split_437,
+        ) = einsum_328
+        del einsum_328
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_438,
+            split_439,
+        ) = einsum_329
+        del einsum_329
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_164 = paddle._C_ops.add(reshape_126, parameter_96)
+        del parameter_96
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_111 = [add_164, reshape_129]
+        del add_164, reshape_129
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_330, einsum_331, einsum_332 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_111, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_111
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_440,
+            split_441,
+        ) = einsum_331
+        del einsum_331
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_442,
+            split_443,
+        ) = einsum_332
+        del einsum_332
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_130 = paddle._C_ops.reshape(einsum_330, full_int_array_7)
+        del einsum_330
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_18 = paddle._C_ops.slice(
+            reshape_130, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_130
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_131 = paddle._C_ops.reshape(slice_18, full_int_array_9)
+        del slice_18
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_18 = paddle._C_ops.index_select(reshape_131, arange_2, 3)
+        del reshape_131
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_165 = paddle._C_ops.add(reshape_126, parameter_95)
+        del parameter_95, reshape_126
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_112 = [add_165, parameter_93]
+        del add_165, parameter_93
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_333, einsum_334, einsum_335 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_112, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_112
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_444,
+            split_445,
+        ) = einsum_334
+        del einsum_334
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_446,
+            split_447,
+        ) = einsum_335
+        del einsum_335
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_113 = [cast_5, einsum_333]
+        del einsum_333
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_336, einsum_337, einsum_338 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_113, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_113
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_448,
+            split_449,
+        ) = einsum_337
+        del einsum_337
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_450,
+            split_451,
+        ) = einsum_338
+        del einsum_338
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_166 = paddle._C_ops.add(einsum_327, index_select_18)
+        del einsum_327, index_select_18
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_167 = paddle._C_ops.add(add_166, einsum_336)
+        del add_166, einsum_336
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_22 = paddle._C_ops.scale(add_167, full_16, float("0"), True)
+        del add_167
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_18 = paddle._C_ops.subtract(scale_22, scale_4)
+        del scale_22
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_18 = paddle._C_ops.softmax(subtract_18, 3)
+        del subtract_18
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_148, dropout_149 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_18
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_114 = [dropout_148, reshape_128]
+        del dropout_148, reshape_128
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_339, einsum_340, einsum_341 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_114, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_114
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_452,
+            split_453,
+        ) = einsum_340
+        del einsum_340
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_454,
+            split_455,
+        ) = einsum_341
+        del einsum_341
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_132 = paddle._C_ops.reshape(einsum_339, full_int_array_10)
+        del einsum_339
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_115 = [reshape_132, parameter_98]
+        del parameter_98, reshape_132
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_342, einsum_343, einsum_344 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_115, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_115
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_456,
+            split_457,
+        ) = einsum_343
+        del einsum_343
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_458,
+            split_459,
+        ) = einsum_344
+        del einsum_344
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_150, dropout_151 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_342, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_342
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_168 = paddle._C_ops.add(dropout_150, layer_norm_105)
+        del dropout_150, layer_norm_105
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_108, layer_norm_109, layer_norm_110 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_168, parameter_92, parameter_91, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_168, parameter_91, parameter_92
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_112 = paddle._C_ops.matmul(layer_norm_108, parameter_88, False, False)
+        del parameter_88
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_169 = paddle._C_ops.add(matmul_112, parameter_87)
+        del matmul_112, parameter_87
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_18 = paddle._C_ops.gelu(add_169, False)
+        del add_169
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_152, dropout_153 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_18, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_18
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_113 = paddle._C_ops.matmul(dropout_152, parameter_86, False, False)
+        del dropout_152, parameter_86
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_170 = paddle._C_ops.add(matmul_113, parameter_85)
+        del matmul_113, parameter_85
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_154, dropout_155 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_170, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_170
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_171 = paddle._C_ops.add(dropout_154, layer_norm_108)
+        del dropout_154, layer_norm_108
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_111, layer_norm_112, layer_norm_113 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_171, parameter_90, parameter_89, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_171, parameter_89, parameter_90
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_114 = paddle._C_ops.matmul(layer_norm_111, parameter_84, False, False)
+        del parameter_84
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_133 = paddle._C_ops.reshape(matmul_114, full_int_array_5)
+        del matmul_114
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_115 = paddle._C_ops.matmul(layer_norm_111, parameter_83, False, False)
+        del parameter_83
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_134 = paddle._C_ops.reshape(matmul_115, full_int_array_5)
+        del matmul_115
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_116 = paddle._C_ops.matmul(layer_norm_111, parameter_82, False, False)
+        del parameter_82
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_135 = paddle._C_ops.reshape(matmul_116, full_int_array_5)
+        del matmul_116
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_117 = paddle._C_ops.matmul(dropout_2, parameter_80, False, False)
+        del parameter_80
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_136 = paddle._C_ops.reshape(matmul_117, full_int_array_6)
+        del matmul_117
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_172 = paddle._C_ops.add(reshape_133, parameter_77)
+        del parameter_77
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_116 = [add_172, reshape_134]
+        del add_172, reshape_134
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_345, einsum_346, einsum_347 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_116, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_116
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_460,
+            split_461,
+        ) = einsum_346
+        del einsum_346
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_462,
+            split_463,
+        ) = einsum_347
+        del einsum_347
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_173 = paddle._C_ops.add(reshape_133, parameter_79)
+        del parameter_79
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_117 = [add_173, reshape_136]
+        del add_173, reshape_136
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_348, einsum_349, einsum_350 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_117, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_117
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_464,
+            split_465,
+        ) = einsum_349
+        del einsum_349
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_466,
+            split_467,
+        ) = einsum_350
+        del einsum_350
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_137 = paddle._C_ops.reshape(einsum_348, full_int_array_7)
+        del einsum_348
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_19 = paddle._C_ops.slice(
+            reshape_137, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_137
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_138 = paddle._C_ops.reshape(slice_19, full_int_array_9)
+        del slice_19
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_19 = paddle._C_ops.index_select(reshape_138, arange_2, 3)
+        del reshape_138
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_174 = paddle._C_ops.add(reshape_133, parameter_78)
+        del parameter_78, reshape_133
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_118 = [add_174, parameter_76]
+        del add_174, parameter_76
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_351, einsum_352, einsum_353 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_118, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_118
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_468,
+            split_469,
+        ) = einsum_352
+        del einsum_352
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_470,
+            split_471,
+        ) = einsum_353
+        del einsum_353
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_119 = [cast_5, einsum_351]
+        del einsum_351
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_354, einsum_355, einsum_356 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_119, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_119
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_472,
+            split_473,
+        ) = einsum_355
+        del einsum_355
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_474,
+            split_475,
+        ) = einsum_356
+        del einsum_356
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_175 = paddle._C_ops.add(einsum_345, index_select_19)
+        del einsum_345, index_select_19
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_176 = paddle._C_ops.add(add_175, einsum_354)
+        del add_175, einsum_354
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_23 = paddle._C_ops.scale(add_176, full_16, float("0"), True)
+        del add_176
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_19 = paddle._C_ops.subtract(scale_23, scale_4)
+        del scale_23
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_19 = paddle._C_ops.softmax(subtract_19, 3)
+        del subtract_19
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_156, dropout_157 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_19, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_19
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_120 = [dropout_156, reshape_135]
+        del dropout_156, reshape_135
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_357, einsum_358, einsum_359 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_120, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_120
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_476,
+            split_477,
+        ) = einsum_358
+        del einsum_358
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_478,
+            split_479,
+        ) = einsum_359
+        del einsum_359
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_139 = paddle._C_ops.reshape(einsum_357, full_int_array_10)
+        del einsum_357
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_121 = [reshape_139, parameter_81]
+        del parameter_81, reshape_139
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_360, einsum_361, einsum_362 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_121, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_121
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_480,
+            split_481,
+        ) = einsum_361
+        del einsum_361
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_482,
+            split_483,
+        ) = einsum_362
+        del einsum_362
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_158, dropout_159 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_360, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_360
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_177 = paddle._C_ops.add(dropout_158, layer_norm_111)
+        del dropout_158, layer_norm_111
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_114, layer_norm_115, layer_norm_116 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_177, parameter_75, parameter_74, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_177, parameter_74, parameter_75
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_118 = paddle._C_ops.matmul(layer_norm_114, parameter_71, False, False)
+        del parameter_71
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_178 = paddle._C_ops.add(matmul_118, parameter_70)
+        del matmul_118, parameter_70
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_19 = paddle._C_ops.gelu(add_178, False)
+        del add_178
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_160, dropout_161 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_19, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_19
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_119 = paddle._C_ops.matmul(dropout_160, parameter_69, False, False)
+        del dropout_160, parameter_69
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_179 = paddle._C_ops.add(matmul_119, parameter_68)
+        del matmul_119, parameter_68
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_162, dropout_163 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_179, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_179
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_180 = paddle._C_ops.add(dropout_162, layer_norm_114)
+        del dropout_162, layer_norm_114
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_117, layer_norm_118, layer_norm_119 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_180, parameter_73, parameter_72, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_180, parameter_72, parameter_73
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_120 = paddle._C_ops.matmul(layer_norm_117, parameter_67, False, False)
+        del parameter_67
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_140 = paddle._C_ops.reshape(matmul_120, full_int_array_5)
+        del matmul_120
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_121 = paddle._C_ops.matmul(layer_norm_117, parameter_66, False, False)
+        del parameter_66
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_141 = paddle._C_ops.reshape(matmul_121, full_int_array_5)
+        del matmul_121
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_122 = paddle._C_ops.matmul(layer_norm_117, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_142 = paddle._C_ops.reshape(matmul_122, full_int_array_5)
+        del matmul_122
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_123 = paddle._C_ops.matmul(dropout_2, parameter_63, False, False)
+        del parameter_63
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_143 = paddle._C_ops.reshape(matmul_123, full_int_array_6)
+        del matmul_123
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_181 = paddle._C_ops.add(reshape_140, parameter_60)
+        del parameter_60
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_122 = [add_181, reshape_141]
+        del add_181, reshape_141
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_363, einsum_364, einsum_365 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_122, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_122
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_484,
+            split_485,
+        ) = einsum_364
+        del einsum_364
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_486,
+            split_487,
+        ) = einsum_365
+        del einsum_365
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_182 = paddle._C_ops.add(reshape_140, parameter_62)
+        del parameter_62
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_123 = [add_182, reshape_143]
+        del add_182, reshape_143
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_366, einsum_367, einsum_368 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_123, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_123
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_488,
+            split_489,
+        ) = einsum_367
+        del einsum_367
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_490,
+            split_491,
+        ) = einsum_368
+        del einsum_368
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_144 = paddle._C_ops.reshape(einsum_366, full_int_array_7)
+        del einsum_366
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_20 = paddle._C_ops.slice(
+            reshape_144, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_144
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_145 = paddle._C_ops.reshape(slice_20, full_int_array_9)
+        del slice_20
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_20 = paddle._C_ops.index_select(reshape_145, arange_2, 3)
+        del reshape_145
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_183 = paddle._C_ops.add(reshape_140, parameter_61)
+        del parameter_61, reshape_140
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_124 = [add_183, parameter_59]
+        del add_183, parameter_59
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_369, einsum_370, einsum_371 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_124, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_124
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_492,
+            split_493,
+        ) = einsum_370
+        del einsum_370
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_494,
+            split_495,
+        ) = einsum_371
+        del einsum_371
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_125 = [cast_5, einsum_369]
+        del einsum_369
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_372, einsum_373, einsum_374 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_125, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_125
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_496,
+            split_497,
+        ) = einsum_373
+        del einsum_373
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_498,
+            split_499,
+        ) = einsum_374
+        del einsum_374
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_184 = paddle._C_ops.add(einsum_363, index_select_20)
+        del einsum_363, index_select_20
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_185 = paddle._C_ops.add(add_184, einsum_372)
+        del add_184, einsum_372
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_24 = paddle._C_ops.scale(add_185, full_16, float("0"), True)
+        del add_185
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_20 = paddle._C_ops.subtract(scale_24, scale_4)
+        del scale_24
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_20 = paddle._C_ops.softmax(subtract_20, 3)
+        del subtract_20
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_164, dropout_165 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_20, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_20
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_126 = [dropout_164, reshape_142]
+        del dropout_164, reshape_142
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_375, einsum_376, einsum_377 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_126, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_126
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_500,
+            split_501,
+        ) = einsum_376
+        del einsum_376
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_502,
+            split_503,
+        ) = einsum_377
+        del einsum_377
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_146 = paddle._C_ops.reshape(einsum_375, full_int_array_10)
+        del einsum_375
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_127 = [reshape_146, parameter_64]
+        del parameter_64, reshape_146
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_378, einsum_379, einsum_380 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_127, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_127
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_504,
+            split_505,
+        ) = einsum_379
+        del einsum_379
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_506,
+            split_507,
+        ) = einsum_380
+        del einsum_380
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_166, dropout_167 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_378, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_378
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_186 = paddle._C_ops.add(dropout_166, layer_norm_117)
+        del dropout_166, layer_norm_117
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_120, layer_norm_121, layer_norm_122 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_186, parameter_58, parameter_57, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_186, parameter_57, parameter_58
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_124 = paddle._C_ops.matmul(layer_norm_120, parameter_54, False, False)
+        del parameter_54
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_187 = paddle._C_ops.add(matmul_124, parameter_53)
+        del matmul_124, parameter_53
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_20 = paddle._C_ops.gelu(add_187, False)
+        del add_187
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_168, dropout_169 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_20, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_20
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_125 = paddle._C_ops.matmul(dropout_168, parameter_52, False, False)
+        del dropout_168, parameter_52
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_188 = paddle._C_ops.add(matmul_125, parameter_51)
+        del matmul_125, parameter_51
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_170, dropout_171 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_188, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_188
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_189 = paddle._C_ops.add(dropout_170, layer_norm_120)
+        del dropout_170, layer_norm_120
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_123, layer_norm_124, layer_norm_125 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_189, parameter_56, parameter_55, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_189, parameter_55, parameter_56
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_126 = paddle._C_ops.matmul(layer_norm_123, parameter_50, False, False)
+        del parameter_50
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_147 = paddle._C_ops.reshape(matmul_126, full_int_array_5)
+        del matmul_126
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_127 = paddle._C_ops.matmul(layer_norm_123, parameter_49, False, False)
+        del parameter_49
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_148 = paddle._C_ops.reshape(matmul_127, full_int_array_5)
+        del matmul_127
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_128 = paddle._C_ops.matmul(layer_norm_123, parameter_48, False, False)
+        del parameter_48
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_149 = paddle._C_ops.reshape(matmul_128, full_int_array_5)
+        del matmul_128
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_129 = paddle._C_ops.matmul(dropout_2, parameter_46, False, False)
+        del parameter_46
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_150 = paddle._C_ops.reshape(matmul_129, full_int_array_6)
+        del matmul_129
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_190 = paddle._C_ops.add(reshape_147, parameter_43)
+        del parameter_43
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_128 = [add_190, reshape_148]
+        del add_190, reshape_148
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_381, einsum_382, einsum_383 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_128, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_128
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_508,
+            split_509,
+        ) = einsum_382
+        del einsum_382
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_510,
+            split_511,
+        ) = einsum_383
+        del einsum_383
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_191 = paddle._C_ops.add(reshape_147, parameter_45)
+        del parameter_45
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_129 = [add_191, reshape_150]
+        del add_191, reshape_150
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_384, einsum_385, einsum_386 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_129, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_129
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_512,
+            split_513,
+        ) = einsum_385
+        del einsum_385
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_514,
+            split_515,
+        ) = einsum_386
+        del einsum_386
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_151 = paddle._C_ops.reshape(einsum_384, full_int_array_7)
+        del einsum_384
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_21 = paddle._C_ops.slice(
+            reshape_151, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_151
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_152 = paddle._C_ops.reshape(slice_21, full_int_array_9)
+        del slice_21
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_21 = paddle._C_ops.index_select(reshape_152, arange_2, 3)
+        del reshape_152
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_192 = paddle._C_ops.add(reshape_147, parameter_44)
+        del parameter_44, reshape_147
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_130 = [add_192, parameter_42]
+        del add_192, parameter_42
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_387, einsum_388, einsum_389 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_130, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_130
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_516,
+            split_517,
+        ) = einsum_388
+        del einsum_388
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_518,
+            split_519,
+        ) = einsum_389
+        del einsum_389
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_131 = [cast_5, einsum_387]
+        del einsum_387
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_390, einsum_391, einsum_392 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_131, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_131
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_520,
+            split_521,
+        ) = einsum_391
+        del einsum_391
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_522,
+            split_523,
+        ) = einsum_392
+        del einsum_392
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_193 = paddle._C_ops.add(einsum_381, index_select_21)
+        del einsum_381, index_select_21
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_194 = paddle._C_ops.add(add_193, einsum_390)
+        del add_193, einsum_390
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_25 = paddle._C_ops.scale(add_194, full_16, float("0"), True)
+        del add_194
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_21 = paddle._C_ops.subtract(scale_25, scale_4)
+        del scale_25
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_21 = paddle._C_ops.softmax(subtract_21, 3)
+        del subtract_21
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_172, dropout_173 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_21, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_21
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_132 = [dropout_172, reshape_149]
+        del dropout_172, reshape_149
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_393, einsum_394, einsum_395 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_132, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_132
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_524,
+            split_525,
+        ) = einsum_394
+        del einsum_394
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_526,
+            split_527,
+        ) = einsum_395
+        del einsum_395
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_153 = paddle._C_ops.reshape(einsum_393, full_int_array_10)
+        del einsum_393
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_133 = [reshape_153, parameter_47]
+        del parameter_47, reshape_153
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_396, einsum_397, einsum_398 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_133, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_133
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_528,
+            split_529,
+        ) = einsum_397
+        del einsum_397
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_530,
+            split_531,
+        ) = einsum_398
+        del einsum_398
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_174, dropout_175 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_396, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_396
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_195 = paddle._C_ops.add(dropout_174, layer_norm_123)
+        del dropout_174, layer_norm_123
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_126, layer_norm_127, layer_norm_128 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_195, parameter_41, parameter_40, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_195, parameter_40, parameter_41
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_130 = paddle._C_ops.matmul(layer_norm_126, parameter_37, False, False)
+        del parameter_37
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_196 = paddle._C_ops.add(matmul_130, parameter_36)
+        del matmul_130, parameter_36
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_21 = paddle._C_ops.gelu(add_196, False)
+        del add_196
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_176, dropout_177 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_21, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_21
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_131 = paddle._C_ops.matmul(dropout_176, parameter_35, False, False)
+        del dropout_176, parameter_35
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_197 = paddle._C_ops.add(matmul_131, parameter_34)
+        del matmul_131, parameter_34
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_178, dropout_179 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_197, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_197
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_198 = paddle._C_ops.add(dropout_178, layer_norm_126)
+        del dropout_178, layer_norm_126
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_129, layer_norm_130, layer_norm_131 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_198, parameter_39, parameter_38, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_198, parameter_38, parameter_39
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_132 = paddle._C_ops.matmul(layer_norm_129, parameter_33, False, False)
+        del parameter_33
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_154 = paddle._C_ops.reshape(matmul_132, full_int_array_5)
+        del matmul_132
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_133 = paddle._C_ops.matmul(layer_norm_129, parameter_32, False, False)
+        del parameter_32
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_155 = paddle._C_ops.reshape(matmul_133, full_int_array_5)
+        del matmul_133
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_134 = paddle._C_ops.matmul(layer_norm_129, parameter_31, False, False)
+        del parameter_31
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_156 = paddle._C_ops.reshape(matmul_134, full_int_array_5)
+        del matmul_134
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_135 = paddle._C_ops.matmul(dropout_2, parameter_29, False, False)
+        del parameter_29
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_157 = paddle._C_ops.reshape(matmul_135, full_int_array_6)
+        del matmul_135
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_199 = paddle._C_ops.add(reshape_154, parameter_26)
+        del parameter_26
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_134 = [add_199, reshape_155]
+        del add_199, reshape_155
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_399, einsum_400, einsum_401 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_134, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_134
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_532,
+            split_533,
+        ) = einsum_400
+        del einsum_400
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_534,
+            split_535,
+        ) = einsum_401
+        del einsum_401
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_200 = paddle._C_ops.add(reshape_154, parameter_28)
+        del parameter_28
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_135 = [add_200, reshape_157]
+        del add_200, reshape_157
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_402, einsum_403, einsum_404 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_135, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_135
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_536,
+            split_537,
+        ) = einsum_403
+        del einsum_403
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_538,
+            split_539,
+        ) = einsum_404
+        del einsum_404
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_158 = paddle._C_ops.reshape(einsum_402, full_int_array_7)
+        del einsum_402
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_22 = paddle._C_ops.slice(
+            reshape_158, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del reshape_158
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_159 = paddle._C_ops.reshape(slice_22, full_int_array_9)
+        del slice_22
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_22 = paddle._C_ops.index_select(reshape_159, arange_2, 3)
+        del reshape_159
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_201 = paddle._C_ops.add(reshape_154, parameter_27)
+        del parameter_27, reshape_154
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_136 = [add_201, parameter_25]
+        del add_201, parameter_25
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_405, einsum_406, einsum_407 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_136, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_136
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_540,
+            split_541,
+        ) = einsum_406
+        del einsum_406
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_542,
+            split_543,
+        ) = einsum_407
+        del einsum_407
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_137 = [cast_5, einsum_405]
+        del einsum_405
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_408, einsum_409, einsum_410 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_137, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_137
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_544,
+            split_545,
+        ) = einsum_409
+        del einsum_409
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_546,
+            split_547,
+        ) = einsum_410
+        del einsum_410
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_202 = paddle._C_ops.add(einsum_399, index_select_22)
+        del einsum_399, index_select_22
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_203 = paddle._C_ops.add(add_202, einsum_408)
+        del add_202, einsum_408
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_26 = paddle._C_ops.scale(add_203, full_16, float("0"), True)
+        del add_203
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_22 = paddle._C_ops.subtract(scale_26, scale_4)
+        del scale_26
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_22 = paddle._C_ops.softmax(subtract_22, 3)
+        del subtract_22
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_180, dropout_181 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_22, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_22
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_138 = [dropout_180, reshape_156]
+        del dropout_180, reshape_156
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_411, einsum_412, einsum_413 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_138, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_138
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_548,
+            split_549,
+        ) = einsum_412
+        del einsum_412
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_550,
+            split_551,
+        ) = einsum_413
+        del einsum_413
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_160 = paddle._C_ops.reshape(einsum_411, full_int_array_10)
+        del einsum_411
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_139 = [reshape_160, parameter_30]
+        del parameter_30, reshape_160
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_414, einsum_415, einsum_416 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_139, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_139
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_552,
+            split_553,
+        ) = einsum_415
+        del einsum_415
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_554,
+            split_555,
+        ) = einsum_416
+        del einsum_416
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_182, dropout_183 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_414, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_414
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_204 = paddle._C_ops.add(dropout_182, layer_norm_129)
+        del dropout_182, layer_norm_129
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_132, layer_norm_133, layer_norm_134 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_204, parameter_24, parameter_23, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_204, parameter_23, parameter_24
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_136 = paddle._C_ops.matmul(layer_norm_132, parameter_20, False, False)
+        del parameter_20
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_205 = paddle._C_ops.add(matmul_136, parameter_19)
+        del matmul_136, parameter_19
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_22 = paddle._C_ops.gelu(add_205, False)
+        del add_205
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_184, dropout_185 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_22, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_22
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_137 = paddle._C_ops.matmul(dropout_184, parameter_18, False, False)
+        del dropout_184, parameter_18
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_206 = paddle._C_ops.add(matmul_137, parameter_17)
+        del matmul_137, parameter_17
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_186, dropout_187 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_206, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_206
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_207 = paddle._C_ops.add(dropout_186, layer_norm_132)
+        del dropout_186, layer_norm_132
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_135, layer_norm_136, layer_norm_137 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_207, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_207, parameter_21, parameter_22
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_138 = paddle._C_ops.matmul(layer_norm_135, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_161 = paddle._C_ops.reshape(matmul_138, full_int_array_5)
+        del matmul_138
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_139 = paddle._C_ops.matmul(layer_norm_135, parameter_15, False, False)
+        del parameter_15
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_162 = paddle._C_ops.reshape(matmul_139, full_int_array_5)
+        del matmul_139
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x1024xf32, 1024x1024xf32)
+        matmul_140 = paddle._C_ops.matmul(layer_norm_135, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.reshape: (22x1x16x64xf32) <- (22x1x1024xf32, 4xi64)
+        reshape_163 = paddle._C_ops.reshape(matmul_140, full_int_array_5)
+        del full_int_array_5, matmul_140
+
+        # pd_op.matmul: (44x1x1024xf32) <- (44x1x1024xf32, 1024x1024xf32)
+        matmul_141 = paddle._C_ops.matmul(dropout_2, parameter_12, False, False)
+        del dropout_2, parameter_12
+
+        # pd_op.reshape: (44x1x16x64xf32) <- (44x1x1024xf32, 4xi64)
+        reshape_164 = paddle._C_ops.reshape(matmul_141, full_int_array_6)
+        del full_int_array_6, matmul_141
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_208 = paddle._C_ops.add(reshape_161, parameter_9)
+        del parameter_9
+
+        # builtin.combine: ([22x1x16x64xf32, 22x1x16x64xf32]) <- (22x1x16x64xf32, 22x1x16x64xf32)
+        combine_140 = [add_208, reshape_162]
+        del add_208, reshape_162
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x1x16x64xf32, 22x1x16x64xf32]) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        einsum_417, einsum_418, einsum_419 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_140, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_140
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_556,
+            split_557,
+        ) = einsum_418
+        del einsum_418
+
+        # builtin.split: (22x1x16x64xf32, 22x1x16x64xf32) <- ([22x1x16x64xf32, 22x1x16x64xf32])
+        (
+            split_558,
+            split_559,
+        ) = einsum_419
+        del einsum_419
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_209 = paddle._C_ops.add(reshape_161, parameter_11)
+        del parameter_11
+
+        # builtin.combine: ([22x1x16x64xf32, 44x1x16x64xf32]) <- (22x1x16x64xf32, 44x1x16x64xf32)
+        combine_141 = [add_209, reshape_164]
+        del add_209, reshape_164
+
+        # pd_op.einsum: (1x16x22x44xf32, [0xf32, 0xf32], [22x1x16x64xf32, 44x1x16x64xf32]) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        einsum_420, einsum_421, einsum_422 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_141, "ibnd,jbnd->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_141
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_560,
+            split_561,
+        ) = einsum_421
+        del einsum_421
+
+        # builtin.split: (22x1x16x64xf32, 44x1x16x64xf32) <- ([22x1x16x64xf32, 44x1x16x64xf32])
+        (
+            split_562,
+            split_563,
+        ) = einsum_422
+        del einsum_422
+
+        # pd_op.reshape: (1x16x44x22xf32) <- (1x16x22x44xf32, 4xi64)
+        reshape_165 = paddle._C_ops.reshape(einsum_420, full_int_array_7)
+        del einsum_420, full_int_array_7
+
+        # pd_op.slice: (1x16x43x22xf32) <- (1x16x44x22xf32, 1xi64, 1xi64)
+        slice_23 = paddle._C_ops.slice(
+            reshape_165, [2], full_int_array_3, full_int_array_8, [1], []
+        )
+        del full_int_array_3, full_int_array_8, reshape_165
+
+        # pd_op.reshape: (1x16x22x43xf32) <- (1x16x43x22xf32, 4xi64)
+        reshape_166 = paddle._C_ops.reshape(slice_23, full_int_array_9)
+        del full_int_array_9, slice_23
+
+        # pd_op.index_select: (1x16x22x22xf32) <- (1x16x22x43xf32, 22xi64)
+        index_select_23 = paddle._C_ops.index_select(reshape_166, arange_2, 3)
+        del arange_2, reshape_166
+
+        # pd_op.add: (22x1x16x64xf32) <- (22x1x16x64xf32, 16x64xf32)
+        add_210 = paddle._C_ops.add(reshape_161, parameter_10)
+        del parameter_10, reshape_161
+
+        # builtin.combine: ([22x1x16x64xf32, 2x16x64xf32]) <- (22x1x16x64xf32, 2x16x64xf32)
+        combine_142 = [add_210, parameter_8]
+        del add_210, parameter_8
+
+        # pd_op.einsum: (22x1x16x2xf32, [0xf32, 0xf32], [22x1x16x64xf32, 2x16x64xf32]) <- ([22x1x16x64xf32, 2x16x64xf32])
+        einsum_423, einsum_424, einsum_425 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_142, "ibnd,snd->ibns"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_142
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_564,
+            split_565,
+        ) = einsum_424
+        del einsum_424
+
+        # builtin.split: (22x1x16x64xf32, 2x16x64xf32) <- ([22x1x16x64xf32, 2x16x64xf32])
+        (
+            split_566,
+            split_567,
+        ) = einsum_425
+        del einsum_425
+
+        # builtin.combine: ([22x22x1x2xf32, 22x1x16x2xf32]) <- (22x22x1x2xf32, 22x1x16x2xf32)
+        combine_143 = [cast_5, einsum_423]
+        del cast_5, einsum_423
+
+        # pd_op.einsum: (1x16x22x22xf32, [0xf32, 0xf32], [22x22x1x2xf32, 22x1x16x2xf32]) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        einsum_426, einsum_427, einsum_428 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_143, "ijbs,ibns->bnij"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_143
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_568,
+            split_569,
+        ) = einsum_427
+        del einsum_427
+
+        # builtin.split: (22x22x1x2xf32, 22x1x16x2xf32) <- ([22x22x1x2xf32, 22x1x16x2xf32])
+        (
+            split_570,
+            split_571,
+        ) = einsum_428
+        del einsum_428
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_211 = paddle._C_ops.add(einsum_417, index_select_23)
+        del einsum_417, index_select_23
+
+        # pd_op.add: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x16x22x22xf32)
+        add_212 = paddle._C_ops.add(add_211, einsum_426)
+        del add_211, einsum_426
+
+        # pd_op.scale: (1x16x22x22xf32) <- (1x16x22x22xf32, 1xf32)
+        scale_27 = paddle._C_ops.scale(add_212, full_16, float("0"), True)
+        del add_212, full_16
+
+        # pd_op.subtract: (1x16x22x22xf32) <- (1x16x22x22xf32, 1x1x22x22xf32)
+        subtract_23 = paddle._C_ops.subtract(scale_27, scale_4)
+        del scale_27, scale_4
+
+        # pd_op.softmax: (1x16x22x22xf32) <- (1x16x22x22xf32)
+        softmax_23 = paddle._C_ops.softmax(subtract_23, 3)
+        del subtract_23
+
+        # pd_op.dropout: (1x16x22x22xf32, 1x16x22x22xui8) <- (1x16x22x22xf32, None, 1xf32)
+        dropout_188, dropout_189 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_23, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_23
+
+        # builtin.combine: ([1x16x22x22xf32, 22x1x16x64xf32]) <- (1x16x22x22xf32, 22x1x16x64xf32)
+        combine_144 = [dropout_188, reshape_163]
+        del dropout_188, reshape_163
+
+        # pd_op.einsum: (22x1x16x64xf32, [0xf32, 0xf32], [1x16x22x22xf32, 22x1x16x64xf32]) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        einsum_429, einsum_430, einsum_431 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_144, "bnij,jbnd->ibnd"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_144
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_572,
+            split_573,
+        ) = einsum_430
+        del einsum_430
+
+        # builtin.split: (1x16x22x22xf32, 22x1x16x64xf32) <- ([1x16x22x22xf32, 22x1x16x64xf32])
+        (
+            split_574,
+            split_575,
+        ) = einsum_431
+        del einsum_431
+
+        # pd_op.reshape: (22x1x1024xf32) <- (22x1x16x64xf32, 3xi64)
+        reshape_167 = paddle._C_ops.reshape(einsum_429, full_int_array_10)
+        del einsum_429, full_int_array_10
+
+        # builtin.combine: ([22x1x1024xf32, 1024x1024xf32]) <- (22x1x1024xf32, 1024x1024xf32)
+        combine_145 = [reshape_167, parameter_13]
+        del parameter_13, reshape_167
+
+        # pd_op.einsum: (22x1x1024xf32, [0xf32, 0xf32], [22x1x1024xf32, 1024x1024xf32]) <- ([22x1x1024xf32, 1024x1024xf32])
+        einsum_432, einsum_433, einsum_434 = (lambda x, f: f(x))(
+            paddle._C_ops.einsum(combine_145, "ibm,hm->ibh"),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del combine_145
+
+        # builtin.split: (0xf32, 0xf32) <- ([0xf32, 0xf32])
+        (
+            split_576,
+            split_577,
+        ) = einsum_433
+        del einsum_433
+
+        # builtin.split: (22x1x1024xf32, 1024x1024xf32) <- ([22x1x1024xf32, 1024x1024xf32])
+        (
+            split_578,
+            split_579,
+        ) = einsum_434
+        del einsum_434
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_190, dropout_191 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                einsum_432, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del einsum_432
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_213 = paddle._C_ops.add(dropout_190, layer_norm_135)
+        del dropout_190, layer_norm_135
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_138, layer_norm_139, layer_norm_140 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_213, parameter_7, parameter_6, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_213, parameter_6, parameter_7
+
+        # pd_op.matmul: (22x1x4096xf32) <- (22x1x1024xf32, 1024x4096xf32)
+        matmul_142 = paddle._C_ops.matmul(layer_norm_138, parameter_3, False, False)
+        del parameter_3
+
+        # pd_op.add: (22x1x4096xf32) <- (22x1x4096xf32, 4096xf32)
+        add_214 = paddle._C_ops.add(matmul_142, parameter_2)
+        del matmul_142, parameter_2
+
+        # pd_op.gelu: (22x1x4096xf32) <- (22x1x4096xf32)
+        gelu_23 = paddle._C_ops.gelu(add_214, False)
+        del add_214
+
+        # pd_op.dropout: (22x1x4096xf32, 22x1x4096xui8) <- (22x1x4096xf32, None, 1xf32)
+        dropout_192, dropout_193 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                gelu_23, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del gelu_23
+
+        # pd_op.matmul: (22x1x1024xf32) <- (22x1x4096xf32, 4096x1024xf32)
+        matmul_143 = paddle._C_ops.matmul(dropout_192, parameter_1, False, False)
+        del dropout_192, parameter_1
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 1024xf32)
+        add_215 = paddle._C_ops.add(matmul_143, parameter_0)
+        del matmul_143, parameter_0
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_194, dropout_195 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_215, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_215
+
+        # pd_op.add: (22x1x1024xf32) <- (22x1x1024xf32, 22x1x1024xf32)
+        add_216 = paddle._C_ops.add(dropout_194, layer_norm_138)
+        del dropout_194, layer_norm_138
+
+        # pd_op.layer_norm: (22x1x1024xf32, 22x1xf32, 22x1xf32) <- (22x1x1024xf32, 1024xf32, 1024xf32)
+        layer_norm_141, layer_norm_142, layer_norm_143 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_216, parameter_5, parameter_4, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_216, parameter_4, parameter_5
+
+        # pd_op.dropout: (22x1x1024xf32, 22x1x1024xui8) <- (22x1x1024xf32, None, 1xf32)
+        dropout_196, dropout_197 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_141, None, full_3, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del full_3, layer_norm_141
+
+        # pd_op.transpose: (1x22x1024xf32) <- (22x1x1024xf32)
+        transpose_0 = paddle._C_ops.transpose(dropout_196, [1, 0, 2])
+        del dropout_196
+
+        return transpose_0
diff --git a/paddle_samples/PaddleNLP/xlnet-large-cased/weight_meta.py b/paddle_samples/PaddleNLP/xlnet-large-cased/weight_meta.py
new file mode 100644
index 000000000..8b4095e62
--- /dev/null
+++ b/paddle_samples/PaddleNLP/xlnet-large-cased/weight_meta.py
@@ -0,0 +1,4076 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.106054")
+    max_val = float("0.0960214")
+    mean = float("-1.15851e-05")
+    std = float("0.0200029")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10347")
+    max_val = float("0.108275")
+    mean = float("4.60236e-07")
+    std = float("0.0200048")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0629102")
+    max_val = float("0.0841791")
+    mean = float("0.00041478")
+    std = float("0.0199864")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0638675")
+    max_val = float("0.0699875")
+    mean = float("-6.22555e-05")
+    std = float("0.0203062")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0627365")
+    max_val = float("0.0704744")
+    mean = float("0.00031257")
+    std = float("0.0198551")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0653019")
+    max_val = float("0.0567174")
+    mean = float("0.000134436")
+    std = float("0.0194034")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0946227")
+    max_val = float("0.109461")
+    mean = float("-9.10193e-06")
+    std = float("0.0200007")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.10073")
+    max_val = float("0.0982617")
+    mean = float("2.79908e-05")
+    std = float("0.0199934")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0949046")
+    max_val = float("0.0942779")
+    mean = float("-6.38509e-06")
+    std = float("0.0199833")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.109448")
+    max_val = float("0.0951698")
+    mean = float("-1.58499e-05")
+    std = float("0.0200073")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.106196")
+    max_val = float("0.103671")
+    mean = float("-2.10734e-05")
+    std = float("0.0200118")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0968363")
+    max_val = float("0.112204")
+    mean = float("4.902e-06")
+    std = float("0.0199906")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.117731")
+    max_val = float("0.102028")
+    mean = float("1.14172e-05")
+    std = float("0.0200039")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0727356")
+    max_val = float("0.0609156")
+    mean = float("-0.00057746")
+    std = float("0.0200492")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0661265")
+    max_val = float("0.0651363")
+    mean = float("0.000445962")
+    std = float("0.0203104")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0633563")
+    max_val = float("0.0703257")
+    mean = float("0.000476403")
+    std = float("0.0198287")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0603359")
+    max_val = float("0.0811232")
+    mean = float("-0.000254285")
+    std = float("0.0196772")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0982637")
+    max_val = float("0.0952011")
+    mean = float("3.30818e-05")
+    std = float("0.0200183")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0940764")
+    max_val = float("0.0988642")
+    mean = float("2.51877e-05")
+    std = float("0.019996")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100339")
+    max_val = float("0.0961819")
+    mean = float("2.35292e-05")
+    std = float("0.0200088")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0955299")
+    max_val = float("0.101229")
+    mean = float("2.87225e-05")
+    std = float("0.0199959")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0997276")
+    max_val = float("0.0980562")
+    mean = float("1.44465e-05")
+    std = float("0.0199991")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0997595")
+    max_val = float("0.102574")
+    mean = float("-1.85995e-05")
+    std = float("0.0199962")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.114702")
+    max_val = float("0.098215")
+    mean = float("7.8673e-06")
+    std = float("0.0199983")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0717736")
+    max_val = float("0.0634031")
+    mean = float("0.000564224")
+    std = float("0.020665")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0561762")
+    max_val = float("0.0550648")
+    mean = float("-0.00130685")
+    std = float("0.0204015")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0634076")
+    max_val = float("0.072255")
+    mean = float("-0.000885043")
+    std = float("0.0197112")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0707521")
+    max_val = float("0.0694411")
+    mean = float("8.53844e-05")
+    std = float("0.0199395")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0967653")
+    max_val = float("0.103006")
+    mean = float("5.69153e-06")
+    std = float("0.0200125")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0942901")
+    max_val = float("0.0982401")
+    mean = float("-2.33385e-05")
+    std = float("0.0200236")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0997957")
+    max_val = float("0.101875")
+    mean = float("-2.06269e-05")
+    std = float("0.0199977")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0967376")
+    max_val = float("0.0937413")
+    mean = float("2.44117e-07")
+    std = float("0.0199954")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0995889")
+    max_val = float("0.0924654")
+    mean = float("1.09581e-05")
+    std = float("0.0200201")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.106136")
+    max_val = float("0.110985")
+    mean = float("-3.1386e-07")
+    std = float("0.020002")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10291")
+    max_val = float("0.102207")
+    mean = float("2.15542e-06")
+    std = float("0.0199973")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0759966")
+    max_val = float("0.0802708")
+    mean = float("-5.57094e-06")
+    std = float("0.0202897")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0748273")
+    max_val = float("0.0596886")
+    mean = float("0.000434476")
+    std = float("0.0202084")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0710389")
+    max_val = float("0.0645265")
+    mean = float("-0.000133568")
+    std = float("0.0199987")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0611757")
+    max_val = float("0.0547125")
+    mean = float("-0.000343651")
+    std = float("0.0193835")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0947259")
+    max_val = float("0.096012")
+    mean = float("-7.68183e-06")
+    std = float("0.0199989")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105741")
+    max_val = float("0.0998432")
+    mean = float("-2.21765e-05")
+    std = float("0.0199938")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0974652")
+    max_val = float("0.110738")
+    mean = float("-2.11843e-05")
+    std = float("0.0200148")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0902099")
+    max_val = float("0.0994524")
+    mean = float("-1.02443e-05")
+    std = float("0.0200169")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0909302")
+    max_val = float("0.0948929")
+    mean = float("4.27818e-07")
+    std = float("0.0199923")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0983985")
+    max_val = float("0.104909")
+    mean = float("-1.20501e-05")
+    std = float("0.0199983")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.102897")
+    max_val = float("0.100584")
+    mean = float("6.06252e-06")
+    std = float("0.019999")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0824933")
+    max_val = float("0.059739")
+    mean = float("0.00019929")
+    std = float("0.0200107")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0567373")
+    max_val = float("0.0523855")
+    mean = float("0.000189737")
+    std = float("0.0195964")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0576826")
+    max_val = float("0.0661385")
+    mean = float("2.54318e-05")
+    std = float("0.0194682")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0627012")
+    max_val = float("0.0644959")
+    mean = float("0.000415049")
+    std = float("0.0203708")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0987199")
+    max_val = float("0.0952045")
+    mean = float("-6.68385e-06")
+    std = float("0.019987")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101537")
+    max_val = float("0.109131")
+    mean = float("1.96553e-05")
+    std = float("0.0199889")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0934779")
+    max_val = float("0.0967221")
+    mean = float("1.3731e-06")
+    std = float("0.0200065")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0904465")
+    max_val = float("0.113637")
+    mean = float("1.9164e-05")
+    std = float("0.02")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0973789")
+    max_val = float("0.0949498")
+    mean = float("2.27422e-06")
+    std = float("0.0200016")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.100945")
+    max_val = float("0.100593")
+    mean = float("-8.58709e-06")
+    std = float("0.0199981")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.104066")
+    max_val = float("0.0972485")
+    mean = float("1.7527e-06")
+    std = float("0.0199909")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0681122")
+    max_val = float("0.062478")
+    mean = float("-0.000147028")
+    std = float("0.0197177")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0642083")
+    max_val = float("0.0556058")
+    mean = float("-0.000402527")
+    std = float("0.0194766")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0667576")
+    max_val = float("0.0689737")
+    mean = float("-0.000169207")
+    std = float("0.0206662")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0613529")
+    max_val = float("0.0688503")
+    mean = float("0.00110047")
+    std = float("0.0193664")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0947994")
+    max_val = float("0.105075")
+    mean = float("3.53332e-06")
+    std = float("0.0200105")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0946804")
+    max_val = float("0.0998394")
+    mean = float("1.64733e-05")
+    std = float("0.0199982")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101529")
+    max_val = float("0.0913024")
+    mean = float("-1.53374e-05")
+    std = float("0.020014")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101046")
+    max_val = float("0.0985934")
+    mean = float("-1.94784e-05")
+    std = float("0.0199928")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0961183")
+    max_val = float("0.0941208")
+    mean = float("2.76461e-05")
+    std = float("0.0199907")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0962846")
+    max_val = float("0.107709")
+    mean = float("-1.81282e-06")
+    std = float("0.0200036")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.103463")
+    max_val = float("0.104378")
+    mean = float("-3.97845e-06")
+    std = float("0.0200086")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0714291")
+    max_val = float("0.0682799")
+    mean = float("0.000820523")
+    std = float("0.0193262")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0645135")
+    max_val = float("0.0605717")
+    mean = float("0.000640813")
+    std = float("0.0202424")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.070288")
+    max_val = float("0.0575892")
+    mean = float("8.90004e-05")
+    std = float("0.020105")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0616152")
+    max_val = float("0.0581605")
+    mean = float("0.000435356")
+    std = float("0.0196566")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0893861")
+    max_val = float("0.09287")
+    mean = float("1.0929e-05")
+    std = float("0.0199854")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0974691")
+    max_val = float("0.0946894")
+    mean = float("-1.60171e-05")
+    std = float("0.0200062")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0987745")
+    max_val = float("0.0941207")
+    mean = float("1.19255e-05")
+    std = float("0.0199877")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0994819")
+    max_val = float("0.106583")
+    mean = float("3.07369e-05")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0924397")
+    max_val = float("0.0972573")
+    mean = float("-4.57201e-05")
+    std = float("0.0200062")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.101136")
+    max_val = float("0.1")
+    mean = float("-9.05641e-07")
+    std = float("0.0199949")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100887")
+    max_val = float("0.103692")
+    mean = float("-4.19737e-06")
+    std = float("0.0199957")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.066387")
+    max_val = float("0.0768451")
+    mean = float("0.000152864")
+    std = float("0.0198357")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0633506")
+    max_val = float("0.0607709")
+    mean = float("-0.000676489")
+    std = float("0.0198584")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0631143")
+    max_val = float("0.0591226")
+    mean = float("-0.00120845")
+    std = float("0.0198463")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0591492")
+    max_val = float("0.057425")
+    mean = float("0.000858639")
+    std = float("0.0197628")
+    data = None
+
+
+class Program_weight_tensor_parameter_131:
+    name = "parameter_131"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.095505")
+    max_val = float("0.0974372")
+    mean = float("-1.35853e-05")
+    std = float("0.0199879")
+    data = None
+
+
+class Program_weight_tensor_parameter_132:
+    name = "parameter_132"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0911795")
+    max_val = float("0.103497")
+    mean = float("-1.04017e-05")
+    std = float("0.0200005")
+    data = None
+
+
+class Program_weight_tensor_parameter_133:
+    name = "parameter_133"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0916454")
+    max_val = float("0.101501")
+    mean = float("6.3882e-06")
+    std = float("0.0199991")
+    data = None
+
+
+class Program_weight_tensor_parameter_134:
+    name = "parameter_134"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0879144")
+    max_val = float("0.0924113")
+    mean = float("1.04034e-05")
+    std = float("0.0199618")
+    data = None
+
+
+class Program_weight_tensor_parameter_135:
+    name = "parameter_135"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0979216")
+    max_val = float("0.098031")
+    mean = float("-6.51493e-06")
+    std = float("0.0200191")
+    data = None
+
+
+class Program_weight_tensor_parameter_136:
+    name = "parameter_136"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_137:
+    name = "parameter_137"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0977452")
+    max_val = float("0.100121")
+    mean = float("-1.08976e-06")
+    std = float("0.0200082")
+    data = None
+
+
+class Program_weight_tensor_parameter_138:
+    name = "parameter_138"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_139:
+    name = "parameter_139"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.104211")
+    max_val = float("0.102536")
+    mean = float("1.01295e-05")
+    std = float("0.0200073")
+    data = None
+
+
+class Program_weight_tensor_parameter_140:
+    name = "parameter_140"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_141:
+    name = "parameter_141"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_142:
+    name = "parameter_142"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_143:
+    name = "parameter_143"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_144:
+    name = "parameter_144"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0722138")
+    max_val = float("0.0803455")
+    mean = float("0.000762837")
+    std = float("0.0197865")
+    data = None
+
+
+class Program_weight_tensor_parameter_145:
+    name = "parameter_145"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.056961")
+    max_val = float("0.0715826")
+    mean = float("-0.000786657")
+    std = float("0.0194175")
+    data = None
+
+
+class Program_weight_tensor_parameter_146:
+    name = "parameter_146"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0676472")
+    max_val = float("0.0681574")
+    mean = float("-0.000424236")
+    std = float("0.0204415")
+    data = None
+
+
+class Program_weight_tensor_parameter_147:
+    name = "parameter_147"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0631112")
+    max_val = float("0.0610452")
+    mean = float("-0.000438692")
+    std = float("0.0205049")
+    data = None
+
+
+class Program_weight_tensor_parameter_148:
+    name = "parameter_148"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.097165")
+    max_val = float("0.0994696")
+    mean = float("1.57879e-06")
+    std = float("0.0199953")
+    data = None
+
+
+class Program_weight_tensor_parameter_149:
+    name = "parameter_149"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.093298")
+    max_val = float("0.096159")
+    mean = float("-6.33271e-06")
+    std = float("0.0200232")
+    data = None
+
+
+class Program_weight_tensor_parameter_150:
+    name = "parameter_150"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100076")
+    max_val = float("0.0885343")
+    mean = float("1.44574e-05")
+    std = float("0.0199766")
+    data = None
+
+
+class Program_weight_tensor_parameter_151:
+    name = "parameter_151"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0925985")
+    max_val = float("0.09827")
+    mean = float("7.36023e-06")
+    std = float("0.0199871")
+    data = None
+
+
+class Program_weight_tensor_parameter_152:
+    name = "parameter_152"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0949021")
+    max_val = float("0.106815")
+    mean = float("-1.50847e-07")
+    std = float("0.020004")
+    data = None
+
+
+class Program_weight_tensor_parameter_153:
+    name = "parameter_153"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_154:
+    name = "parameter_154"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.108906")
+    max_val = float("0.0980955")
+    mean = float("9.04024e-06")
+    std = float("0.0200008")
+    data = None
+
+
+class Program_weight_tensor_parameter_155:
+    name = "parameter_155"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_156:
+    name = "parameter_156"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101134")
+    max_val = float("0.110153")
+    mean = float("-8.8502e-06")
+    std = float("0.0199973")
+    data = None
+
+
+class Program_weight_tensor_parameter_157:
+    name = "parameter_157"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_158:
+    name = "parameter_158"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_159:
+    name = "parameter_159"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_160:
+    name = "parameter_160"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_161:
+    name = "parameter_161"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0840635")
+    max_val = float("0.0735396")
+    mean = float("-0.00035731")
+    std = float("0.020525")
+    data = None
+
+
+class Program_weight_tensor_parameter_162:
+    name = "parameter_162"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0694478")
+    max_val = float("0.0557208")
+    mean = float("0.000542685")
+    std = float("0.0191024")
+    data = None
+
+
+class Program_weight_tensor_parameter_163:
+    name = "parameter_163"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0774371")
+    max_val = float("0.0614744")
+    mean = float("-0.000672589")
+    std = float("0.0203074")
+    data = None
+
+
+class Program_weight_tensor_parameter_164:
+    name = "parameter_164"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.068261")
+    max_val = float("0.0740985")
+    mean = float("1.77945e-05")
+    std = float("0.0200567")
+    data = None
+
+
+class Program_weight_tensor_parameter_165:
+    name = "parameter_165"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0957754")
+    max_val = float("0.103741")
+    mean = float("-9.28956e-06")
+    std = float("0.0200265")
+    data = None
+
+
+class Program_weight_tensor_parameter_166:
+    name = "parameter_166"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0909206")
+    max_val = float("0.0919583")
+    mean = float("-9.37085e-07")
+    std = float("0.0200099")
+    data = None
+
+
+class Program_weight_tensor_parameter_167:
+    name = "parameter_167"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101895")
+    max_val = float("0.103712")
+    mean = float("-3.31451e-05")
+    std = float("0.0200209")
+    data = None
+
+
+class Program_weight_tensor_parameter_168:
+    name = "parameter_168"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0914984")
+    max_val = float("0.0924312")
+    mean = float("2.24992e-05")
+    std = float("0.0199888")
+    data = None
+
+
+class Program_weight_tensor_parameter_169:
+    name = "parameter_169"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.117054")
+    max_val = float("0.0974593")
+    mean = float("7.09017e-06")
+    std = float("0.0200082")
+    data = None
+
+
+class Program_weight_tensor_parameter_170:
+    name = "parameter_170"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_171:
+    name = "parameter_171"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.104693")
+    max_val = float("0.109077")
+    mean = float("1.28089e-06")
+    std = float("0.0200104")
+    data = None
+
+
+class Program_weight_tensor_parameter_172:
+    name = "parameter_172"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_173:
+    name = "parameter_173"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0992847")
+    max_val = float("0.0990478")
+    mean = float("-6.41955e-06")
+    std = float("0.0199923")
+    data = None
+
+
+class Program_weight_tensor_parameter_174:
+    name = "parameter_174"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_175:
+    name = "parameter_175"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_176:
+    name = "parameter_176"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_177:
+    name = "parameter_177"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_178:
+    name = "parameter_178"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0634189")
+    max_val = float("0.0753831")
+    mean = float("0.000107635")
+    std = float("0.0196634")
+    data = None
+
+
+class Program_weight_tensor_parameter_179:
+    name = "parameter_179"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0605175")
+    max_val = float("0.0513971")
+    mean = float("-0.000729531")
+    std = float("0.0196666")
+    data = None
+
+
+class Program_weight_tensor_parameter_180:
+    name = "parameter_180"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0586949")
+    max_val = float("0.0657678")
+    mean = float("-0.000752592")
+    std = float("0.020432")
+    data = None
+
+
+class Program_weight_tensor_parameter_181:
+    name = "parameter_181"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0692708")
+    max_val = float("0.0596413")
+    mean = float("-0.000465153")
+    std = float("0.0199913")
+    data = None
+
+
+class Program_weight_tensor_parameter_182:
+    name = "parameter_182"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0931099")
+    max_val = float("0.0954757")
+    mean = float("-5.31989e-06")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_183:
+    name = "parameter_183"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0940111")
+    max_val = float("0.0973873")
+    mean = float("-2.33031e-05")
+    std = float("0.0199905")
+    data = None
+
+
+class Program_weight_tensor_parameter_184:
+    name = "parameter_184"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0938948")
+    max_val = float("0.0979914")
+    mean = float("-6.45665e-06")
+    std = float("0.020008")
+    data = None
+
+
+class Program_weight_tensor_parameter_185:
+    name = "parameter_185"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0941754")
+    max_val = float("0.0935313")
+    mean = float("-1.9268e-05")
+    std = float("0.02001")
+    data = None
+
+
+class Program_weight_tensor_parameter_186:
+    name = "parameter_186"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0908799")
+    max_val = float("0.0943659")
+    mean = float("5.38934e-06")
+    std = float("0.0200054")
+    data = None
+
+
+class Program_weight_tensor_parameter_187:
+    name = "parameter_187"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_188:
+    name = "parameter_188"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.10681")
+    max_val = float("0.107106")
+    mean = float("-5.9112e-06")
+    std = float("0.0200055")
+    data = None
+
+
+class Program_weight_tensor_parameter_189:
+    name = "parameter_189"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_190:
+    name = "parameter_190"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100362")
+    max_val = float("0.102567")
+    mean = float("3.15704e-06")
+    std = float("0.0200096")
+    data = None
+
+
+class Program_weight_tensor_parameter_191:
+    name = "parameter_191"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_192:
+    name = "parameter_192"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_193:
+    name = "parameter_193"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_194:
+    name = "parameter_194"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_195:
+    name = "parameter_195"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.066251")
+    max_val = float("0.0656874")
+    mean = float("0.00064487")
+    std = float("0.0199795")
+    data = None
+
+
+class Program_weight_tensor_parameter_196:
+    name = "parameter_196"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0616622")
+    max_val = float("0.0572829")
+    mean = float("0.00029421")
+    std = float("0.0202689")
+    data = None
+
+
+class Program_weight_tensor_parameter_197:
+    name = "parameter_197"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0663683")
+    max_val = float("0.0598881")
+    mean = float("0.000626146")
+    std = float("0.0199059")
+    data = None
+
+
+class Program_weight_tensor_parameter_198:
+    name = "parameter_198"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.059461")
+    max_val = float("0.0599763")
+    mean = float("0.000392374")
+    std = float("0.0192713")
+    data = None
+
+
+class Program_weight_tensor_parameter_199:
+    name = "parameter_199"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0920732")
+    max_val = float("0.0965993")
+    mean = float("1.22806e-05")
+    std = float("0.0199985")
+    data = None
+
+
+class Program_weight_tensor_parameter_200:
+    name = "parameter_200"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0935352")
+    max_val = float("0.0969375")
+    mean = float("-1.34561e-05")
+    std = float("0.0200127")
+    data = None
+
+
+class Program_weight_tensor_parameter_201:
+    name = "parameter_201"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0976724")
+    max_val = float("0.0972049")
+    mean = float("2.25131e-05")
+    std = float("0.0200015")
+    data = None
+
+
+class Program_weight_tensor_parameter_202:
+    name = "parameter_202"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0971643")
+    max_val = float("0.116639")
+    mean = float("4.43747e-06")
+    std = float("0.0199986")
+    data = None
+
+
+class Program_weight_tensor_parameter_203:
+    name = "parameter_203"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0941074")
+    max_val = float("0.108684")
+    mean = float("1.62897e-05")
+    std = float("0.020005")
+    data = None
+
+
+class Program_weight_tensor_parameter_204:
+    name = "parameter_204"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_205:
+    name = "parameter_205"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.101354")
+    max_val = float("0.112926")
+    mean = float("-9.28535e-06")
+    std = float("0.0199946")
+    data = None
+
+
+class Program_weight_tensor_parameter_206:
+    name = "parameter_206"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_207:
+    name = "parameter_207"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100693")
+    max_val = float("0.101043")
+    mean = float("9.13948e-06")
+    std = float("0.0200057")
+    data = None
+
+
+class Program_weight_tensor_parameter_208:
+    name = "parameter_208"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_209:
+    name = "parameter_209"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_210:
+    name = "parameter_210"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_211:
+    name = "parameter_211"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_212:
+    name = "parameter_212"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0762107")
+    max_val = float("0.06625")
+    mean = float("0.000400155")
+    std = float("0.0202677")
+    data = None
+
+
+class Program_weight_tensor_parameter_213:
+    name = "parameter_213"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0901191")
+    max_val = float("0.0675753")
+    mean = float("0.00117218")
+    std = float("0.02013")
+    data = None
+
+
+class Program_weight_tensor_parameter_214:
+    name = "parameter_214"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0612773")
+    max_val = float("0.0535311")
+    mean = float("-0.000230495")
+    std = float("0.020225")
+    data = None
+
+
+class Program_weight_tensor_parameter_215:
+    name = "parameter_215"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0577393")
+    max_val = float("0.062089")
+    mean = float("2.74119e-05")
+    std = float("0.0204406")
+    data = None
+
+
+class Program_weight_tensor_parameter_216:
+    name = "parameter_216"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0978597")
+    max_val = float("0.0916914")
+    mean = float("2.58244e-06")
+    std = float("0.0199936")
+    data = None
+
+
+class Program_weight_tensor_parameter_217:
+    name = "parameter_217"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0934566")
+    max_val = float("0.0899418")
+    mean = float("-3.3187e-05")
+    std = float("0.02")
+    data = None
+
+
+class Program_weight_tensor_parameter_218:
+    name = "parameter_218"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100767")
+    max_val = float("0.0982188")
+    mean = float("1.2552e-05")
+    std = float("0.0200214")
+    data = None
+
+
+class Program_weight_tensor_parameter_219:
+    name = "parameter_219"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0901681")
+    max_val = float("0.110304")
+    mean = float("-7.9722e-06")
+    std = float("0.0200046")
+    data = None
+
+
+class Program_weight_tensor_parameter_220:
+    name = "parameter_220"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0922998")
+    max_val = float("0.101705")
+    mean = float("-1.73963e-05")
+    std = float("0.0199696")
+    data = None
+
+
+class Program_weight_tensor_parameter_221:
+    name = "parameter_221"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_222:
+    name = "parameter_222"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.1041")
+    max_val = float("0.107634")
+    mean = float("9.2561e-06")
+    std = float("0.0199911")
+    data = None
+
+
+class Program_weight_tensor_parameter_223:
+    name = "parameter_223"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_224:
+    name = "parameter_224"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.100772")
+    max_val = float("0.102886")
+    mean = float("-2.07023e-05")
+    std = float("0.0200025")
+    data = None
+
+
+class Program_weight_tensor_parameter_225:
+    name = "parameter_225"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_226:
+    name = "parameter_226"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_227:
+    name = "parameter_227"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_228:
+    name = "parameter_228"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_229:
+    name = "parameter_229"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0757654")
+    max_val = float("0.0701599")
+    mean = float("-0.000318248")
+    std = float("0.019568")
+    data = None
+
+
+class Program_weight_tensor_parameter_230:
+    name = "parameter_230"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0698035")
+    max_val = float("0.0679004")
+    mean = float("0.000979192")
+    std = float("0.0197564")
+    data = None
+
+
+class Program_weight_tensor_parameter_231:
+    name = "parameter_231"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.059353")
+    max_val = float("0.067176")
+    mean = float("2.78118e-05")
+    std = float("0.0196992")
+    data = None
+
+
+class Program_weight_tensor_parameter_232:
+    name = "parameter_232"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0697431")
+    max_val = float("0.0563663")
+    mean = float("0.000152303")
+    std = float("0.0199866")
+    data = None
+
+
+class Program_weight_tensor_parameter_233:
+    name = "parameter_233"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100614")
+    max_val = float("0.0912606")
+    mean = float("2.82769e-06")
+    std = float("0.0199952")
+    data = None
+
+
+class Program_weight_tensor_parameter_234:
+    name = "parameter_234"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0963025")
+    max_val = float("0.0936212")
+    mean = float("2.50608e-05")
+    std = float("0.020014")
+    data = None
+
+
+class Program_weight_tensor_parameter_235:
+    name = "parameter_235"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0964591")
+    max_val = float("0.0953656")
+    mean = float("4.43783e-06")
+    std = float("0.0199904")
+    data = None
+
+
+class Program_weight_tensor_parameter_236:
+    name = "parameter_236"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0994844")
+    max_val = float("0.100083")
+    mean = float("4.20366e-06")
+    std = float("0.0200082")
+    data = None
+
+
+class Program_weight_tensor_parameter_237:
+    name = "parameter_237"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0990775")
+    max_val = float("0.0973748")
+    mean = float("3.93006e-05")
+    std = float("0.0200132")
+    data = None
+
+
+class Program_weight_tensor_parameter_238:
+    name = "parameter_238"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_239:
+    name = "parameter_239"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0962434")
+    max_val = float("0.09773")
+    mean = float("2.75828e-06")
+    std = float("0.0200034")
+    data = None
+
+
+class Program_weight_tensor_parameter_240:
+    name = "parameter_240"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_241:
+    name = "parameter_241"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.10718")
+    max_val = float("0.0996396")
+    mean = float("1.37343e-05")
+    std = float("0.0200021")
+    data = None
+
+
+class Program_weight_tensor_parameter_242:
+    name = "parameter_242"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_243:
+    name = "parameter_243"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_244:
+    name = "parameter_244"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_245:
+    name = "parameter_245"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_246:
+    name = "parameter_246"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0572005")
+    max_val = float("0.0597119")
+    mean = float("-0.000229193")
+    std = float("0.0197843")
+    data = None
+
+
+class Program_weight_tensor_parameter_247:
+    name = "parameter_247"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0633685")
+    max_val = float("0.0622368")
+    mean = float("0.000740755")
+    std = float("0.0203714")
+    data = None
+
+
+class Program_weight_tensor_parameter_248:
+    name = "parameter_248"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0584085")
+    max_val = float("0.0657137")
+    mean = float("-0.000159375")
+    std = float("0.0202724")
+    data = None
+
+
+class Program_weight_tensor_parameter_249:
+    name = "parameter_249"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0803327")
+    max_val = float("0.0654359")
+    mean = float("-4.39397e-05")
+    std = float("0.0192414")
+    data = None
+
+
+class Program_weight_tensor_parameter_250:
+    name = "parameter_250"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0943897")
+    max_val = float("0.0953662")
+    mean = float("9.25647e-06")
+    std = float("0.0200079")
+    data = None
+
+
+class Program_weight_tensor_parameter_251:
+    name = "parameter_251"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100952")
+    max_val = float("0.0902493")
+    mean = float("-1.61662e-05")
+    std = float("0.019998")
+    data = None
+
+
+class Program_weight_tensor_parameter_252:
+    name = "parameter_252"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0941652")
+    max_val = float("0.0981683")
+    mean = float("-4.68069e-05")
+    std = float("0.0200021")
+    data = None
+
+
+class Program_weight_tensor_parameter_253:
+    name = "parameter_253"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0961579")
+    max_val = float("0.0861866")
+    mean = float("3.35187e-05")
+    std = float("0.0200144")
+    data = None
+
+
+class Program_weight_tensor_parameter_254:
+    name = "parameter_254"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.107999")
+    max_val = float("0.0947114")
+    mean = float("-7.68855e-07")
+    std = float("0.0199972")
+    data = None
+
+
+class Program_weight_tensor_parameter_255:
+    name = "parameter_255"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_256:
+    name = "parameter_256"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.102002")
+    max_val = float("0.100289")
+    mean = float("8.46317e-07")
+    std = float("0.0199958")
+    data = None
+
+
+class Program_weight_tensor_parameter_257:
+    name = "parameter_257"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_258:
+    name = "parameter_258"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0942694")
+    max_val = float("0.106696")
+    mean = float("-1.24755e-05")
+    std = float("0.0199964")
+    data = None
+
+
+class Program_weight_tensor_parameter_259:
+    name = "parameter_259"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_260:
+    name = "parameter_260"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_261:
+    name = "parameter_261"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_262:
+    name = "parameter_262"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_263:
+    name = "parameter_263"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0808637")
+    max_val = float("0.0661098")
+    mean = float("-0.000817654")
+    std = float("0.0201087")
+    data = None
+
+
+class Program_weight_tensor_parameter_264:
+    name = "parameter_264"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0648078")
+    max_val = float("0.0631008")
+    mean = float("-0.00095584")
+    std = float("0.0200808")
+    data = None
+
+
+class Program_weight_tensor_parameter_265:
+    name = "parameter_265"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0644025")
+    max_val = float("0.0644977")
+    mean = float("0.000808763")
+    std = float("0.0205874")
+    data = None
+
+
+class Program_weight_tensor_parameter_266:
+    name = "parameter_266"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0623088")
+    max_val = float("0.064596")
+    mean = float("-0.000765763")
+    std = float("0.0200499")
+    data = None
+
+
+class Program_weight_tensor_parameter_267:
+    name = "parameter_267"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.10159")
+    max_val = float("0.103478")
+    mean = float("4.39358e-05")
+    std = float("0.0199973")
+    data = None
+
+
+class Program_weight_tensor_parameter_268:
+    name = "parameter_268"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0902865")
+    max_val = float("0.0975484")
+    mean = float("-1.29312e-05")
+    std = float("0.0199991")
+    data = None
+
+
+class Program_weight_tensor_parameter_269:
+    name = "parameter_269"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0967346")
+    max_val = float("0.0973734")
+    mean = float("-2.19967e-05")
+    std = float("0.0200175")
+    data = None
+
+
+class Program_weight_tensor_parameter_270:
+    name = "parameter_270"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0959548")
+    max_val = float("0.108116")
+    mean = float("-3.80651e-05")
+    std = float("0.0199919")
+    data = None
+
+
+class Program_weight_tensor_parameter_271:
+    name = "parameter_271"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0958974")
+    max_val = float("0.0982373")
+    mean = float("2.84582e-06")
+    std = float("0.020007")
+    data = None
+
+
+class Program_weight_tensor_parameter_272:
+    name = "parameter_272"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_273:
+    name = "parameter_273"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.100233")
+    max_val = float("0.10353")
+    mean = float("1.35232e-05")
+    std = float("0.0200085")
+    data = None
+
+
+class Program_weight_tensor_parameter_274:
+    name = "parameter_274"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_275:
+    name = "parameter_275"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0984514")
+    max_val = float("0.103702")
+    mean = float("2.04813e-06")
+    std = float("0.0200084")
+    data = None
+
+
+class Program_weight_tensor_parameter_276:
+    name = "parameter_276"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_277:
+    name = "parameter_277"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_278:
+    name = "parameter_278"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_279:
+    name = "parameter_279"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_280:
+    name = "parameter_280"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0625527")
+    max_val = float("0.067685")
+    mean = float("-7.07414e-05")
+    std = float("0.019375")
+    data = None
+
+
+class Program_weight_tensor_parameter_281:
+    name = "parameter_281"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0639338")
+    max_val = float("0.0624786")
+    mean = float("0.00068492")
+    std = float("0.0202815")
+    data = None
+
+
+class Program_weight_tensor_parameter_282:
+    name = "parameter_282"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0621922")
+    max_val = float("0.0678433")
+    mean = float("-0.000639305")
+    std = float("0.0202925")
+    data = None
+
+
+class Program_weight_tensor_parameter_283:
+    name = "parameter_283"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0639842")
+    max_val = float("0.0744252")
+    mean = float("0.000404183")
+    std = float("0.0197421")
+    data = None
+
+
+class Program_weight_tensor_parameter_284:
+    name = "parameter_284"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0925419")
+    max_val = float("0.0927665")
+    mean = float("3.65188e-06")
+    std = float("0.0200096")
+    data = None
+
+
+class Program_weight_tensor_parameter_285:
+    name = "parameter_285"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101495")
+    max_val = float("0.0973781")
+    mean = float("1.73695e-05")
+    std = float("0.0199796")
+    data = None
+
+
+class Program_weight_tensor_parameter_286:
+    name = "parameter_286"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0978203")
+    max_val = float("0.0972203")
+    mean = float("-1.14079e-05")
+    std = float("0.0200017")
+    data = None
+
+
+class Program_weight_tensor_parameter_287:
+    name = "parameter_287"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0956221")
+    max_val = float("0.104711")
+    mean = float("1.37487e-05")
+    std = float("0.0200021")
+    data = None
+
+
+class Program_weight_tensor_parameter_288:
+    name = "parameter_288"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0915618")
+    max_val = float("0.108204")
+    mean = float("2.93813e-05")
+    std = float("0.0200081")
+    data = None
+
+
+class Program_weight_tensor_parameter_289:
+    name = "parameter_289"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_290:
+    name = "parameter_290"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.114183")
+    max_val = float("0.104422")
+    mean = float("7.3287e-06")
+    std = float("0.0199976")
+    data = None
+
+
+class Program_weight_tensor_parameter_291:
+    name = "parameter_291"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_292:
+    name = "parameter_292"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101349")
+    max_val = float("0.10013")
+    mean = float("7.6001e-07")
+    std = float("0.0199979")
+    data = None
+
+
+class Program_weight_tensor_parameter_293:
+    name = "parameter_293"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_294:
+    name = "parameter_294"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_295:
+    name = "parameter_295"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_296:
+    name = "parameter_296"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_297:
+    name = "parameter_297"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0645872")
+    max_val = float("0.0716855")
+    mean = float("-0.000495045")
+    std = float("0.0195812")
+    data = None
+
+
+class Program_weight_tensor_parameter_298:
+    name = "parameter_298"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0611582")
+    max_val = float("0.0727321")
+    mean = float("-0.000392967")
+    std = float("0.0201977")
+    data = None
+
+
+class Program_weight_tensor_parameter_299:
+    name = "parameter_299"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0581971")
+    max_val = float("0.0598608")
+    mean = float("0.000397432")
+    std = float("0.0191885")
+    data = None
+
+
+class Program_weight_tensor_parameter_300:
+    name = "parameter_300"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0564572")
+    max_val = float("0.073037")
+    mean = float("0.00153765")
+    std = float("0.0206363")
+    data = None
+
+
+class Program_weight_tensor_parameter_301:
+    name = "parameter_301"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0962484")
+    max_val = float("0.0942868")
+    mean = float("8.79621e-06")
+    std = float("0.0200071")
+    data = None
+
+
+class Program_weight_tensor_parameter_302:
+    name = "parameter_302"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0956998")
+    max_val = float("0.0942656")
+    mean = float("-3.91776e-06")
+    std = float("0.019995")
+    data = None
+
+
+class Program_weight_tensor_parameter_303:
+    name = "parameter_303"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0909612")
+    max_val = float("0.105077")
+    mean = float("4.49877e-05")
+    std = float("0.0199845")
+    data = None
+
+
+class Program_weight_tensor_parameter_304:
+    name = "parameter_304"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105015")
+    max_val = float("0.0947171")
+    mean = float("-1.10435e-05")
+    std = float("0.0200096")
+    data = None
+
+
+class Program_weight_tensor_parameter_305:
+    name = "parameter_305"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.100484")
+    max_val = float("0.098835")
+    mean = float("-1.65579e-05")
+    std = float("0.0200025")
+    data = None
+
+
+class Program_weight_tensor_parameter_306:
+    name = "parameter_306"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_307:
+    name = "parameter_307"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.110146")
+    max_val = float("0.109618")
+    mean = float("7.97491e-06")
+    std = float("0.0199906")
+    data = None
+
+
+class Program_weight_tensor_parameter_308:
+    name = "parameter_308"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_309:
+    name = "parameter_309"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.101873")
+    max_val = float("0.105303")
+    mean = float("5.83982e-06")
+    std = float("0.0199958")
+    data = None
+
+
+class Program_weight_tensor_parameter_310:
+    name = "parameter_310"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_311:
+    name = "parameter_311"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_312:
+    name = "parameter_312"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_313:
+    name = "parameter_313"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_314:
+    name = "parameter_314"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0833993")
+    max_val = float("0.0664246")
+    mean = float("-0.000166527")
+    std = float("0.0204661")
+    data = None
+
+
+class Program_weight_tensor_parameter_315:
+    name = "parameter_315"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0728898")
+    max_val = float("0.0616484")
+    mean = float("-0.000625521")
+    std = float("0.0198375")
+    data = None
+
+
+class Program_weight_tensor_parameter_316:
+    name = "parameter_316"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0652227")
+    max_val = float("0.0543112")
+    mean = float("0.00134297")
+    std = float("0.0200099")
+    data = None
+
+
+class Program_weight_tensor_parameter_317:
+    name = "parameter_317"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0576651")
+    max_val = float("0.0620159")
+    mean = float("-0.000763434")
+    std = float("0.0200541")
+    data = None
+
+
+class Program_weight_tensor_parameter_318:
+    name = "parameter_318"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0959107")
+    max_val = float("0.097618")
+    mean = float("1.68048e-05")
+    std = float("0.0199765")
+    data = None
+
+
+class Program_weight_tensor_parameter_319:
+    name = "parameter_319"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.102617")
+    max_val = float("0.102757")
+    mean = float("-2.87196e-06")
+    std = float("0.020003")
+    data = None
+
+
+class Program_weight_tensor_parameter_320:
+    name = "parameter_320"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0937724")
+    max_val = float("0.100478")
+    mean = float("-4.96675e-08")
+    std = float("0.0200332")
+    data = None
+
+
+class Program_weight_tensor_parameter_321:
+    name = "parameter_321"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0923792")
+    max_val = float("0.0992109")
+    mean = float("-1.49416e-05")
+    std = float("0.0200146")
+    data = None
+
+
+class Program_weight_tensor_parameter_322:
+    name = "parameter_322"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0942468")
+    max_val = float("0.0981391")
+    mean = float("-1.31399e-05")
+    std = float("0.0200129")
+    data = None
+
+
+class Program_weight_tensor_parameter_323:
+    name = "parameter_323"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_324:
+    name = "parameter_324"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0986392")
+    max_val = float("0.0974826")
+    mean = float("1.40878e-05")
+    std = float("0.0200022")
+    data = None
+
+
+class Program_weight_tensor_parameter_325:
+    name = "parameter_325"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_326:
+    name = "parameter_326"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0987421")
+    max_val = float("0.100351")
+    mean = float("-3.03125e-06")
+    std = float("0.0199945")
+    data = None
+
+
+class Program_weight_tensor_parameter_327:
+    name = "parameter_327"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_328:
+    name = "parameter_328"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_329:
+    name = "parameter_329"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_330:
+    name = "parameter_330"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_331:
+    name = "parameter_331"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0775364")
+    max_val = float("0.0678114")
+    mean = float("-0.000527159")
+    std = float("0.0202819")
+    data = None
+
+
+class Program_weight_tensor_parameter_332:
+    name = "parameter_332"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0583356")
+    max_val = float("0.0583862")
+    mean = float("-5.84847e-05")
+    std = float("0.0197946")
+    data = None
+
+
+class Program_weight_tensor_parameter_333:
+    name = "parameter_333"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0783503")
+    max_val = float("0.0747464")
+    mean = float("-9.03913e-06")
+    std = float("0.020419")
+    data = None
+
+
+class Program_weight_tensor_parameter_334:
+    name = "parameter_334"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0588559")
+    max_val = float("0.0589071")
+    mean = float("-0.000510391")
+    std = float("0.0195717")
+    data = None
+
+
+class Program_weight_tensor_parameter_335:
+    name = "parameter_335"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0965073")
+    max_val = float("0.101522")
+    mean = float("5.3999e-06")
+    std = float("0.0200109")
+    data = None
+
+
+class Program_weight_tensor_parameter_336:
+    name = "parameter_336"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0938622")
+    max_val = float("0.10364")
+    mean = float("-2.70096e-05")
+    std = float("0.0200286")
+    data = None
+
+
+class Program_weight_tensor_parameter_337:
+    name = "parameter_337"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0904915")
+    max_val = float("0.0892064")
+    mean = float("5.79199e-06")
+    std = float("0.0200299")
+    data = None
+
+
+class Program_weight_tensor_parameter_338:
+    name = "parameter_338"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0905588")
+    max_val = float("0.103688")
+    mean = float("-4.00367e-06")
+    std = float("0.0200005")
+    data = None
+
+
+class Program_weight_tensor_parameter_339:
+    name = "parameter_339"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0945638")
+    max_val = float("0.0893655")
+    mean = float("-8.73476e-06")
+    std = float("0.0199973")
+    data = None
+
+
+class Program_weight_tensor_parameter_340:
+    name = "parameter_340"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_341:
+    name = "parameter_341"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.10733")
+    max_val = float("0.106369")
+    mean = float("2.19368e-07")
+    std = float("0.0199976")
+    data = None
+
+
+class Program_weight_tensor_parameter_342:
+    name = "parameter_342"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_343:
+    name = "parameter_343"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0957499")
+    max_val = float("0.115042")
+    mean = float("-9.20385e-08")
+    std = float("0.0200026")
+    data = None
+
+
+class Program_weight_tensor_parameter_344:
+    name = "parameter_344"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_345:
+    name = "parameter_345"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_346:
+    name = "parameter_346"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_347:
+    name = "parameter_347"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_348:
+    name = "parameter_348"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0604663")
+    max_val = float("0.0795493")
+    mean = float("4.21371e-05")
+    std = float("0.0199298")
+    data = None
+
+
+class Program_weight_tensor_parameter_349:
+    name = "parameter_349"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0630816")
+    max_val = float("0.0646421")
+    mean = float("0.0010257")
+    std = float("0.0197039")
+    data = None
+
+
+class Program_weight_tensor_parameter_350:
+    name = "parameter_350"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0692722")
+    max_val = float("0.0626657")
+    mean = float("-0.000970121")
+    std = float("0.0204877")
+    data = None
+
+
+class Program_weight_tensor_parameter_351:
+    name = "parameter_351"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0622362")
+    max_val = float("0.0625566")
+    mean = float("0.000481886")
+    std = float("0.0193751")
+    data = None
+
+
+class Program_weight_tensor_parameter_352:
+    name = "parameter_352"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0945452")
+    max_val = float("0.0955453")
+    mean = float("-5.62408e-06")
+    std = float("0.019999")
+    data = None
+
+
+class Program_weight_tensor_parameter_353:
+    name = "parameter_353"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.096623")
+    max_val = float("0.096632")
+    mean = float("-3.12025e-05")
+    std = float("0.0200064")
+    data = None
+
+
+class Program_weight_tensor_parameter_354:
+    name = "parameter_354"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.104093")
+    max_val = float("0.10271")
+    mean = float("2.11826e-05")
+    std = float("0.0200002")
+    data = None
+
+
+class Program_weight_tensor_parameter_355:
+    name = "parameter_355"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101074")
+    max_val = float("0.0974191")
+    mean = float("1.87963e-05")
+    std = float("0.0200046")
+    data = None
+
+
+class Program_weight_tensor_parameter_356:
+    name = "parameter_356"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0949054")
+    max_val = float("0.0949776")
+    mean = float("3.0356e-05")
+    std = float("0.0200197")
+    data = None
+
+
+class Program_weight_tensor_parameter_357:
+    name = "parameter_357"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_358:
+    name = "parameter_358"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0998529")
+    max_val = float("0.100143")
+    mean = float("-1.32428e-05")
+    std = float("0.0200092")
+    data = None
+
+
+class Program_weight_tensor_parameter_359:
+    name = "parameter_359"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_360:
+    name = "parameter_360"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0979326")
+    max_val = float("0.0999568")
+    mean = float("-7.35727e-06")
+    std = float("0.0199956")
+    data = None
+
+
+class Program_weight_tensor_parameter_361:
+    name = "parameter_361"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_362:
+    name = "parameter_362"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_363:
+    name = "parameter_363"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_364:
+    name = "parameter_364"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_365:
+    name = "parameter_365"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0653577")
+    max_val = float("0.0607269")
+    mean = float("6.57215e-05")
+    std = float("0.0197316")
+    data = None
+
+
+class Program_weight_tensor_parameter_366:
+    name = "parameter_366"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0630385")
+    max_val = float("0.0739176")
+    mean = float("3.99468e-05")
+    std = float("0.0201011")
+    data = None
+
+
+class Program_weight_tensor_parameter_367:
+    name = "parameter_367"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0548557")
+    max_val = float("0.0849987")
+    mean = float("-0.000277907")
+    std = float("0.0197348")
+    data = None
+
+
+class Program_weight_tensor_parameter_368:
+    name = "parameter_368"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0676048")
+    max_val = float("0.0741839")
+    mean = float("-0.000614193")
+    std = float("0.0193521")
+    data = None
+
+
+class Program_weight_tensor_parameter_369:
+    name = "parameter_369"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0882833")
+    max_val = float("0.100105")
+    mean = float("1.57479e-05")
+    std = float("0.0200117")
+    data = None
+
+
+class Program_weight_tensor_parameter_370:
+    name = "parameter_370"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0898517")
+    max_val = float("0.101617")
+    mean = float("-2.55895e-05")
+    std = float("0.019994")
+    data = None
+
+
+class Program_weight_tensor_parameter_371:
+    name = "parameter_371"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0981875")
+    max_val = float("0.097163")
+    mean = float("-1.07195e-05")
+    std = float("0.0199935")
+    data = None
+
+
+class Program_weight_tensor_parameter_372:
+    name = "parameter_372"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.101081")
+    max_val = float("0.0956739")
+    mean = float("2.08879e-05")
+    std = float("0.0200277")
+    data = None
+
+
+class Program_weight_tensor_parameter_373:
+    name = "parameter_373"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0991906")
+    max_val = float("0.0970829")
+    mean = float("6.88477e-06")
+    std = float("0.0200219")
+    data = None
+
+
+class Program_weight_tensor_parameter_374:
+    name = "parameter_374"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_375:
+    name = "parameter_375"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.115641")
+    max_val = float("0.098508")
+    mean = float("-6.77359e-06")
+    std = float("0.0199976")
+    data = None
+
+
+class Program_weight_tensor_parameter_376:
+    name = "parameter_376"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_377:
+    name = "parameter_377"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.0984785")
+    max_val = float("0.0997063")
+    mean = float("-8.42396e-07")
+    std = float("0.0200093")
+    data = None
+
+
+class Program_weight_tensor_parameter_378:
+    name = "parameter_378"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_379:
+    name = "parameter_379"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_380:
+    name = "parameter_380"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_381:
+    name = "parameter_381"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_382:
+    name = "parameter_382"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0638253")
+    max_val = float("0.0755663")
+    mean = float("0.00015895")
+    std = float("0.0202998")
+    data = None
+
+
+class Program_weight_tensor_parameter_383:
+    name = "parameter_383"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0589826")
+    max_val = float("0.0668031")
+    mean = float("0.000758693")
+    std = float("0.0198609")
+    data = None
+
+
+class Program_weight_tensor_parameter_384:
+    name = "parameter_384"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.066861")
+    max_val = float("0.0602476")
+    mean = float("0.000222833")
+    std = float("0.0206746")
+    data = None
+
+
+class Program_weight_tensor_parameter_385:
+    name = "parameter_385"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0601304")
+    max_val = float("0.0608498")
+    mean = float("-0.000389166")
+    std = float("0.0197741")
+    data = None
+
+
+class Program_weight_tensor_parameter_386:
+    name = "parameter_386"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.095009")
+    max_val = float("0.107419")
+    mean = float("1.37709e-05")
+    std = float("0.020002")
+    data = None
+
+
+class Program_weight_tensor_parameter_387:
+    name = "parameter_387"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.103219")
+    max_val = float("0.0868753")
+    mean = float("1.44e-06")
+    std = float("0.0200017")
+    data = None
+
+
+class Program_weight_tensor_parameter_388:
+    name = "parameter_388"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0906494")
+    max_val = float("0.095732")
+    mean = float("6.41013e-07")
+    std = float("0.0199844")
+    data = None
+
+
+class Program_weight_tensor_parameter_389:
+    name = "parameter_389"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0943175")
+    max_val = float("0.0916165")
+    mean = float("-1.77288e-05")
+    std = float("0.0200087")
+    data = None
+
+
+class Program_weight_tensor_parameter_390:
+    name = "parameter_390"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0939238")
+    max_val = float("0.0962935")
+    mean = float("-3.63531e-06")
+    std = float("0.0200132")
+    data = None
+
+
+class Program_weight_tensor_parameter_391:
+    name = "parameter_391"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_392:
+    name = "parameter_392"
+    shape = [4096, 1024]
+    dtype = "float32"
+    min_val = float("-0.0974197")
+    max_val = float("0.104597")
+    mean = float("1.10199e-06")
+    std = float("0.0199984")
+    data = None
+
+
+class Program_weight_tensor_parameter_393:
+    name = "parameter_393"
+    shape = [4096]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_394:
+    name = "parameter_394"
+    shape = [1024, 4096]
+    dtype = "float32"
+    min_val = float("-0.123339")
+    max_val = float("0.106628")
+    mean = float("1.25406e-05")
+    std = float("0.0199984")
+    data = None
+
+
+class Program_weight_tensor_parameter_395:
+    name = "parameter_395"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_396:
+    name = "parameter_396"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_397:
+    name = "parameter_397"
+    shape = [1024]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_398:
+    name = "parameter_398"
+    shape = [1024]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_399:
+    name = "parameter_399"
+    shape = [2, 16, 64]
+    dtype = "float32"
+    min_val = float("-0.0816646")
+    max_val = float("0.0571751")
+    mean = float("-0.00102561")
+    std = float("0.0201216")
+    data = None
+
+
+class Program_weight_tensor_parameter_400:
+    name = "parameter_400"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0656832")
+    max_val = float("0.062296")
+    mean = float("-0.000532733")
+    std = float("0.0202617")
+    data = None
+
+
+class Program_weight_tensor_parameter_401:
+    name = "parameter_401"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0586502")
+    max_val = float("0.0668614")
+    mean = float("0.000874736")
+    std = float("0.0200638")
+    data = None
+
+
+class Program_weight_tensor_parameter_402:
+    name = "parameter_402"
+    shape = [16, 64]
+    dtype = "float32"
+    min_val = float("-0.0553704")
+    max_val = float("0.0684032")
+    mean = float("-0.000129585")
+    std = float("0.0194602")
+    data = None
+
+
+class Program_weight_tensor_parameter_403:
+    name = "parameter_403"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0992038")
+    max_val = float("0.0909653")
+    mean = float("-1.80797e-05")
+    std = float("0.0199992")
+    data = None
+
+
+class Program_weight_tensor_parameter_404:
+    name = "parameter_404"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.114171")
+    max_val = float("0.0952026")
+    mean = float("8.99988e-06")
+    std = float("0.0200014")
+    data = None
+
+
+class Program_weight_tensor_parameter_405:
+    name = "parameter_405"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.09525")
+    max_val = float("0.100321")
+    mean = float("-2.06329e-05")
+    std = float("0.0199986")
+    data = None
+
+
+class Program_weight_tensor_parameter_406:
+    name = "parameter_406"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.0894399")
+    max_val = float("0.0966649")
+    mean = float("1.86653e-05")
+    std = float("0.019994")
+    data = None
+
+
+class Program_weight_tensor_parameter_407:
+    name = "parameter_407"
+    shape = [1024, 1024]
+    dtype = "float32"
+    min_val = float("-0.105839")
+    max_val = float("0.0950324")
+    mean = float("-1.29832e-05")
+    std = float("0.0200056")
+    data = None
+
+
+class Program_weight_tensor_parameter_408:
+    name = "parameter_408"
+    shape = [32000, 1024]
+    dtype = "float32"
+    min_val = float("-0.109282")
+    max_val = float("0.10991")
+    mean = float("3.50928e-06")
+    std = float("0.0200011")
+    data = None
+
+
+class Program_weight_tensor_parameter_409:
+    name = "parameter_409"
+    shape = [1, 1, 1024]
+    dtype = "float32"
+    min_val = float("-0.0618999")
+    max_val = float("0.0796384")
+    mean = float("-0.000848671")
+    std = float("0.0199154")
+    data = None

From adadf101a0ba10240ea11a774a63fb2c25d33320 Mon Sep 17 00:00:00 2001
From: RbRe145 <czheng12399@outlook.com>
Date: Thu, 25 Sep 2025 07:55:47 +0000
Subject: [PATCH 3/4] add new albert and t5 models

---
 graph_net/test/nlp_model_getter.py            |   90 +
 .../PaddleNLP/albert-base-v1/graph_net.json   |    6 +
 .../PaddleNLP/albert-base-v1/input_meta.py    |   41 +
 .../PaddleNLP/albert-base-v1/model.py         | 1900 ++++++++++
 .../PaddleNLP/albert-base-v1/weight_meta.py   |  235 ++
 .../PaddleNLP/albert-base-v2/graph_net.json   |    6 +
 .../PaddleNLP/albert-base-v2/input_meta.py    |   41 +
 .../PaddleNLP/albert-base-v2/model.py         | 2003 ++++++++++
 .../PaddleNLP/albert-base-v2/weight_meta.py   |  235 ++
 .../albert-chinese-base/graph_net.json        |    6 +
 .../albert-chinese-base/input_meta.py         |   19 +
 .../PaddleNLP/albert-chinese-base/model.py    | 1670 +++++++++
 .../albert-chinese-base/weight_meta.py        |  235 ++
 .../albert-chinese-small/graph_net.json       |    6 +
 .../albert-chinese-small/input_meta.py        |   19 +
 .../PaddleNLP/albert-chinese-small/model.py   |  914 +++++
 .../albert-chinese-small/weight_meta.py       |  237 ++
 .../albert-chinese-tiny/graph_net.json        |    6 +
 .../albert-chinese-tiny/input_meta.py         |   19 +
 .../PaddleNLP/albert-chinese-tiny/model.py    |  662 ++++
 .../albert-chinese-tiny/weight_meta.py        |  235 ++
 .../PaddleNLP/t5-small/graph_net.json         |    6 +
 .../PaddleNLP/t5-small/input_meta.py          |   40 +
 paddle_samples/PaddleNLP/t5-small/model.py    | 3317 +++++++++++++++++
 .../PaddleNLP/t5-small/weight_meta.py         | 1439 +++++++
 25 files changed, 13387 insertions(+)
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v1/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v1/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v1/model.py
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v1/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v2/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v2/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v2/model.py
 create mode 100644 paddle_samples/PaddleNLP/albert-base-v2/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-base/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-base/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-base/model.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-base/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-small/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-small/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-small/model.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-small/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-tiny/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-tiny/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-tiny/model.py
 create mode 100644 paddle_samples/PaddleNLP/albert-chinese-tiny/weight_meta.py
 create mode 100644 paddle_samples/PaddleNLP/t5-small/graph_net.json
 create mode 100644 paddle_samples/PaddleNLP/t5-small/input_meta.py
 create mode 100644 paddle_samples/PaddleNLP/t5-small/model.py
 create mode 100644 paddle_samples/PaddleNLP/t5-small/weight_meta.py

diff --git a/graph_net/test/nlp_model_getter.py b/graph_net/test/nlp_model_getter.py
index 5ce710b24..d795f7e30 100644
--- a/graph_net/test/nlp_model_getter.py
+++ b/graph_net/test/nlp_model_getter.py
@@ -154,3 +154,93 @@ def get_xlnet_model_and_inputs(model_name, text, dtype):
         enc["attention_mask"] = (input_ids != pad_id).astype("int64")
 
     return model, enc
+
+
+def get_t5_model_and_inputs(model_name, text, dtype):
+    """Load T5ForConditionalGeneration in eval mode and build its input dict."""
+    import paddle
+    from paddlenlp.transformers import T5ForConditionalGeneration, T5Tokenizer
+
+    # 1) Tokenizer first, so its pad_token_id is available below
+    tokenizer = T5Tokenizer.from_pretrained(model_name)
+
+    # 2) Tokenize (original note: a single text or a batch of texts)
+    enc = tokenizer(
+        text,
+        return_tensors="pd",
+        padding=True,
+        truncation=True,
+        max_length=512,
+    )
+
+    # Add attention_mask if missing: 0 at pad positions, 1 elsewhere
+    if "attention_mask" not in enc:
+        input_ids = enc["input_ids"]
+        attn_mask = (input_ids != tokenizer.pad_token_id).astype("int64")
+        enc["attention_mask"] = attn_mask
+
+    # One start token per row; original note: T5 starts decoding from pad_token_id
+    batch_size = enc["input_ids"].shape[0]
+    decoder_input_ids = paddle.full(
+        shape=[batch_size, 1],
+        fill_value=tokenizer.pad_token_id,
+        dtype="int64",
+    )
+
+    # 3) Load the pretrained model, optionally cast to float16, switch to eval
+    model = T5ForConditionalGeneration.from_pretrained(model_name)
+    if dtype == "float16":
+        model = model.astype(paddle.float16)
+    model.eval()
+
+    # 4) Assemble the inputs fed to the model
+    inputs = {
+        "input_ids": enc["input_ids"],
+        "attention_mask": enc["attention_mask"],
+        "decoder_input_ids": decoder_input_ids,
+    }
+    return model, inputs
+
+
+def get_albert_model_and_inputs(model_name, text, dtype):
+    """
+    Build an AlbertModel backbone from its config and tokenize ``text`` for it.
+    - model_name: e.g. "albert-base-v2", "albert-xxlarge-v1" (PaddleNLP names)
+    - dtype: the model is cast to float16 when this equals "float16"
+    Returns: (model in eval mode, tokenizer output with "attention_mask")
+    """
+    import paddle
+    from paddlenlp.transformers import AlbertConfig, AlbertModel, AlbertTokenizer
+
+    # 1) Load the config only (original note: this avoids a weight download)
+    config = AlbertConfig.from_pretrained(model_name)
+
+    # 2) Build the model from config: no pretrained checkpoint is loaded here,
+    #    so the weights are whatever AlbertModel's constructor initializes.
+    model = AlbertModel(config)
+    if dtype == "float16":
+        model = model.astype(paddle.float16)
+    model.eval()
+
+    # 3) Tokenizer
+    tokenizer = AlbertTokenizer.from_pretrained(model_name)
+
+    # No pad_token: fall back to unk_token (original note: ALBERT has no eos_token)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.unk_token
+
+    # 4) Tokenize (original note: text may be a str or a List[str])
+    enc = tokenizer(
+        text,
+        return_tensors="pd",
+        padding=True,
+        truncation=True,
+        max_length=512,
+    )
+
+    # Add attention_mask if the tokenizer did not return one (0 at pad positions)
+    if "attention_mask" not in enc:
+        input_ids = enc["input_ids"]
+        enc["attention_mask"] = (input_ids != tokenizer.pad_token_id).astype("int64")
+
+    return model, enc
diff --git a/paddle_samples/PaddleNLP/albert-base-v1/graph_net.json b/paddle_samples/PaddleNLP/albert-base-v1/graph_net.json
new file mode 100644
index 000000000..e0b36802b
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v1/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "albert-base-v1",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/albert-base-v1/input_meta.py b/paddle_samples/PaddleNLP/albert-base-v1/input_meta.py
new file mode 100644
index 000000000..b45834638
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v1/input_meta.py
@@ -0,0 +1,41 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 21]
+    dtype = "int64"
+    data = [
+        2,
+        10975,
+        15,
+        51,
+        204,
+        25,
+        1909,
+        9,
+        31,
+        589,
+        2477,
+        88,
+        370,
+        816,
+        2761,
+        17,
+        66,
+        2607,
+        18,
+        9,
+        3,
+    ]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 21]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 21]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
diff --git a/paddle_samples/PaddleNLP/albert-base-v1/model.py b/paddle_samples/PaddleNLP/albert-base-v1/model.py
new file mode 100644
index 000000000..d13518e69
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v1/model.py
@@ -0,0 +1,1900 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [1]
+
+        # pd_op.unsqueeze: (1x1x21xi64) <- (1x21xi64, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(data_1, full_int_array_0)
+        del data_1
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [2]
+
+        # pd_op.unsqueeze: (1x1x1x21xi64) <- (1x1x21xi64, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.cast: (1x1x1x21xf32) <- (1x1x1x21xi64)
+        cast_0 = paddle._C_ops.cast(unsqueeze_1, paddle.float32)
+        del unsqueeze_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x21xf32) <- (1x1x1x21xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0, full_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x21xf32) <- (1x1x1x21xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(scale_0, full_1, float("0"), True)
+        del full_1, scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_2 = [0]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [21]
+
+        # pd_op.slice: (1x21xi64) <- (1x512xi64, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            parameter_0, [1], full_int_array_2, full_int_array_3, [1], []
+        )
+        del full_int_array_3, parameter_0
+
+        # pd_op.embedding: (1x21x128xf32) <- (1x21xi64, 30000x128xf32)
+        embedding_0 = paddle._C_ops.embedding(data_0, parameter_25, 0, False)
+        del data_0, parameter_25
+
+        # pd_op.embedding: (1x21x128xf32) <- (1x21xi64, 2x128xf32)
+        embedding_1 = paddle._C_ops.embedding(data_2, parameter_23, -1, False)
+        del data_2, parameter_23
+
+        # pd_op.add: (1x21x128xf32) <- (1x21x128xf32, 1x21x128xf32)
+        add_0 = paddle._C_ops.add(embedding_0, embedding_1)
+        del embedding_0, embedding_1
+
+        # pd_op.embedding: (1x21x128xf32) <- (1x21xi64, 512x128xf32)
+        embedding_2 = paddle._C_ops.embedding(slice_0, parameter_24, -1, False)
+        del parameter_24, slice_0
+
+        # pd_op.add: (1x21x128xf32) <- (1x21x128xf32, 1x21x128xf32)
+        add_1 = paddle._C_ops.add(add_0, embedding_2)
+        del add_0, embedding_2
+
+        # pd_op.layer_norm: (1x21x128xf32, 1x21xf32, 1x21xf32) <- (1x21x128xf32, 128xf32, 128xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_1, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_1, parameter_21, parameter_22
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (1x21x128xf32, 1x21x128xui8) <- (1x21x128xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                layer_norm_0, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del layer_norm_0
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x128xf32, 128x768xf32)
+        matmul_0 = paddle._C_ops.matmul(dropout_0, parameter_20, False, False)
+        del dropout_0, parameter_20
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_2 = paddle._C_ops.add(matmul_0, parameter_19)
+        del matmul_0, parameter_19
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(add_2, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_3 = paddle._C_ops.add(matmul_1, parameter_15)
+        del matmul_1
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(add_2, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_4 = paddle._C_ops.add(matmul_2, parameter_13)
+        del matmul_2
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_3 = paddle._C_ops.matmul(add_2, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_5 = paddle._C_ops.add(matmul_3, parameter_11)
+        del matmul_3
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_4 = [1, 21, 12, 64]
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(add_3, full_int_array_4)
+        del add_3
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_0, [0, 2, 1, 3])
+        del reshape_0
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(add_4, full_int_array_4)
+        del add_4
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(add_5, full_int_array_4)
+        del add_5
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_4 = paddle._C_ops.matmul(transpose_0, transpose_1, False, True)
+        del transpose_0, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(matmul_4, full_3, float("0"), True)
+        del matmul_4
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_6 = paddle._C_ops.add(scale_2, scale_1)
+        del scale_2
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_0 = paddle._C_ops.softmax(add_6, -1)
+        del add_6
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_2, transpose_2, False, False)
+        del dropout_2, transpose_2
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_3 = paddle._C_ops.transpose(matmul_5, [0, 2, 1, 3])
+        del matmul_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_5 = [0, 0, -1]
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_3 = paddle._C_ops.reshape(transpose_3, full_int_array_5)
+        del transpose_3
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_6 = paddle._C_ops.matmul(reshape_3, parameter_10, False, False)
+        del reshape_3
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_7 = paddle._C_ops.add(matmul_6, parameter_9)
+        del matmul_6
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_7, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_8 = paddle._C_ops.add(add_2, dropout_4)
+        del add_2, dropout_4
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_8, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_8
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_9 = paddle._C_ops.add(matmul_7, parameter_5)
+        del matmul_7
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_0 = paddle._C_ops.gelu(add_9, False)
+        del add_9
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_8 = paddle._C_ops.matmul(gelu_0, parameter_4, False, False)
+        del gelu_0
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_10 = paddle._C_ops.add(matmul_8, parameter_3)
+        del matmul_8
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_11 = paddle._C_ops.add(add_10, layer_norm_3)
+        del add_10, layer_norm_3
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_11, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_11
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(layer_norm_6, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_12 = paddle._C_ops.add(matmul_9, parameter_15)
+        del matmul_9
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_13 = paddle._C_ops.add(matmul_10, parameter_13)
+        del matmul_10
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_11 = paddle._C_ops.matmul(layer_norm_6, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_14 = paddle._C_ops.add(matmul_11, parameter_11)
+        del matmul_11
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(add_12, full_int_array_4)
+        del add_12
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_4 = paddle._C_ops.transpose(reshape_4, [0, 2, 1, 3])
+        del reshape_4
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(add_13, full_int_array_4)
+        del add_13
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(add_14, full_int_array_4)
+        del add_14
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_12 = paddle._C_ops.matmul(transpose_4, transpose_5, False, True)
+        del transpose_4, transpose_5
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(matmul_12, full_3, float("0"), True)
+        del matmul_12
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_15 = paddle._C_ops.add(scale_3, scale_1)
+        del scale_3
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_1 = paddle._C_ops.softmax(add_15, -1)
+        del add_15
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_13 = paddle._C_ops.matmul(dropout_6, transpose_6, False, False)
+        del dropout_6, transpose_6
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_7 = paddle._C_ops.transpose(matmul_13, [0, 2, 1, 3])
+        del matmul_13
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_7 = paddle._C_ops.reshape(transpose_7, full_int_array_5)
+        del transpose_7
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_14 = paddle._C_ops.matmul(reshape_7, parameter_10, False, False)
+        del reshape_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_16 = paddle._C_ops.add(matmul_14, parameter_9)
+        del matmul_14
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_16, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_16
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_17 = paddle._C_ops.add(layer_norm_6, dropout_8)
+        del dropout_8, layer_norm_6
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_17, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_17
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_15 = paddle._C_ops.matmul(layer_norm_9, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_18 = paddle._C_ops.add(matmul_15, parameter_5)
+        del matmul_15
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_1 = paddle._C_ops.gelu(add_18, False)
+        del add_18
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_16 = paddle._C_ops.matmul(gelu_1, parameter_4, False, False)
+        del gelu_1
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_19 = paddle._C_ops.add(matmul_16, parameter_3)
+        del matmul_16
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_20 = paddle._C_ops.add(add_19, layer_norm_9)
+        del add_19, layer_norm_9
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_20, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_20
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_17 = paddle._C_ops.matmul(layer_norm_12, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_21 = paddle._C_ops.add(matmul_17, parameter_15)
+        del matmul_17
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_12, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_22 = paddle._C_ops.add(matmul_18, parameter_13)
+        del matmul_18
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_12, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_23 = paddle._C_ops.add(matmul_19, parameter_11)
+        del matmul_19
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(add_21, full_int_array_4)
+        del add_21
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_8 = paddle._C_ops.transpose(reshape_8, [0, 2, 1, 3])
+        del reshape_8
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(add_22, full_int_array_4)
+        del add_22
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(add_23, full_int_array_4)
+        del add_23
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_20 = paddle._C_ops.matmul(transpose_8, transpose_9, False, True)
+        del transpose_8, transpose_9
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(matmul_20, full_3, float("0"), True)
+        del matmul_20
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_24 = paddle._C_ops.add(scale_4, scale_1)
+        del scale_4
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_2 = paddle._C_ops.softmax(add_24, -1)
+        del add_24
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_10, transpose_10, False, False)
+        del dropout_10, transpose_10
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_11 = paddle._C_ops.transpose(matmul_21, [0, 2, 1, 3])
+        del matmul_21
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_11 = paddle._C_ops.reshape(transpose_11, full_int_array_5)
+        del transpose_11
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_22 = paddle._C_ops.matmul(reshape_11, parameter_10, False, False)
+        del reshape_11
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_25 = paddle._C_ops.add(matmul_22, parameter_9)
+        del matmul_22
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_25, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_25
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_26 = paddle._C_ops.add(layer_norm_12, dropout_12)
+        del dropout_12, layer_norm_12
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_26, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_26
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_23 = paddle._C_ops.matmul(layer_norm_15, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_27 = paddle._C_ops.add(matmul_23, parameter_5)
+        del matmul_23
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_2 = paddle._C_ops.gelu(add_27, False)
+        del add_27
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_24 = paddle._C_ops.matmul(gelu_2, parameter_4, False, False)
+        del gelu_2
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_28 = paddle._C_ops.add(matmul_24, parameter_3)
+        del matmul_24
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_29 = paddle._C_ops.add(add_28, layer_norm_15)
+        del add_28, layer_norm_15
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_29, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_29
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_18, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_30 = paddle._C_ops.add(matmul_25, parameter_15)
+        del matmul_25
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_18, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_31 = paddle._C_ops.add(matmul_26, parameter_13)
+        del matmul_26
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_27 = paddle._C_ops.matmul(layer_norm_18, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_32 = paddle._C_ops.add(matmul_27, parameter_11)
+        del matmul_27
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(add_30, full_int_array_4)
+        del add_30
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_12 = paddle._C_ops.transpose(reshape_12, [0, 2, 1, 3])
+        del reshape_12
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(add_31, full_int_array_4)
+        del add_31
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(add_32, full_int_array_4)
+        del add_32
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_28 = paddle._C_ops.matmul(transpose_12, transpose_13, False, True)
+        del transpose_12, transpose_13
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(matmul_28, full_3, float("0"), True)
+        del matmul_28
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_33 = paddle._C_ops.add(scale_5, scale_1)
+        del scale_5
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_3 = paddle._C_ops.softmax(add_33, -1)
+        del add_33
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_14, transpose_14, False, False)
+        del dropout_14, transpose_14
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_15 = paddle._C_ops.transpose(matmul_29, [0, 2, 1, 3])
+        del matmul_29
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_15 = paddle._C_ops.reshape(transpose_15, full_int_array_5)
+        del transpose_15
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_30 = paddle._C_ops.matmul(reshape_15, parameter_10, False, False)
+        del reshape_15
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_34 = paddle._C_ops.add(matmul_30, parameter_9)
+        del matmul_30
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_34, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_34
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_35 = paddle._C_ops.add(layer_norm_18, dropout_16)
+        del dropout_16, layer_norm_18
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_35, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_35
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_21, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_36 = paddle._C_ops.add(matmul_31, parameter_5)
+        del matmul_31
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_3 = paddle._C_ops.gelu(add_36, False)
+        del add_36
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_32 = paddle._C_ops.matmul(gelu_3, parameter_4, False, False)
+        del gelu_3
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_37 = paddle._C_ops.add(matmul_32, parameter_3)
+        del matmul_32
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_38 = paddle._C_ops.add(add_37, layer_norm_21)
+        del add_37, layer_norm_21
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_38, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_38
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(layer_norm_24, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_39 = paddle._C_ops.add(matmul_33, parameter_15)
+        del matmul_33
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_24, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_40 = paddle._C_ops.add(matmul_34, parameter_13)
+        del matmul_34
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_35 = paddle._C_ops.matmul(layer_norm_24, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_41 = paddle._C_ops.add(matmul_35, parameter_11)
+        del matmul_35
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(add_39, full_int_array_4)
+        del add_39
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_16 = paddle._C_ops.transpose(reshape_16, [0, 2, 1, 3])
+        del reshape_16
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(add_40, full_int_array_4)
+        del add_40
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_17 = paddle._C_ops.transpose(reshape_17, [0, 2, 1, 3])
+        del reshape_17
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(add_41, full_int_array_4)
+        del add_41
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_18 = paddle._C_ops.transpose(reshape_18, [0, 2, 1, 3])
+        del reshape_18
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_36 = paddle._C_ops.matmul(transpose_16, transpose_17, False, True)
+        del transpose_16, transpose_17
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(matmul_36, full_3, float("0"), True)
+        del matmul_36
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_42 = paddle._C_ops.add(scale_6, scale_1)
+        del scale_6
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_4 = paddle._C_ops.softmax(add_42, -1)
+        del add_42
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_37 = paddle._C_ops.matmul(dropout_18, transpose_18, False, False)
+        del dropout_18, transpose_18
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_19 = paddle._C_ops.transpose(matmul_37, [0, 2, 1, 3])
+        del matmul_37
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_19 = paddle._C_ops.reshape(transpose_19, full_int_array_5)
+        del transpose_19
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_38 = paddle._C_ops.matmul(reshape_19, parameter_10, False, False)
+        del reshape_19
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_43 = paddle._C_ops.add(matmul_38, parameter_9)
+        del matmul_38
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_43, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_43
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_44 = paddle._C_ops.add(layer_norm_24, dropout_20)
+        del dropout_20, layer_norm_24
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_44, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_44
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_39 = paddle._C_ops.matmul(layer_norm_27, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_45 = paddle._C_ops.add(matmul_39, parameter_5)
+        del matmul_39
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_4 = paddle._C_ops.gelu(add_45, False)
+        del add_45
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_40 = paddle._C_ops.matmul(gelu_4, parameter_4, False, False)
+        del gelu_4
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_46 = paddle._C_ops.add(matmul_40, parameter_3)
+        del matmul_40
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_47 = paddle._C_ops.add(add_46, layer_norm_27)
+        del add_46, layer_norm_27
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_47, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_47
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_41 = paddle._C_ops.matmul(layer_norm_30, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_48 = paddle._C_ops.add(matmul_41, parameter_15)
+        del matmul_41
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_30, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_49 = paddle._C_ops.add(matmul_42, parameter_13)
+        del matmul_42
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_30, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_50 = paddle._C_ops.add(matmul_43, parameter_11)
+        del matmul_43
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_20 = paddle._C_ops.reshape(add_48, full_int_array_4)
+        del add_48
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_20 = paddle._C_ops.transpose(reshape_20, [0, 2, 1, 3])
+        del reshape_20
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(add_49, full_int_array_4)
+        del add_49
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_21 = paddle._C_ops.transpose(reshape_21, [0, 2, 1, 3])
+        del reshape_21
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(add_50, full_int_array_4)
+        del add_50
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_22 = paddle._C_ops.transpose(reshape_22, [0, 2, 1, 3])
+        del reshape_22
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_44 = paddle._C_ops.matmul(transpose_20, transpose_21, False, True)
+        del transpose_20, transpose_21
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(matmul_44, full_3, float("0"), True)
+        del matmul_44
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_51 = paddle._C_ops.add(scale_7, scale_1)
+        del scale_7
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_5 = paddle._C_ops.softmax(add_51, -1)
+        del add_51
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_22, transpose_22, False, False)
+        del dropout_22, transpose_22
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_23 = paddle._C_ops.transpose(matmul_45, [0, 2, 1, 3])
+        del matmul_45
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_23 = paddle._C_ops.reshape(transpose_23, full_int_array_5)
+        del transpose_23
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_46 = paddle._C_ops.matmul(reshape_23, parameter_10, False, False)
+        del reshape_23
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_52 = paddle._C_ops.add(matmul_46, parameter_9)
+        del matmul_46
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_52, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_52
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_53 = paddle._C_ops.add(layer_norm_30, dropout_24)
+        del dropout_24, layer_norm_30
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_53, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_53
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_47 = paddle._C_ops.matmul(layer_norm_33, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_54 = paddle._C_ops.add(matmul_47, parameter_5)
+        del matmul_47
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_5 = paddle._C_ops.gelu(add_54, False)
+        del add_54
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_48 = paddle._C_ops.matmul(gelu_5, parameter_4, False, False)
+        del gelu_5
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_55 = paddle._C_ops.add(matmul_48, parameter_3)
+        del matmul_48
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_56 = paddle._C_ops.add(add_55, layer_norm_33)
+        del add_55, layer_norm_33
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_56, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_56
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_36, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_57 = paddle._C_ops.add(matmul_49, parameter_15)
+        del matmul_49
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_36, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_58 = paddle._C_ops.add(matmul_50, parameter_13)
+        del matmul_50
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_51 = paddle._C_ops.matmul(layer_norm_36, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_59 = paddle._C_ops.add(matmul_51, parameter_11)
+        del matmul_51
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(add_57, full_int_array_4)
+        del add_57
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_24 = paddle._C_ops.transpose(reshape_24, [0, 2, 1, 3])
+        del reshape_24
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(add_58, full_int_array_4)
+        del add_58
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_25 = paddle._C_ops.transpose(reshape_25, [0, 2, 1, 3])
+        del reshape_25
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(add_59, full_int_array_4)
+        del add_59
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_26 = paddle._C_ops.transpose(reshape_26, [0, 2, 1, 3])
+        del reshape_26
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_52 = paddle._C_ops.matmul(transpose_24, transpose_25, False, True)
+        del transpose_24, transpose_25
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(matmul_52, full_3, float("0"), True)
+        del matmul_52
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_60 = paddle._C_ops.add(scale_8, scale_1)
+        del scale_8
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_6 = paddle._C_ops.softmax(add_60, -1)
+        del add_60
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_26, transpose_26, False, False)
+        del dropout_26, transpose_26
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_27 = paddle._C_ops.transpose(matmul_53, [0, 2, 1, 3])
+        del matmul_53
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(transpose_27, full_int_array_5)
+        del transpose_27
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(reshape_27, parameter_10, False, False)
+        del reshape_27
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_61 = paddle._C_ops.add(matmul_54, parameter_9)
+        del matmul_54
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_61, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_61
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_62 = paddle._C_ops.add(layer_norm_36, dropout_28)
+        del dropout_28, layer_norm_36
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_62, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_62
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_39, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_63 = paddle._C_ops.add(matmul_55, parameter_5)
+        del matmul_55
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_6 = paddle._C_ops.gelu(add_63, False)
+        del add_63
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_56 = paddle._C_ops.matmul(gelu_6, parameter_4, False, False)
+        del gelu_6
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_64 = paddle._C_ops.add(matmul_56, parameter_3)
+        del matmul_56
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_65 = paddle._C_ops.add(add_64, layer_norm_39)
+        del add_64, layer_norm_39
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_65, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_65
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_57 = paddle._C_ops.matmul(layer_norm_42, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_66 = paddle._C_ops.add(matmul_57, parameter_15)
+        del matmul_57
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_42, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_67 = paddle._C_ops.add(matmul_58, parameter_13)
+        del matmul_58
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_59 = paddle._C_ops.matmul(layer_norm_42, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_68 = paddle._C_ops.add(matmul_59, parameter_11)
+        del matmul_59
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(add_66, full_int_array_4)
+        del add_66
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_28 = paddle._C_ops.transpose(reshape_28, [0, 2, 1, 3])
+        del reshape_28
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(add_67, full_int_array_4)
+        del add_67
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_29 = paddle._C_ops.transpose(reshape_29, [0, 2, 1, 3])
+        del reshape_29
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(add_68, full_int_array_4)
+        del add_68
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_30 = paddle._C_ops.transpose(reshape_30, [0, 2, 1, 3])
+        del reshape_30
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_60 = paddle._C_ops.matmul(transpose_28, transpose_29, False, True)
+        del transpose_28, transpose_29
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(matmul_60, full_3, float("0"), True)
+        del matmul_60
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_69 = paddle._C_ops.add(scale_9, scale_1)
+        del scale_9
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_7 = paddle._C_ops.softmax(add_69, -1)
+        del add_69
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_61 = paddle._C_ops.matmul(dropout_30, transpose_30, False, False)
+        del dropout_30, transpose_30
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_31 = paddle._C_ops.transpose(matmul_61, [0, 2, 1, 3])
+        del matmul_61
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_31 = paddle._C_ops.reshape(transpose_31, full_int_array_5)
+        del transpose_31
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(reshape_31, parameter_10, False, False)
+        del reshape_31
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_70 = paddle._C_ops.add(matmul_62, parameter_9)
+        del matmul_62
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_70, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_70
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_71 = paddle._C_ops.add(layer_norm_42, dropout_32)
+        del dropout_32, layer_norm_42
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_71, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_71
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_63 = paddle._C_ops.matmul(layer_norm_45, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_72 = paddle._C_ops.add(matmul_63, parameter_5)
+        del matmul_63
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_7 = paddle._C_ops.gelu(add_72, False)
+        del add_72
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_64 = paddle._C_ops.matmul(gelu_7, parameter_4, False, False)
+        del gelu_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_73 = paddle._C_ops.add(matmul_64, parameter_3)
+        del matmul_64
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_74 = paddle._C_ops.add(add_73, layer_norm_45)
+        del add_73, layer_norm_45
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_74, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_74
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_65 = paddle._C_ops.matmul(layer_norm_48, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_75 = paddle._C_ops.add(matmul_65, parameter_15)
+        del matmul_65
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_48, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_76 = paddle._C_ops.add(matmul_66, parameter_13)
+        del matmul_66
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_48, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_77 = paddle._C_ops.add(matmul_67, parameter_11)
+        del matmul_67
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(add_75, full_int_array_4)
+        del add_75
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_32 = paddle._C_ops.transpose(reshape_32, [0, 2, 1, 3])
+        del reshape_32
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(add_76, full_int_array_4)
+        del add_76
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_33 = paddle._C_ops.transpose(reshape_33, [0, 2, 1, 3])
+        del reshape_33
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_34 = paddle._C_ops.reshape(add_77, full_int_array_4)
+        del add_77
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_34 = paddle._C_ops.transpose(reshape_34, [0, 2, 1, 3])
+        del reshape_34
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_68 = paddle._C_ops.matmul(transpose_32, transpose_33, False, True)
+        del transpose_32, transpose_33
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(matmul_68, full_3, float("0"), True)
+        del matmul_68
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_78 = paddle._C_ops.add(scale_10, scale_1)
+        del scale_10
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_8 = paddle._C_ops.softmax(add_78, -1)
+        del add_78
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_69 = paddle._C_ops.matmul(dropout_34, transpose_34, False, False)
+        del dropout_34, transpose_34
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_35 = paddle._C_ops.transpose(matmul_69, [0, 2, 1, 3])
+        del matmul_69
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_35 = paddle._C_ops.reshape(transpose_35, full_int_array_5)
+        del transpose_35
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_70 = paddle._C_ops.matmul(reshape_35, parameter_10, False, False)
+        del reshape_35
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_79 = paddle._C_ops.add(matmul_70, parameter_9)
+        del matmul_70
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_79, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_79
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_80 = paddle._C_ops.add(layer_norm_48, dropout_36)
+        del dropout_36, layer_norm_48
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_80, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_80
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_71 = paddle._C_ops.matmul(layer_norm_51, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_81 = paddle._C_ops.add(matmul_71, parameter_5)
+        del matmul_71
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_8 = paddle._C_ops.gelu(add_81, False)
+        del add_81
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_72 = paddle._C_ops.matmul(gelu_8, parameter_4, False, False)
+        del gelu_8
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_82 = paddle._C_ops.add(matmul_72, parameter_3)
+        del matmul_72
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_83 = paddle._C_ops.add(add_82, layer_norm_51)
+        del add_82, layer_norm_51
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_83, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_83
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_73 = paddle._C_ops.matmul(layer_norm_54, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_84 = paddle._C_ops.add(matmul_73, parameter_15)
+        del matmul_73
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_54, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_85 = paddle._C_ops.add(matmul_74, parameter_13)
+        del matmul_74
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_75 = paddle._C_ops.matmul(layer_norm_54, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_86 = paddle._C_ops.add(matmul_75, parameter_11)
+        del matmul_75
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(add_84, full_int_array_4)
+        del add_84
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_36 = paddle._C_ops.transpose(reshape_36, [0, 2, 1, 3])
+        del reshape_36
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(add_85, full_int_array_4)
+        del add_85
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_37 = paddle._C_ops.transpose(reshape_37, [0, 2, 1, 3])
+        del reshape_37
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(add_86, full_int_array_4)
+        del add_86
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_38 = paddle._C_ops.transpose(reshape_38, [0, 2, 1, 3])
+        del reshape_38
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_76 = paddle._C_ops.matmul(transpose_36, transpose_37, False, True)
+        del transpose_36, transpose_37
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(matmul_76, full_3, float("0"), True)
+        del matmul_76
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_87 = paddle._C_ops.add(scale_11, scale_1)
+        del scale_11
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_9 = paddle._C_ops.softmax(add_87, -1)
+        del add_87
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_77 = paddle._C_ops.matmul(dropout_38, transpose_38, False, False)
+        del dropout_38, transpose_38
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_39 = paddle._C_ops.transpose(matmul_77, [0, 2, 1, 3])
+        del matmul_77
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_39 = paddle._C_ops.reshape(transpose_39, full_int_array_5)
+        del transpose_39
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_78 = paddle._C_ops.matmul(reshape_39, parameter_10, False, False)
+        del reshape_39
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_88 = paddle._C_ops.add(matmul_78, parameter_9)
+        del matmul_78
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_88, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_88
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_89 = paddle._C_ops.add(layer_norm_54, dropout_40)
+        del dropout_40, layer_norm_54
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_89, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_89
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_79 = paddle._C_ops.matmul(layer_norm_57, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_90 = paddle._C_ops.add(matmul_79, parameter_5)
+        del matmul_79
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_9 = paddle._C_ops.gelu(add_90, False)
+        del add_90
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_80 = paddle._C_ops.matmul(gelu_9, parameter_4, False, False)
+        del gelu_9
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_91 = paddle._C_ops.add(matmul_80, parameter_3)
+        del matmul_80
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_92 = paddle._C_ops.add(add_91, layer_norm_57)
+        del add_91, layer_norm_57
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_92, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_92
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_81 = paddle._C_ops.matmul(layer_norm_60, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_93 = paddle._C_ops.add(matmul_81, parameter_15)
+        del matmul_81
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_60, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_94 = paddle._C_ops.add(matmul_82, parameter_13)
+        del matmul_82
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_83 = paddle._C_ops.matmul(layer_norm_60, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_95 = paddle._C_ops.add(matmul_83, parameter_11)
+        del matmul_83
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(add_93, full_int_array_4)
+        del add_93
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_40 = paddle._C_ops.transpose(reshape_40, [0, 2, 1, 3])
+        del reshape_40
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_41 = paddle._C_ops.reshape(add_94, full_int_array_4)
+        del add_94
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_41 = paddle._C_ops.transpose(reshape_41, [0, 2, 1, 3])
+        del reshape_41
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(add_95, full_int_array_4)
+        del add_95
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_42 = paddle._C_ops.transpose(reshape_42, [0, 2, 1, 3])
+        del reshape_42
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_84 = paddle._C_ops.matmul(transpose_40, transpose_41, False, True)
+        del transpose_40, transpose_41
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(matmul_84, full_3, float("0"), True)
+        del matmul_84
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_96 = paddle._C_ops.add(scale_12, scale_1)
+        del scale_12
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_10 = paddle._C_ops.softmax(add_96, -1)
+        del add_96
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_85 = paddle._C_ops.matmul(dropout_42, transpose_42, False, False)
+        del dropout_42, transpose_42
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_43 = paddle._C_ops.transpose(matmul_85, [0, 2, 1, 3])
+        del matmul_85
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_43 = paddle._C_ops.reshape(transpose_43, full_int_array_5)
+        del transpose_43
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_86 = paddle._C_ops.matmul(reshape_43, parameter_10, False, False)
+        del reshape_43
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_97 = paddle._C_ops.add(matmul_86, parameter_9)
+        del matmul_86
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_97, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_97
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_98 = paddle._C_ops.add(layer_norm_60, dropout_44)
+        del dropout_44, layer_norm_60
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_98, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_98
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_87 = paddle._C_ops.matmul(layer_norm_63, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_99 = paddle._C_ops.add(matmul_87, parameter_5)
+        del matmul_87
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_10 = paddle._C_ops.gelu(add_99, False)
+        del add_99
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_88 = paddle._C_ops.matmul(gelu_10, parameter_4, False, False)
+        del gelu_10
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_100 = paddle._C_ops.add(matmul_88, parameter_3)
+        del matmul_88
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_101 = paddle._C_ops.add(add_100, layer_norm_63)
+        del add_100, layer_norm_63
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_101, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_101
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_89 = paddle._C_ops.matmul(layer_norm_66, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_102 = paddle._C_ops.add(matmul_89, parameter_15)
+        del matmul_89, parameter_15
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_66, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_103 = paddle._C_ops.add(matmul_90, parameter_13)
+        del matmul_90, parameter_13
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_66, parameter_12, False, False)
+        del parameter_12
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_104 = paddle._C_ops.add(matmul_91, parameter_11)
+        del matmul_91, parameter_11
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(add_102, full_int_array_4)
+        del add_102
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_44 = paddle._C_ops.transpose(reshape_44, [0, 2, 1, 3])
+        del reshape_44
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(add_103, full_int_array_4)
+        del add_103
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_45 = paddle._C_ops.transpose(reshape_45, [0, 2, 1, 3])
+        del reshape_45
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(add_104, full_int_array_4)
+        del add_104, full_int_array_4
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_46 = paddle._C_ops.transpose(reshape_46, [0, 2, 1, 3])
+        del reshape_46
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_92 = paddle._C_ops.matmul(transpose_44, transpose_45, False, True)
+        del transpose_44, transpose_45
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(matmul_92, full_3, float("0"), True)
+        del full_3, matmul_92
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_105 = paddle._C_ops.add(scale_13, scale_1)
+        del scale_1, scale_13
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_11 = paddle._C_ops.softmax(add_105, -1)
+        del add_105
+
+        # pd_op.dropout: (1x12x21x21xf32, 1x12x21x21xui8) <- (1x12x21x21xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_93 = paddle._C_ops.matmul(dropout_46, transpose_46, False, False)
+        del dropout_46, transpose_46
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_47 = paddle._C_ops.transpose(matmul_93, [0, 2, 1, 3])
+        del matmul_93
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_47 = paddle._C_ops.reshape(transpose_47, full_int_array_5)
+        del full_int_array_5, transpose_47
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_94 = paddle._C_ops.matmul(reshape_47, parameter_10, False, False)
+        del parameter_10, reshape_47
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_106 = paddle._C_ops.add(matmul_94, parameter_9)
+        del matmul_94, parameter_9
+
+        # pd_op.dropout: (1x21x768xf32, 1x21x768xui8) <- (1x21x768xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                add_106, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del add_106, full_2
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_107 = paddle._C_ops.add(layer_norm_66, dropout_48)
+        del dropout_48, layer_norm_66
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_107, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_107, parameter_7, parameter_8
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_95 = paddle._C_ops.matmul(layer_norm_69, parameter_6, False, False)
+        del parameter_6
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_108 = paddle._C_ops.add(matmul_95, parameter_5)
+        del matmul_95, parameter_5
+
+        # pd_op.gelu: (1x21x3072xf32) <- (1x21x3072xf32)
+        gelu_11 = paddle._C_ops.gelu(add_108, False)
+        del add_108
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_96 = paddle._C_ops.matmul(gelu_11, parameter_4, False, False)
+        del gelu_11, parameter_4
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_109 = paddle._C_ops.add(matmul_96, parameter_3)
+        del matmul_96, parameter_3
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_110 = paddle._C_ops.add(add_109, layer_norm_69)
+        del add_109, layer_norm_69
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_72, layer_norm_73, layer_norm_74 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_110, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_110, parameter_17, parameter_18
+
+        # pd_op.slice: (1x768xf32) <- (1x21x768xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            layer_norm_72, [1], full_int_array_2, full_int_array_0, [1], [1]
+        )
+        del full_int_array_0, full_int_array_2
+
+        # pd_op.matmul: (1x768xf32) <- (1x768xf32, 768x768xf32)
+        matmul_97 = paddle._C_ops.matmul(slice_1, parameter_2, False, False)
+        del parameter_2, slice_1
+
+        # pd_op.add: (1x768xf32) <- (1x768xf32, 768xf32)
+        add_111 = paddle._C_ops.add(matmul_97, parameter_1)
+        del matmul_97, parameter_1
+
+        # pd_op.tanh: (1x768xf32) <- (1x768xf32)
+        tanh_0 = paddle._C_ops.tanh(add_111)
+        del add_111, layer_norm_72
+
+        return tanh_0
diff --git a/paddle_samples/PaddleNLP/albert-base-v1/weight_meta.py b/paddle_samples/PaddleNLP/albert-base-v1/weight_meta.py
new file mode 100644
index 000000000..f9edf4996
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v1/weight_meta.py
@@ -0,0 +1,235 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [1, 512]
+    dtype = "int64"
+    min_val = 0
+    max_val = 511
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0950393")
+    max_val = float("0.0949818")
+    mean = float("2.1163e-05")
+    std = float("0.0199833")
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [3072, 768]
+    dtype = "float32"
+    min_val = float("-0.0978883")
+    max_val = float("0.0982025")
+    mean = float("-9.18199e-06")
+    std = float("0.0199946")
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [3072]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [768, 3072]
+    dtype = "float32"
+    min_val = float("-0.101063")
+    max_val = float("0.100538")
+    mean = float("1.45909e-05")
+    std = float("0.0200083")
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0869559")
+    max_val = float("0.10256")
+    mean = float("-1.37491e-05")
+    std = float("0.0200072")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.095139")
+    max_val = float("0.0959922")
+    mean = float("3.0233e-05")
+    std = float("0.0199653")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0921497")
+    max_val = float("0.0860873")
+    mean = float("3.58198e-05")
+    std = float("0.019985")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [768, 768]
+    dtype = "float32"
+    min_val = float("-0.0895908")
+    max_val = float("0.08705")
+    mean = float("7.12779e-06")
+    std = float("0.0199925")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [768]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [768]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [128, 768]
+    dtype = "float32"
+    min_val = float("-0.0824699")
+    max_val = float("0.0883701")
+    mean = float("5.90966e-05")
+    std = float("0.0199163")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [128]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [128]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [2, 128]
+    dtype = "float32"
+    min_val = float("-0.0438373")
+    max_val = float("0.0567006")
+    mean = float("0.00175291")
+    std = float("0.0182297")
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [512, 128]
+    dtype = "float32"
+    min_val = float("-0.0952002")
+    max_val = float("0.0822103")
+    mean = float("0.000103211")
+    std = float("0.0200516")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [30000, 128]
+    dtype = "float32"
+    min_val = float("-0.0970852")
+    max_val = float("0.110504")
+    mean = float("5.30125e-06")
+    std = float("0.0200003")
+    data = None
diff --git a/paddle_samples/PaddleNLP/albert-base-v2/graph_net.json b/paddle_samples/PaddleNLP/albert-base-v2/graph_net.json
new file mode 100644
index 000000000..ae04e4634
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v2/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "albert-base-v2",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/albert-base-v2/input_meta.py b/paddle_samples/PaddleNLP/albert-base-v2/input_meta.py
new file mode 100644
index 000000000..b45834638
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v2/input_meta.py
@@ -0,0 +1,41 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 21]
+    dtype = "int64"
+    data = [
+        2,
+        10975,
+        15,
+        51,
+        204,
+        25,
+        1909,
+        9,
+        31,
+        589,
+        2477,
+        88,
+        370,
+        816,
+        2761,
+        17,
+        66,
+        2607,
+        18,
+        9,
+        3,
+    ]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 21]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 21]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
diff --git a/paddle_samples/PaddleNLP/albert-base-v2/model.py b/paddle_samples/PaddleNLP/albert-base-v2/model.py
new file mode 100644
index 000000000..ec2624730
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v2/model.py
@@ -0,0 +1,2003 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [1]
+
+        # pd_op.unsqueeze: (1x1x21xi64) <- (1x21xi64, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(data_1, full_int_array_0)
+        del data_1
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [2]
+
+        # pd_op.unsqueeze: (1x1x1x21xi64) <- (1x1x21xi64, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.cast: (1x1x1x21xf32) <- (1x1x1x21xi64)
+        cast_0 = paddle._C_ops.cast(unsqueeze_1, paddle.float32)
+        del unsqueeze_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x21xf32) <- (1x1x1x21xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0, full_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x21xf32) <- (1x1x1x21xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(scale_0, full_1, float("0"), True)
+        del full_1, scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_2 = [0]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [21]
+
+        # pd_op.slice: (1x21xi64) <- (1x512xi64, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            parameter_0, [1], full_int_array_2, full_int_array_3, [1], []
+        )
+        del full_int_array_3, parameter_0
+
+        # pd_op.embedding: (1x21x128xf32) <- (1x21xi64, 30000x128xf32)
+        embedding_0 = paddle._C_ops.embedding(data_0, parameter_25, 0, False)
+        del data_0, parameter_25
+
+        # pd_op.embedding: (1x21x128xf32) <- (1x21xi64, 2x128xf32)
+        embedding_1 = paddle._C_ops.embedding(data_2, parameter_23, -1, False)
+        del data_2, parameter_23
+
+        # pd_op.add: (1x21x128xf32) <- (1x21x128xf32, 1x21x128xf32)
+        add_0 = paddle._C_ops.add(embedding_0, embedding_1)
+        del embedding_0, embedding_1
+
+        # pd_op.embedding: (1x21x128xf32) <- (1x21xi64, 512x128xf32)
+        embedding_2 = paddle._C_ops.embedding(slice_0, parameter_24, -1, False)
+        del parameter_24, slice_0
+
+        # pd_op.add: (1x21x128xf32) <- (1x21x128xf32, 1x21x128xf32)
+        add_1 = paddle._C_ops.add(add_0, embedding_2)
+        del add_0, embedding_2
+
+        # pd_op.layer_norm: (1x21x128xf32, 1x21xf32, 1x21xf32) <- (1x21x128xf32, 128xf32, 128xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_1, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_1, parameter_21, parameter_22
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x128xf32, 128x768xf32)
+        matmul_0 = paddle._C_ops.matmul(layer_norm_0, parameter_20, False, False)
+        del layer_norm_0, parameter_20
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_2 = paddle._C_ops.add(matmul_0, parameter_19)
+        del matmul_0, parameter_19
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(add_2, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_3 = paddle._C_ops.add(matmul_1, parameter_15)
+        del matmul_1
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(add_2, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_4 = paddle._C_ops.add(matmul_2, parameter_13)
+        del matmul_2
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_3 = paddle._C_ops.matmul(add_2, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_5 = paddle._C_ops.add(matmul_3, parameter_11)
+        del matmul_3
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_4 = [1, 21, 12, 64]
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(add_3, full_int_array_4)
+        del add_3
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_0, [0, 2, 1, 3])
+        del reshape_0
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(add_4, full_int_array_4)
+        del add_4
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(add_5, full_int_array_4)
+        del add_5
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_4 = paddle._C_ops.matmul(transpose_0, transpose_1, False, True)
+        del transpose_0, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(matmul_4, full_2, float("0"), True)
+        del matmul_4
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_6 = paddle._C_ops.add(scale_2, scale_1)
+        del scale_2
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_0 = paddle._C_ops.softmax(add_6, -1)
+        del add_6
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_5 = paddle._C_ops.matmul(softmax_0, transpose_2, False, False)
+        del softmax_0, transpose_2
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_3 = paddle._C_ops.transpose(matmul_5, [0, 2, 1, 3])
+        del matmul_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_5 = [0, 0, -1]
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_3 = paddle._C_ops.reshape(transpose_3, full_int_array_5)
+        del transpose_3
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_6 = paddle._C_ops.matmul(reshape_3, parameter_10, False, False)
+        del reshape_3
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_7 = paddle._C_ops.add(matmul_6, parameter_9)
+        del matmul_6
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_8 = paddle._C_ops.add(add_2, add_7)
+        del add_2, add_7
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_8, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_8
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_9 = paddle._C_ops.add(matmul_7, parameter_5)
+        del matmul_7
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("0.5"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(add_9, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_0 = paddle._C_ops.pow(add_9, float("3"))
+
+        # pd_op.full: (1xf32) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("0.044715"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(pow_0, full_4, float("0"), True)
+        del pow_0
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_10 = paddle._C_ops.add(add_9, scale_4)
+        del add_9, scale_4
+
+        # pd_op.full: (1xf32) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("0.797885"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(add_10, full_5, float("0"), True)
+        del add_10
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_1 = paddle._C_ops.tanh(scale_5)
+        del scale_5
+
+        # pd_op.full: (1xf32) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(tanh_1, full_6, float("1"), True)
+        del tanh_1
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_0 = paddle._C_ops.multiply(scale_3, scale_6)
+        del scale_3, scale_6
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_8 = paddle._C_ops.matmul(multiply_0, parameter_4, False, False)
+        del multiply_0
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_11 = paddle._C_ops.add(matmul_8, parameter_3)
+        del matmul_8
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_12 = paddle._C_ops.add(add_11, layer_norm_3)
+        del add_11, layer_norm_3
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_12, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_12
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(layer_norm_6, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_13 = paddle._C_ops.add(matmul_9, parameter_15)
+        del matmul_9
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_14 = paddle._C_ops.add(matmul_10, parameter_13)
+        del matmul_10
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_11 = paddle._C_ops.matmul(layer_norm_6, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_15 = paddle._C_ops.add(matmul_11, parameter_11)
+        del matmul_11
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(add_13, full_int_array_4)
+        del add_13
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_4 = paddle._C_ops.transpose(reshape_4, [0, 2, 1, 3])
+        del reshape_4
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(add_14, full_int_array_4)
+        del add_14
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(add_15, full_int_array_4)
+        del add_15
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_12 = paddle._C_ops.matmul(transpose_4, transpose_5, False, True)
+        del transpose_4, transpose_5
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(matmul_12, full_2, float("0"), True)
+        del matmul_12
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_16 = paddle._C_ops.add(scale_7, scale_1)
+        del scale_7
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_1 = paddle._C_ops.softmax(add_16, -1)
+        del add_16
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_13 = paddle._C_ops.matmul(softmax_1, transpose_6, False, False)
+        del softmax_1, transpose_6
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_7 = paddle._C_ops.transpose(matmul_13, [0, 2, 1, 3])
+        del matmul_13
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_7 = paddle._C_ops.reshape(transpose_7, full_int_array_5)
+        del transpose_7
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_14 = paddle._C_ops.matmul(reshape_7, parameter_10, False, False)
+        del reshape_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_17 = paddle._C_ops.add(matmul_14, parameter_9)
+        del matmul_14
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_18 = paddle._C_ops.add(layer_norm_6, add_17)
+        del add_17, layer_norm_6
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_18, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_18
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_15 = paddle._C_ops.matmul(layer_norm_9, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_19 = paddle._C_ops.add(matmul_15, parameter_5)
+        del matmul_15
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(add_19, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_1 = paddle._C_ops.pow(add_19, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(pow_1, full_4, float("0"), True)
+        del pow_1
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_20 = paddle._C_ops.add(add_19, scale_9)
+        del add_19, scale_9
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(add_20, full_5, float("0"), True)
+        del add_20
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_2 = paddle._C_ops.tanh(scale_10)
+        del scale_10
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(tanh_2, full_6, float("1"), True)
+        del tanh_2
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_1 = paddle._C_ops.multiply(scale_8, scale_11)
+        del scale_11, scale_8
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_16 = paddle._C_ops.matmul(multiply_1, parameter_4, False, False)
+        del multiply_1
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_21 = paddle._C_ops.add(matmul_16, parameter_3)
+        del matmul_16
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_22 = paddle._C_ops.add(add_21, layer_norm_9)
+        del add_21, layer_norm_9
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_22, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_22
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_17 = paddle._C_ops.matmul(layer_norm_12, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_23 = paddle._C_ops.add(matmul_17, parameter_15)
+        del matmul_17
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_12, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_24 = paddle._C_ops.add(matmul_18, parameter_13)
+        del matmul_18
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_12, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_25 = paddle._C_ops.add(matmul_19, parameter_11)
+        del matmul_19
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(add_23, full_int_array_4)
+        del add_23
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_8 = paddle._C_ops.transpose(reshape_8, [0, 2, 1, 3])
+        del reshape_8
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(add_24, full_int_array_4)
+        del add_24
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(add_25, full_int_array_4)
+        del add_25
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_20 = paddle._C_ops.matmul(transpose_8, transpose_9, False, True)
+        del transpose_8, transpose_9
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(matmul_20, full_2, float("0"), True)
+        del matmul_20
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_26 = paddle._C_ops.add(scale_12, scale_1)
+        del scale_12
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_2 = paddle._C_ops.softmax(add_26, -1)
+        del add_26
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_21 = paddle._C_ops.matmul(softmax_2, transpose_10, False, False)
+        del softmax_2, transpose_10
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_11 = paddle._C_ops.transpose(matmul_21, [0, 2, 1, 3])
+        del matmul_21
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_11 = paddle._C_ops.reshape(transpose_11, full_int_array_5)
+        del transpose_11
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_22 = paddle._C_ops.matmul(reshape_11, parameter_10, False, False)
+        del reshape_11
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_27 = paddle._C_ops.add(matmul_22, parameter_9)
+        del matmul_22
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_28 = paddle._C_ops.add(layer_norm_12, add_27)
+        del add_27, layer_norm_12
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_28, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_28
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_23 = paddle._C_ops.matmul(layer_norm_15, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_29 = paddle._C_ops.add(matmul_23, parameter_5)
+        del matmul_23
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(add_29, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_2 = paddle._C_ops.pow(add_29, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(pow_2, full_4, float("0"), True)
+        del pow_2
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_30 = paddle._C_ops.add(add_29, scale_14)
+        del add_29, scale_14
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(add_30, full_5, float("0"), True)
+        del add_30
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_3 = paddle._C_ops.tanh(scale_15)
+        del scale_15
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_16 = paddle._C_ops.scale(tanh_3, full_6, float("1"), True)
+        del tanh_3
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_2 = paddle._C_ops.multiply(scale_13, scale_16)
+        del scale_13, scale_16
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_24 = paddle._C_ops.matmul(multiply_2, parameter_4, False, False)
+        del multiply_2
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_31 = paddle._C_ops.add(matmul_24, parameter_3)
+        del matmul_24
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_32 = paddle._C_ops.add(add_31, layer_norm_15)
+        del add_31, layer_norm_15
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_32, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_32
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_18, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_33 = paddle._C_ops.add(matmul_25, parameter_15)
+        del matmul_25
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_18, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_34 = paddle._C_ops.add(matmul_26, parameter_13)
+        del matmul_26
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_27 = paddle._C_ops.matmul(layer_norm_18, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_35 = paddle._C_ops.add(matmul_27, parameter_11)
+        del matmul_27
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(add_33, full_int_array_4)
+        del add_33
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_12 = paddle._C_ops.transpose(reshape_12, [0, 2, 1, 3])
+        del reshape_12
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(add_34, full_int_array_4)
+        del add_34
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(add_35, full_int_array_4)
+        del add_35
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_28 = paddle._C_ops.matmul(transpose_12, transpose_13, False, True)
+        del transpose_12, transpose_13
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_17 = paddle._C_ops.scale(matmul_28, full_2, float("0"), True)
+        del matmul_28
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_36 = paddle._C_ops.add(scale_17, scale_1)
+        del scale_17
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_3 = paddle._C_ops.softmax(add_36, -1)
+        del add_36
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_29 = paddle._C_ops.matmul(softmax_3, transpose_14, False, False)
+        del softmax_3, transpose_14
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_15 = paddle._C_ops.transpose(matmul_29, [0, 2, 1, 3])
+        del matmul_29
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_15 = paddle._C_ops.reshape(transpose_15, full_int_array_5)
+        del transpose_15
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_30 = paddle._C_ops.matmul(reshape_15, parameter_10, False, False)
+        del reshape_15
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_37 = paddle._C_ops.add(matmul_30, parameter_9)
+        del matmul_30
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_38 = paddle._C_ops.add(layer_norm_18, add_37)
+        del add_37, layer_norm_18
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_38, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_38
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_21, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_39 = paddle._C_ops.add(matmul_31, parameter_5)
+        del matmul_31
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_18 = paddle._C_ops.scale(add_39, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_3 = paddle._C_ops.pow(add_39, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_19 = paddle._C_ops.scale(pow_3, full_4, float("0"), True)
+        del pow_3
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_40 = paddle._C_ops.add(add_39, scale_19)
+        del add_39, scale_19
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_20 = paddle._C_ops.scale(add_40, full_5, float("0"), True)
+        del add_40
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_4 = paddle._C_ops.tanh(scale_20)
+        del scale_20
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_21 = paddle._C_ops.scale(tanh_4, full_6, float("1"), True)
+        del tanh_4
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_3 = paddle._C_ops.multiply(scale_18, scale_21)
+        del scale_18, scale_21
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_32 = paddle._C_ops.matmul(multiply_3, parameter_4, False, False)
+        del multiply_3
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_41 = paddle._C_ops.add(matmul_32, parameter_3)
+        del matmul_32
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_42 = paddle._C_ops.add(add_41, layer_norm_21)
+        del add_41, layer_norm_21
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_42, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_42
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(layer_norm_24, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_43 = paddle._C_ops.add(matmul_33, parameter_15)
+        del matmul_33
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_24, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_44 = paddle._C_ops.add(matmul_34, parameter_13)
+        del matmul_34
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_35 = paddle._C_ops.matmul(layer_norm_24, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_45 = paddle._C_ops.add(matmul_35, parameter_11)
+        del matmul_35
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(add_43, full_int_array_4)
+        del add_43
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_16 = paddle._C_ops.transpose(reshape_16, [0, 2, 1, 3])
+        del reshape_16
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(add_44, full_int_array_4)
+        del add_44
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_17 = paddle._C_ops.transpose(reshape_17, [0, 2, 1, 3])
+        del reshape_17
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(add_45, full_int_array_4)
+        del add_45
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_18 = paddle._C_ops.transpose(reshape_18, [0, 2, 1, 3])
+        del reshape_18
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_36 = paddle._C_ops.matmul(transpose_16, transpose_17, False, True)
+        del transpose_16, transpose_17
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_22 = paddle._C_ops.scale(matmul_36, full_2, float("0"), True)
+        del matmul_36
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_46 = paddle._C_ops.add(scale_22, scale_1)
+        del scale_22
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_4 = paddle._C_ops.softmax(add_46, -1)
+        del add_46
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_37 = paddle._C_ops.matmul(softmax_4, transpose_18, False, False)
+        del softmax_4, transpose_18
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_19 = paddle._C_ops.transpose(matmul_37, [0, 2, 1, 3])
+        del matmul_37
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_19 = paddle._C_ops.reshape(transpose_19, full_int_array_5)
+        del transpose_19
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_38 = paddle._C_ops.matmul(reshape_19, parameter_10, False, False)
+        del reshape_19
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_47 = paddle._C_ops.add(matmul_38, parameter_9)
+        del matmul_38
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_48 = paddle._C_ops.add(layer_norm_24, add_47)
+        del add_47, layer_norm_24
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_48, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_48
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_39 = paddle._C_ops.matmul(layer_norm_27, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_49 = paddle._C_ops.add(matmul_39, parameter_5)
+        del matmul_39
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_23 = paddle._C_ops.scale(add_49, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_4 = paddle._C_ops.pow(add_49, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_24 = paddle._C_ops.scale(pow_4, full_4, float("0"), True)
+        del pow_4
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_50 = paddle._C_ops.add(add_49, scale_24)
+        del add_49, scale_24
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_25 = paddle._C_ops.scale(add_50, full_5, float("0"), True)
+        del add_50
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_5 = paddle._C_ops.tanh(scale_25)
+        del scale_25
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_26 = paddle._C_ops.scale(tanh_5, full_6, float("1"), True)
+        del tanh_5
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_4 = paddle._C_ops.multiply(scale_23, scale_26)
+        del scale_23, scale_26
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_40 = paddle._C_ops.matmul(multiply_4, parameter_4, False, False)
+        del multiply_4
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_51 = paddle._C_ops.add(matmul_40, parameter_3)
+        del matmul_40
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_52 = paddle._C_ops.add(add_51, layer_norm_27)
+        del add_51, layer_norm_27
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_52, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_52
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_41 = paddle._C_ops.matmul(layer_norm_30, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_53 = paddle._C_ops.add(matmul_41, parameter_15)
+        del matmul_41
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_30, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_54 = paddle._C_ops.add(matmul_42, parameter_13)
+        del matmul_42
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_30, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_55 = paddle._C_ops.add(matmul_43, parameter_11)
+        del matmul_43
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_20 = paddle._C_ops.reshape(add_53, full_int_array_4)
+        del add_53
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_20 = paddle._C_ops.transpose(reshape_20, [0, 2, 1, 3])
+        del reshape_20
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(add_54, full_int_array_4)
+        del add_54
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_21 = paddle._C_ops.transpose(reshape_21, [0, 2, 1, 3])
+        del reshape_21
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(add_55, full_int_array_4)
+        del add_55
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_22 = paddle._C_ops.transpose(reshape_22, [0, 2, 1, 3])
+        del reshape_22
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_44 = paddle._C_ops.matmul(transpose_20, transpose_21, False, True)
+        del transpose_20, transpose_21
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_27 = paddle._C_ops.scale(matmul_44, full_2, float("0"), True)
+        del matmul_44
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_56 = paddle._C_ops.add(scale_27, scale_1)
+        del scale_27
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_5 = paddle._C_ops.softmax(add_56, -1)
+        del add_56
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_45 = paddle._C_ops.matmul(softmax_5, transpose_22, False, False)
+        del softmax_5, transpose_22
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_23 = paddle._C_ops.transpose(matmul_45, [0, 2, 1, 3])
+        del matmul_45
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_23 = paddle._C_ops.reshape(transpose_23, full_int_array_5)
+        del transpose_23
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_46 = paddle._C_ops.matmul(reshape_23, parameter_10, False, False)
+        del reshape_23
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_57 = paddle._C_ops.add(matmul_46, parameter_9)
+        del matmul_46
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_58 = paddle._C_ops.add(layer_norm_30, add_57)
+        del add_57, layer_norm_30
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_58, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_58
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_47 = paddle._C_ops.matmul(layer_norm_33, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_59 = paddle._C_ops.add(matmul_47, parameter_5)
+        del matmul_47
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_28 = paddle._C_ops.scale(add_59, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_5 = paddle._C_ops.pow(add_59, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_29 = paddle._C_ops.scale(pow_5, full_4, float("0"), True)
+        del pow_5
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_60 = paddle._C_ops.add(add_59, scale_29)
+        del add_59, scale_29
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_30 = paddle._C_ops.scale(add_60, full_5, float("0"), True)
+        del add_60
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_6 = paddle._C_ops.tanh(scale_30)
+        del scale_30
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_31 = paddle._C_ops.scale(tanh_6, full_6, float("1"), True)
+        del tanh_6
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_5 = paddle._C_ops.multiply(scale_28, scale_31)
+        del scale_28, scale_31
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_48 = paddle._C_ops.matmul(multiply_5, parameter_4, False, False)
+        del multiply_5
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_61 = paddle._C_ops.add(matmul_48, parameter_3)
+        del matmul_48
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_62 = paddle._C_ops.add(add_61, layer_norm_33)
+        del add_61, layer_norm_33
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_62, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_62
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_36, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_63 = paddle._C_ops.add(matmul_49, parameter_15)
+        del matmul_49
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_36, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_64 = paddle._C_ops.add(matmul_50, parameter_13)
+        del matmul_50
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_51 = paddle._C_ops.matmul(layer_norm_36, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_65 = paddle._C_ops.add(matmul_51, parameter_11)
+        del matmul_51
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(add_63, full_int_array_4)
+        del add_63
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_24 = paddle._C_ops.transpose(reshape_24, [0, 2, 1, 3])
+        del reshape_24
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(add_64, full_int_array_4)
+        del add_64
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_25 = paddle._C_ops.transpose(reshape_25, [0, 2, 1, 3])
+        del reshape_25
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(add_65, full_int_array_4)
+        del add_65
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_26 = paddle._C_ops.transpose(reshape_26, [0, 2, 1, 3])
+        del reshape_26
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_52 = paddle._C_ops.matmul(transpose_24, transpose_25, False, True)
+        del transpose_24, transpose_25
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_32 = paddle._C_ops.scale(matmul_52, full_2, float("0"), True)
+        del matmul_52
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_66 = paddle._C_ops.add(scale_32, scale_1)
+        del scale_32
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_6 = paddle._C_ops.softmax(add_66, -1)
+        del add_66
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_53 = paddle._C_ops.matmul(softmax_6, transpose_26, False, False)
+        del softmax_6, transpose_26
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_27 = paddle._C_ops.transpose(matmul_53, [0, 2, 1, 3])
+        del matmul_53
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(transpose_27, full_int_array_5)
+        del transpose_27
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(reshape_27, parameter_10, False, False)
+        del reshape_27
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_67 = paddle._C_ops.add(matmul_54, parameter_9)
+        del matmul_54
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_68 = paddle._C_ops.add(layer_norm_36, add_67)
+        del add_67, layer_norm_36
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_68, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_68
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_39, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_69 = paddle._C_ops.add(matmul_55, parameter_5)
+        del matmul_55
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_33 = paddle._C_ops.scale(add_69, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_6 = paddle._C_ops.pow(add_69, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_34 = paddle._C_ops.scale(pow_6, full_4, float("0"), True)
+        del pow_6
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_70 = paddle._C_ops.add(add_69, scale_34)
+        del add_69, scale_34
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_35 = paddle._C_ops.scale(add_70, full_5, float("0"), True)
+        del add_70
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_7 = paddle._C_ops.tanh(scale_35)
+        del scale_35
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_36 = paddle._C_ops.scale(tanh_7, full_6, float("1"), True)
+        del tanh_7
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_6 = paddle._C_ops.multiply(scale_33, scale_36)
+        del scale_33, scale_36
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_56 = paddle._C_ops.matmul(multiply_6, parameter_4, False, False)
+        del multiply_6
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_71 = paddle._C_ops.add(matmul_56, parameter_3)
+        del matmul_56
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_72 = paddle._C_ops.add(add_71, layer_norm_39)
+        del add_71, layer_norm_39
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_72, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_72
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_57 = paddle._C_ops.matmul(layer_norm_42, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_73 = paddle._C_ops.add(matmul_57, parameter_15)
+        del matmul_57
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_42, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_74 = paddle._C_ops.add(matmul_58, parameter_13)
+        del matmul_58
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_59 = paddle._C_ops.matmul(layer_norm_42, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_75 = paddle._C_ops.add(matmul_59, parameter_11)
+        del matmul_59
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(add_73, full_int_array_4)
+        del add_73
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_28 = paddle._C_ops.transpose(reshape_28, [0, 2, 1, 3])
+        del reshape_28
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(add_74, full_int_array_4)
+        del add_74
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_29 = paddle._C_ops.transpose(reshape_29, [0, 2, 1, 3])
+        del reshape_29
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(add_75, full_int_array_4)
+        del add_75
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_30 = paddle._C_ops.transpose(reshape_30, [0, 2, 1, 3])
+        del reshape_30
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_60 = paddle._C_ops.matmul(transpose_28, transpose_29, False, True)
+        del transpose_28, transpose_29
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_37 = paddle._C_ops.scale(matmul_60, full_2, float("0"), True)
+        del matmul_60
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_76 = paddle._C_ops.add(scale_37, scale_1)
+        del scale_37
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_7 = paddle._C_ops.softmax(add_76, -1)
+        del add_76
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_61 = paddle._C_ops.matmul(softmax_7, transpose_30, False, False)
+        del softmax_7, transpose_30
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_31 = paddle._C_ops.transpose(matmul_61, [0, 2, 1, 3])
+        del matmul_61
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_31 = paddle._C_ops.reshape(transpose_31, full_int_array_5)
+        del transpose_31
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(reshape_31, parameter_10, False, False)
+        del reshape_31
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_77 = paddle._C_ops.add(matmul_62, parameter_9)
+        del matmul_62
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_78 = paddle._C_ops.add(layer_norm_42, add_77)
+        del add_77, layer_norm_42
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_78, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_78
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_63 = paddle._C_ops.matmul(layer_norm_45, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_79 = paddle._C_ops.add(matmul_63, parameter_5)
+        del matmul_63
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_38 = paddle._C_ops.scale(add_79, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_7 = paddle._C_ops.pow(add_79, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_39 = paddle._C_ops.scale(pow_7, full_4, float("0"), True)
+        del pow_7
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_80 = paddle._C_ops.add(add_79, scale_39)
+        del add_79, scale_39
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_40 = paddle._C_ops.scale(add_80, full_5, float("0"), True)
+        del add_80
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_8 = paddle._C_ops.tanh(scale_40)
+        del scale_40
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_41 = paddle._C_ops.scale(tanh_8, full_6, float("1"), True)
+        del tanh_8
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_7 = paddle._C_ops.multiply(scale_38, scale_41)
+        del scale_38, scale_41
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_64 = paddle._C_ops.matmul(multiply_7, parameter_4, False, False)
+        del multiply_7
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_81 = paddle._C_ops.add(matmul_64, parameter_3)
+        del matmul_64
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_82 = paddle._C_ops.add(add_81, layer_norm_45)
+        del add_81, layer_norm_45
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_82, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_82
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_65 = paddle._C_ops.matmul(layer_norm_48, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_83 = paddle._C_ops.add(matmul_65, parameter_15)
+        del matmul_65
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_48, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_84 = paddle._C_ops.add(matmul_66, parameter_13)
+        del matmul_66
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_48, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_85 = paddle._C_ops.add(matmul_67, parameter_11)
+        del matmul_67
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(add_83, full_int_array_4)
+        del add_83
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_32 = paddle._C_ops.transpose(reshape_32, [0, 2, 1, 3])
+        del reshape_32
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(add_84, full_int_array_4)
+        del add_84
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_33 = paddle._C_ops.transpose(reshape_33, [0, 2, 1, 3])
+        del reshape_33
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_34 = paddle._C_ops.reshape(add_85, full_int_array_4)
+        del add_85
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_34 = paddle._C_ops.transpose(reshape_34, [0, 2, 1, 3])
+        del reshape_34
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_68 = paddle._C_ops.matmul(transpose_32, transpose_33, False, True)
+        del transpose_32, transpose_33
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_42 = paddle._C_ops.scale(matmul_68, full_2, float("0"), True)
+        del matmul_68
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_86 = paddle._C_ops.add(scale_42, scale_1)
+        del scale_42
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_8 = paddle._C_ops.softmax(add_86, -1)
+        del add_86
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_69 = paddle._C_ops.matmul(softmax_8, transpose_34, False, False)
+        del softmax_8, transpose_34
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_35 = paddle._C_ops.transpose(matmul_69, [0, 2, 1, 3])
+        del matmul_69
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_35 = paddle._C_ops.reshape(transpose_35, full_int_array_5)
+        del transpose_35
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_70 = paddle._C_ops.matmul(reshape_35, parameter_10, False, False)
+        del reshape_35
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_87 = paddle._C_ops.add(matmul_70, parameter_9)
+        del matmul_70
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_88 = paddle._C_ops.add(layer_norm_48, add_87)
+        del add_87, layer_norm_48
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_88, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_88
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_71 = paddle._C_ops.matmul(layer_norm_51, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_89 = paddle._C_ops.add(matmul_71, parameter_5)
+        del matmul_71
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_43 = paddle._C_ops.scale(add_89, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_8 = paddle._C_ops.pow(add_89, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_44 = paddle._C_ops.scale(pow_8, full_4, float("0"), True)
+        del pow_8
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_90 = paddle._C_ops.add(add_89, scale_44)
+        del add_89, scale_44
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_45 = paddle._C_ops.scale(add_90, full_5, float("0"), True)
+        del add_90
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_9 = paddle._C_ops.tanh(scale_45)
+        del scale_45
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_46 = paddle._C_ops.scale(tanh_9, full_6, float("1"), True)
+        del tanh_9
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_8 = paddle._C_ops.multiply(scale_43, scale_46)
+        del scale_43, scale_46
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_72 = paddle._C_ops.matmul(multiply_8, parameter_4, False, False)
+        del multiply_8
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_91 = paddle._C_ops.add(matmul_72, parameter_3)
+        del matmul_72
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_92 = paddle._C_ops.add(add_91, layer_norm_51)
+        del add_91, layer_norm_51
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_92, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_92
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_73 = paddle._C_ops.matmul(layer_norm_54, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_93 = paddle._C_ops.add(matmul_73, parameter_15)
+        del matmul_73
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_54, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_94 = paddle._C_ops.add(matmul_74, parameter_13)
+        del matmul_74
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_75 = paddle._C_ops.matmul(layer_norm_54, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_95 = paddle._C_ops.add(matmul_75, parameter_11)
+        del matmul_75
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(add_93, full_int_array_4)
+        del add_93
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_36 = paddle._C_ops.transpose(reshape_36, [0, 2, 1, 3])
+        del reshape_36
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(add_94, full_int_array_4)
+        del add_94
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_37 = paddle._C_ops.transpose(reshape_37, [0, 2, 1, 3])
+        del reshape_37
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(add_95, full_int_array_4)
+        del add_95
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_38 = paddle._C_ops.transpose(reshape_38, [0, 2, 1, 3])
+        del reshape_38
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_76 = paddle._C_ops.matmul(transpose_36, transpose_37, False, True)
+        del transpose_36, transpose_37
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_47 = paddle._C_ops.scale(matmul_76, full_2, float("0"), True)
+        del matmul_76
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_96 = paddle._C_ops.add(scale_47, scale_1)
+        del scale_47
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_9 = paddle._C_ops.softmax(add_96, -1)
+        del add_96
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_77 = paddle._C_ops.matmul(softmax_9, transpose_38, False, False)
+        del softmax_9, transpose_38
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_39 = paddle._C_ops.transpose(matmul_77, [0, 2, 1, 3])
+        del matmul_77
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_39 = paddle._C_ops.reshape(transpose_39, full_int_array_5)
+        del transpose_39
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_78 = paddle._C_ops.matmul(reshape_39, parameter_10, False, False)
+        del reshape_39
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_97 = paddle._C_ops.add(matmul_78, parameter_9)
+        del matmul_78
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_98 = paddle._C_ops.add(layer_norm_54, add_97)
+        del add_97, layer_norm_54
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_98, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_98
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_79 = paddle._C_ops.matmul(layer_norm_57, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_99 = paddle._C_ops.add(matmul_79, parameter_5)
+        del matmul_79
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_48 = paddle._C_ops.scale(add_99, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_9 = paddle._C_ops.pow(add_99, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_49 = paddle._C_ops.scale(pow_9, full_4, float("0"), True)
+        del pow_9
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_100 = paddle._C_ops.add(add_99, scale_49)
+        del add_99, scale_49
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_50 = paddle._C_ops.scale(add_100, full_5, float("0"), True)
+        del add_100
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_10 = paddle._C_ops.tanh(scale_50)
+        del scale_50
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_51 = paddle._C_ops.scale(tanh_10, full_6, float("1"), True)
+        del tanh_10
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_9 = paddle._C_ops.multiply(scale_48, scale_51)
+        del scale_48, scale_51
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_80 = paddle._C_ops.matmul(multiply_9, parameter_4, False, False)
+        del multiply_9
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_101 = paddle._C_ops.add(matmul_80, parameter_3)
+        del matmul_80
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_102 = paddle._C_ops.add(add_101, layer_norm_57)
+        del add_101, layer_norm_57
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_102, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_102
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_81 = paddle._C_ops.matmul(layer_norm_60, parameter_16, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_103 = paddle._C_ops.add(matmul_81, parameter_15)
+        del matmul_81
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_60, parameter_14, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_104 = paddle._C_ops.add(matmul_82, parameter_13)
+        del matmul_82
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_83 = paddle._C_ops.matmul(layer_norm_60, parameter_12, False, False)
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_105 = paddle._C_ops.add(matmul_83, parameter_11)
+        del matmul_83
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(add_103, full_int_array_4)
+        del add_103
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_40 = paddle._C_ops.transpose(reshape_40, [0, 2, 1, 3])
+        del reshape_40
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_41 = paddle._C_ops.reshape(add_104, full_int_array_4)
+        del add_104
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_41 = paddle._C_ops.transpose(reshape_41, [0, 2, 1, 3])
+        del reshape_41
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(add_105, full_int_array_4)
+        del add_105
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_42 = paddle._C_ops.transpose(reshape_42, [0, 2, 1, 3])
+        del reshape_42
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_84 = paddle._C_ops.matmul(transpose_40, transpose_41, False, True)
+        del transpose_40, transpose_41
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_52 = paddle._C_ops.scale(matmul_84, full_2, float("0"), True)
+        del matmul_84
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_106 = paddle._C_ops.add(scale_52, scale_1)
+        del scale_52
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_10 = paddle._C_ops.softmax(add_106, -1)
+        del add_106
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_85 = paddle._C_ops.matmul(softmax_10, transpose_42, False, False)
+        del softmax_10, transpose_42
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_43 = paddle._C_ops.transpose(matmul_85, [0, 2, 1, 3])
+        del matmul_85
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_43 = paddle._C_ops.reshape(transpose_43, full_int_array_5)
+        del transpose_43
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_86 = paddle._C_ops.matmul(reshape_43, parameter_10, False, False)
+        del reshape_43
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_107 = paddle._C_ops.add(matmul_86, parameter_9)
+        del matmul_86
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_108 = paddle._C_ops.add(layer_norm_60, add_107)
+        del add_107, layer_norm_60
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_108, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_108
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_87 = paddle._C_ops.matmul(layer_norm_63, parameter_6, False, False)
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_109 = paddle._C_ops.add(matmul_87, parameter_5)
+        del matmul_87
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_53 = paddle._C_ops.scale(add_109, full_3, float("0"), True)
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_10 = paddle._C_ops.pow(add_109, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_54 = paddle._C_ops.scale(pow_10, full_4, float("0"), True)
+        del pow_10
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_110 = paddle._C_ops.add(add_109, scale_54)
+        del add_109, scale_54
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_55 = paddle._C_ops.scale(add_110, full_5, float("0"), True)
+        del add_110
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_11 = paddle._C_ops.tanh(scale_55)
+        del scale_55
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_56 = paddle._C_ops.scale(tanh_11, full_6, float("1"), True)
+        del tanh_11
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_10 = paddle._C_ops.multiply(scale_53, scale_56)
+        del scale_53, scale_56
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_88 = paddle._C_ops.matmul(multiply_10, parameter_4, False, False)
+        del multiply_10
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_111 = paddle._C_ops.add(matmul_88, parameter_3)
+        del matmul_88
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_112 = paddle._C_ops.add(add_111, layer_norm_63)
+        del add_111, layer_norm_63
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_112, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_112
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_89 = paddle._C_ops.matmul(layer_norm_66, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_113 = paddle._C_ops.add(matmul_89, parameter_15)
+        del matmul_89, parameter_15
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_66, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_114 = paddle._C_ops.add(matmul_90, parameter_13)
+        del matmul_90, parameter_13
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_66, parameter_12, False, False)
+        del parameter_12
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_115 = paddle._C_ops.add(matmul_91, parameter_11)
+        del matmul_91, parameter_11
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(add_113, full_int_array_4)
+        del add_113
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_44 = paddle._C_ops.transpose(reshape_44, [0, 2, 1, 3])
+        del reshape_44
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(add_114, full_int_array_4)
+        del add_114
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_45 = paddle._C_ops.transpose(reshape_45, [0, 2, 1, 3])
+        del reshape_45
+
+        # pd_op.reshape: (1x21x12x64xf32) <- (1x21x768xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(add_115, full_int_array_4)
+        del add_115, full_int_array_4
+
+        # pd_op.transpose: (1x12x21x64xf32) <- (1x21x12x64xf32)
+        transpose_46 = paddle._C_ops.transpose(reshape_46, [0, 2, 1, 3])
+        del reshape_46
+
+        # pd_op.matmul: (1x12x21x21xf32) <- (1x12x21x64xf32, 1x12x21x64xf32)
+        matmul_92 = paddle._C_ops.matmul(transpose_44, transpose_45, False, True)
+        del transpose_44, transpose_45
+
+        # pd_op.scale: (1x12x21x21xf32) <- (1x12x21x21xf32, 1xf32)
+        scale_57 = paddle._C_ops.scale(matmul_92, full_2, float("0"), True)
+        del full_2, matmul_92
+
+        # pd_op.add: (1x12x21x21xf32) <- (1x12x21x21xf32, 1x1x1x21xf32)
+        add_116 = paddle._C_ops.add(scale_57, scale_1)
+        del scale_1, scale_57
+
+        # pd_op.softmax: (1x12x21x21xf32) <- (1x12x21x21xf32)
+        softmax_11 = paddle._C_ops.softmax(add_116, -1)
+        del add_116
+
+        # pd_op.matmul: (1x12x21x64xf32) <- (1x12x21x21xf32, 1x12x21x64xf32)
+        matmul_93 = paddle._C_ops.matmul(softmax_11, transpose_46, False, False)
+        del softmax_11, transpose_46
+
+        # pd_op.transpose: (1x21x12x64xf32) <- (1x12x21x64xf32)
+        transpose_47 = paddle._C_ops.transpose(matmul_93, [0, 2, 1, 3])
+        del matmul_93
+
+        # pd_op.reshape: (1x21x768xf32) <- (1x21x12x64xf32, 3xi64)
+        reshape_47 = paddle._C_ops.reshape(transpose_47, full_int_array_5)
+        del full_int_array_5, transpose_47
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x768xf32, 768x768xf32)
+        matmul_94 = paddle._C_ops.matmul(reshape_47, parameter_10, False, False)
+        del parameter_10, reshape_47
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_117 = paddle._C_ops.add(matmul_94, parameter_9)
+        del matmul_94, parameter_9
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_118 = paddle._C_ops.add(layer_norm_66, add_117)
+        del add_117, layer_norm_66
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_118, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_118, parameter_7, parameter_8
+
+        # pd_op.matmul: (1x21x3072xf32) <- (1x21x768xf32, 768x3072xf32)
+        matmul_95 = paddle._C_ops.matmul(layer_norm_69, parameter_6, False, False)
+        del parameter_6
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 3072xf32)
+        add_119 = paddle._C_ops.add(matmul_95, parameter_5)
+        del matmul_95, parameter_5
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_58 = paddle._C_ops.scale(add_119, full_3, float("0"), True)
+        del full_3
+
+        # pd_op.pow: (1x21x3072xf32) <- (1x21x3072xf32)
+        pow_11 = paddle._C_ops.pow(add_119, float("3"))
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_59 = paddle._C_ops.scale(pow_11, full_4, float("0"), True)
+        del full_4, pow_11
+
+        # pd_op.add: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        add_120 = paddle._C_ops.add(add_119, scale_59)
+        del add_119, scale_59
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_60 = paddle._C_ops.scale(add_120, full_5, float("0"), True)
+        del add_120, full_5
+
+        # pd_op.tanh: (1x21x3072xf32) <- (1x21x3072xf32)
+        tanh_12 = paddle._C_ops.tanh(scale_60)
+        del scale_60
+
+        # pd_op.scale: (1x21x3072xf32) <- (1x21x3072xf32, 1xf32)
+        scale_61 = paddle._C_ops.scale(tanh_12, full_6, float("1"), True)
+        del full_6, tanh_12
+
+        # pd_op.multiply: (1x21x3072xf32) <- (1x21x3072xf32, 1x21x3072xf32)
+        multiply_11 = paddle._C_ops.multiply(scale_58, scale_61)
+        del scale_58, scale_61
+
+        # pd_op.matmul: (1x21x768xf32) <- (1x21x3072xf32, 3072x768xf32)
+        matmul_96 = paddle._C_ops.matmul(multiply_11, parameter_4, False, False)
+        del multiply_11, parameter_4
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 768xf32)
+        add_121 = paddle._C_ops.add(matmul_96, parameter_3)
+        del matmul_96, parameter_3
+
+        # pd_op.add: (1x21x768xf32) <- (1x21x768xf32, 1x21x768xf32)
+        add_122 = paddle._C_ops.add(add_121, layer_norm_69)
+        del add_121, layer_norm_69
+
+        # pd_op.layer_norm: (1x21x768xf32, 1x21xf32, 1x21xf32) <- (1x21x768xf32, 768xf32, 768xf32)
+        layer_norm_72, layer_norm_73, layer_norm_74 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_122, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_122, parameter_17, parameter_18
+
+        # pd_op.slice: (1x768xf32) <- (1x21x768xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            layer_norm_72, [1], full_int_array_2, full_int_array_0, [1], [1]
+        )
+        del full_int_array_0, full_int_array_2
+
+        # pd_op.matmul: (1x768xf32) <- (1x768xf32, 768x768xf32)
+        matmul_97 = paddle._C_ops.matmul(slice_1, parameter_2, False, False)
+        del parameter_2, slice_1
+
+        # pd_op.add: (1x768xf32) <- (1x768xf32, 768xf32)
+        add_123 = paddle._C_ops.add(matmul_97, parameter_1)
+        del matmul_97, parameter_1
+
+        # pd_op.tanh: (1x768xf32) <- (1x768xf32)
+        tanh_0 = paddle._C_ops.tanh(add_123)
+        del add_123, layer_norm_72
+
+        return tanh_0
diff --git a/paddle_samples/PaddleNLP/albert-base-v2/weight_meta.py b/paddle_samples/PaddleNLP/albert-base-v2/weight_meta.py
new file mode 100644
index 000000000..7588cbb6e
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-base-v2/weight_meta.py
@@ -0,0 +1,235 @@
+class Program_weight_tensor_parameter_0:  # metadata record for tensor "parameter_0"
+    name = "parameter_0"
+    shape = [1, 512]
+    dtype = "int64"
+    min_val = 0  # min_val/max_val: presumably the value range (0..511) — TODO confirm
+    max_val = 511
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_1:  # metadata record for tensor "parameter_1"
+    name = "parameter_1"
+    shape = [768]  # NOTE(review): looks like the bias added after the parameter_2 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_2:  # metadata record for tensor "parameter_2"
+    name = "parameter_2"
+    shape = [768, 768]  # NOTE(review): looks like the matmul weight applied to the sliced 1x768 row in the preceding model.py hunk — confirm
+    dtype = "float32"
+    min_val = float("-0.0887999")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.107858")
+    mean = float("9.86864e-06")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0199999")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_3:  # metadata record for tensor "parameter_3"
+    name = "parameter_3"
+    shape = [768]  # NOTE(review): looks like the bias added after the parameter_4 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_4:  # metadata record for tensor "parameter_4"
+    name = "parameter_4"
+    shape = [3072, 768]  # NOTE(review): looks like the 3072->768 matmul weight in the preceding model.py hunk — confirm
+    dtype = "float32"
+    min_val = float("-0.101961")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.107306")
+    mean = float("-1.01831e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0199888")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_5:  # metadata record for tensor "parameter_5"
+    name = "parameter_5"
+    shape = [3072]  # NOTE(review): looks like the bias added after the parameter_6 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_6:  # metadata record for tensor "parameter_6"
+    name = "parameter_6"
+    shape = [768, 3072]  # NOTE(review): looks like the 768->3072 matmul weight in the preceding model.py hunk — confirm
+    dtype = "float32"
+    min_val = float("-0.0937552")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.100479")
+    mean = float("4.65614e-06")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0200006")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_7:  # metadata record for tensor "parameter_7"
+    name = "parameter_7"
+    shape = [768]  # NOTE(review): passed to layer_norm right after parameter_8 in the preceding model.py hunk; presumably the bias — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_8:  # metadata record for tensor "parameter_8"
+    name = "parameter_8"
+    shape = [768]  # NOTE(review): passed to layer_norm right after its input in the preceding model.py hunk; presumably the scale — confirm
+    dtype = "float32"
+    min_val = float("1.0")  # min_val, max_val and mean are all 1.0; no std is recorded
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_9:  # metadata record for tensor "parameter_9"
+    name = "parameter_9"
+    shape = [768]  # NOTE(review): looks like the bias added after the parameter_10 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_10:  # metadata record for tensor "parameter_10"
+    name = "parameter_10"
+    shape = [768, 768]  # NOTE(review): looks like the matmul weight applied to the reshaped attention output in the preceding model.py hunk — confirm
+    dtype = "float32"
+    min_val = float("-0.0929592")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0998105")
+    mean = float("4.01795e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0200188")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_11:  # metadata record for tensor "parameter_11"
+    name = "parameter_11"
+    shape = [768]  # NOTE(review): looks like the bias added after the parameter_12 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_12:  # metadata record for tensor "parameter_12"
+    name = "parameter_12"
+    shape = [768, 768]  # NOTE(review): its matmul output is later multiplied by the softmax result in the preceding model.py hunk (value-like projection) — confirm
+    dtype = "float32"
+    min_val = float("-0.0910254")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0940073")
+    mean = float("1.1987e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.02001")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_13:  # metadata record for tensor "parameter_13"
+    name = "parameter_13"
+    shape = [768]  # NOTE(review): looks like the bias added after the parameter_14 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_14:  # metadata record for tensor "parameter_14"
+    name = "parameter_14"
+    shape = [768, 768]  # NOTE(review): its matmul output becomes the transposed operand of the score matmul in the preceding model.py hunk (key-like projection) — confirm
+    dtype = "float32"
+    min_val = float("-0.0958641")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0952735")
+    mean = float("-4.36463e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0199898")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_15:  # metadata record for tensor "parameter_15"
+    name = "parameter_15"
+    shape = [768]  # NOTE(review): looks like the bias added after the parameter_16 matmul in the preceding model.py hunk — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_16:  # metadata record for tensor "parameter_16"
+    name = "parameter_16"
+    shape = [768, 768]  # NOTE(review): its matmul output becomes the first operand of the score matmul in the preceding model.py hunk (query-like projection) — confirm
+    dtype = "float32"
+    min_val = float("-0.0925532")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0965498")
+    mean = float("-4.51693e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0200245")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_17:  # metadata record for tensor "parameter_17"
+    name = "parameter_17"
+    shape = [768]  # NOTE(review): passed to layer_norm right after parameter_18 in the preceding model.py hunk; presumably the bias — confirm
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_18:  # metadata record for tensor "parameter_18"
+    name = "parameter_18"
+    shape = [768]  # NOTE(review): passed to layer_norm right after its input in the preceding model.py hunk; presumably the scale — confirm
+    dtype = "float32"
+    min_val = float("1.0")  # min_val, max_val and mean are all 1.0; no std is recorded
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_19:  # metadata record for tensor "parameter_19"
+    name = "parameter_19"
+    shape = [768]  # NOTE(review): its use is not visible near this hunk — confirm its role in model.py
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_20:  # metadata record for tensor "parameter_20"
+    name = "parameter_20"
+    shape = [128, 768]  # presumably a 128->768 projection weight — TODO confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0868425")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0925445")
+    mean = float("-0.000119993")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0200474")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_21:  # metadata record for tensor "parameter_21"
+    name = "parameter_21"
+    shape = [128]  # NOTE(review): its use is not visible near this hunk — confirm its role in model.py
+    dtype = "float32"
+    data = None  # no literal values and no min/max/mean/std recorded
+
+
+class Program_weight_tensor_parameter_22:  # metadata record for tensor "parameter_22"
+    name = "parameter_22"
+    shape = [128]  # presumably a normalization scale over 128 features — TODO confirm in model.py
+    dtype = "float32"
+    min_val = float("1.0")  # min_val, max_val and mean are all 1.0; no std is recorded
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_23:  # metadata record for tensor "parameter_23"
+    name = "parameter_23"
+    shape = [2, 128]  # presumably a 2-row embedding table — TODO confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0554865")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0556627")
+    mean = float("-0.00351806")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0195563")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_24:  # metadata record for tensor "parameter_24"
+    name = "parameter_24"
+    shape = [512, 128]  # presumably a 512-row embedding table — TODO confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0802345")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.0806108")
+    mean = float("4.65631e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0200016")
+    data = None  # no literal tensor values are stored here
+
+
+class Program_weight_tensor_parameter_25:  # metadata record for tensor "parameter_25"
+    name = "parameter_25"
+    shape = [30000, 128]  # presumably a 30000-row embedding table — TODO confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.103231")  # min_val/max_val: presumably the value range — TODO confirm
+    max_val = float("0.10054")
+    mean = float("-1.42952e-05")  # mean/std: presumably value statistics — TODO confirm
+    std = float("0.0199984")
+    data = None  # no literal tensor values are stored here
diff --git a/paddle_samples/PaddleNLP/albert-chinese-base/graph_net.json b/paddle_samples/PaddleNLP/albert-chinese-base/graph_net.json
new file mode 100644
index 000000000..16cf1535c
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-base/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "albert-chinese-base",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/albert-chinese-base/input_meta.py b/paddle_samples/PaddleNLP/albert-chinese-base/input_meta.py
new file mode 100644
index 000000000..3708564f7
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-base/input_meta.py
@@ -0,0 +1,19 @@
+class Program_weight_tensor_data_0:  # metadata record for input tensor "data_0"
+    name = "data_0"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [101, 3614, 6816, 886, 4500, 4636, 2428, 7607, 3444, 106, 102]  # 11 literal values, matching shape [1, 11]; presumably token ids — TODO confirm
+
+
+class Program_weight_tensor_data_1:  # metadata record for input tensor "data_1"
+    name = "data_1"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # all ones; model.py's forward() unsqueezes, casts to float32 and rescales data_1 — presumably an attention mask, TODO confirm
+
+
+class Program_weight_tensor_data_2:  # metadata record for input tensor "data_2"
+    name = "data_2"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # all zeros, 11 values matching shape [1, 11]; presumably token-type ids — TODO confirm
diff --git a/paddle_samples/PaddleNLP/albert-chinese-base/model.py b/paddle_samples/PaddleNLP/albert-chinese-base/model.py
new file mode 100644
index 000000000..51da31093
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-base/model.py
@@ -0,0 +1,1670 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [1]
+
+        # pd_op.unsqueeze: (1x1x11xi64) <- (1x11xi64, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(data_1, full_int_array_0)
+        del data_1
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [2]
+
+        # pd_op.unsqueeze: (1x1x1x11xi64) <- (1x1x11xi64, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.cast: (1x1x1x11xf32) <- (1x1x1x11xi64)
+        cast_0 = paddle._C_ops.cast(unsqueeze_1, paddle.float32)
+        del unsqueeze_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x11xf32) <- (1x1x1x11xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0, full_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x11xf32) <- (1x1x1x11xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(scale_0, full_1, float("0"), True)
+        del full_1, scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_2 = [0]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [11]
+
+        # pd_op.slice: (1x11xi64) <- (1x512xi64, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            parameter_0, [1], full_int_array_2, full_int_array_3, [1], []
+        )
+        del full_int_array_3, parameter_0
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 21128x128xf32)
+        embedding_0 = paddle._C_ops.embedding(data_0, parameter_25, 0, False)
+        del data_0, parameter_25
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 2x128xf32)
+        embedding_1 = paddle._C_ops.embedding(data_2, parameter_23, -1, False)
+        del data_2, parameter_23
+
+        # pd_op.add: (1x11x128xf32) <- (1x11x128xf32, 1x11x128xf32)
+        add_0 = paddle._C_ops.add(embedding_0, embedding_1)
+        del embedding_0, embedding_1
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 512x128xf32)
+        embedding_2 = paddle._C_ops.embedding(slice_0, parameter_24, -1, False)
+        del parameter_24, slice_0
+
+        # pd_op.add: (1x11x128xf32) <- (1x11x128xf32, 1x11x128xf32)
+        add_1 = paddle._C_ops.add(add_0, embedding_2)
+        del add_0, embedding_2
+
+        # pd_op.layer_norm: (1x11x128xf32, 1x11xf32, 1x11xf32) <- (1x11x128xf32, 128xf32, 128xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_1, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_1, parameter_21, parameter_22
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x128xf32, 128x768xf32)
+        matmul_0 = paddle._C_ops.matmul(layer_norm_0, parameter_20, False, False)
+        del layer_norm_0, parameter_20
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_2 = paddle._C_ops.add(matmul_0, parameter_19)
+        del matmul_0, parameter_19
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_1 = paddle._C_ops.matmul(add_2, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_3 = paddle._C_ops.add(matmul_1, parameter_15)
+        del matmul_1
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_2 = paddle._C_ops.matmul(add_2, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_4 = paddle._C_ops.add(matmul_2, parameter_13)
+        del matmul_2
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_3 = paddle._C_ops.matmul(add_2, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_5 = paddle._C_ops.add(matmul_3, parameter_11)
+        del matmul_3
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_4 = [1, 11, 12, 64]
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(add_3, full_int_array_4)
+        del add_3
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_0, [0, 2, 1, 3])
+        del reshape_0
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(add_4, full_int_array_4)
+        del add_4
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(add_5, full_int_array_4)
+        del add_5
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_4 = paddle._C_ops.matmul(transpose_0, transpose_1, False, True)
+        del transpose_0, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(matmul_4, full_2, float("0"), True)
+        del matmul_4
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_6 = paddle._C_ops.add(scale_2, scale_1)
+        del scale_2
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_0 = paddle._C_ops.softmax(add_6, -1)
+        del add_6
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_5 = paddle._C_ops.matmul(softmax_0, transpose_2, False, False)
+        del softmax_0, transpose_2
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_3 = paddle._C_ops.transpose(matmul_5, [0, 2, 1, 3])
+        del matmul_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_5 = [0, 0, -1]
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_3 = paddle._C_ops.reshape(transpose_3, full_int_array_5)
+        del transpose_3
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_6 = paddle._C_ops.matmul(reshape_3, parameter_10, False, False)
+        del reshape_3
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_7 = paddle._C_ops.add(matmul_6, parameter_9)
+        del matmul_6
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_8 = paddle._C_ops.add(add_2, add_7)
+        del add_2, add_7
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_8, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_8
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_9 = paddle._C_ops.add(matmul_7, parameter_5)
+        del matmul_7
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_0 = paddle._C_ops.relu(add_9)
+        del add_9
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_8 = paddle._C_ops.matmul(relu_0, parameter_4, False, False)
+        del relu_0
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_10 = paddle._C_ops.add(matmul_8, parameter_3)
+        del matmul_8
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_11 = paddle._C_ops.add(add_10, layer_norm_3)
+        del add_10, layer_norm_3
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_11, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_11
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_9 = paddle._C_ops.matmul(layer_norm_6, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_12 = paddle._C_ops.add(matmul_9, parameter_15)
+        del matmul_9
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_13 = paddle._C_ops.add(matmul_10, parameter_13)
+        del matmul_10
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_11 = paddle._C_ops.matmul(layer_norm_6, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_14 = paddle._C_ops.add(matmul_11, parameter_11)
+        del matmul_11
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(add_12, full_int_array_4)
+        del add_12
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_4 = paddle._C_ops.transpose(reshape_4, [0, 2, 1, 3])
+        del reshape_4
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(add_13, full_int_array_4)
+        del add_13
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(add_14, full_int_array_4)
+        del add_14
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_12 = paddle._C_ops.matmul(transpose_4, transpose_5, False, True)
+        del transpose_4, transpose_5
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(matmul_12, full_2, float("0"), True)
+        del matmul_12
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_15 = paddle._C_ops.add(scale_3, scale_1)
+        del scale_3
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_1 = paddle._C_ops.softmax(add_15, -1)
+        del add_15
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_13 = paddle._C_ops.matmul(softmax_1, transpose_6, False, False)
+        del softmax_1, transpose_6
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_7 = paddle._C_ops.transpose(matmul_13, [0, 2, 1, 3])
+        del matmul_13
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_7 = paddle._C_ops.reshape(transpose_7, full_int_array_5)
+        del transpose_7
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_14 = paddle._C_ops.matmul(reshape_7, parameter_10, False, False)
+        del reshape_7
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_16 = paddle._C_ops.add(matmul_14, parameter_9)
+        del matmul_14
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_17 = paddle._C_ops.add(layer_norm_6, add_16)
+        del add_16, layer_norm_6
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_17, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_17
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_15 = paddle._C_ops.matmul(layer_norm_9, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_18 = paddle._C_ops.add(matmul_15, parameter_5)
+        del matmul_15
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_1 = paddle._C_ops.relu(add_18)
+        del add_18
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_16 = paddle._C_ops.matmul(relu_1, parameter_4, False, False)
+        del relu_1
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_19 = paddle._C_ops.add(matmul_16, parameter_3)
+        del matmul_16
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_20 = paddle._C_ops.add(add_19, layer_norm_9)
+        del add_19, layer_norm_9
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_20, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_20
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_17 = paddle._C_ops.matmul(layer_norm_12, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_21 = paddle._C_ops.add(matmul_17, parameter_15)
+        del matmul_17
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_12, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_22 = paddle._C_ops.add(matmul_18, parameter_13)
+        del matmul_18
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_12, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_23 = paddle._C_ops.add(matmul_19, parameter_11)
+        del matmul_19
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(add_21, full_int_array_4)
+        del add_21
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_8 = paddle._C_ops.transpose(reshape_8, [0, 2, 1, 3])
+        del reshape_8
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(add_22, full_int_array_4)
+        del add_22
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(add_23, full_int_array_4)
+        del add_23
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_20 = paddle._C_ops.matmul(transpose_8, transpose_9, False, True)
+        del transpose_8, transpose_9
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(matmul_20, full_2, float("0"), True)
+        del matmul_20
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_24 = paddle._C_ops.add(scale_4, scale_1)
+        del scale_4
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_2 = paddle._C_ops.softmax(add_24, -1)
+        del add_24
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_21 = paddle._C_ops.matmul(softmax_2, transpose_10, False, False)
+        del softmax_2, transpose_10
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_11 = paddle._C_ops.transpose(matmul_21, [0, 2, 1, 3])
+        del matmul_21
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_11 = paddle._C_ops.reshape(transpose_11, full_int_array_5)
+        del transpose_11
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_22 = paddle._C_ops.matmul(reshape_11, parameter_10, False, False)
+        del reshape_11
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_25 = paddle._C_ops.add(matmul_22, parameter_9)
+        del matmul_22
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_26 = paddle._C_ops.add(layer_norm_12, add_25)
+        del add_25, layer_norm_12
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_26, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_26
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_23 = paddle._C_ops.matmul(layer_norm_15, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_27 = paddle._C_ops.add(matmul_23, parameter_5)
+        del matmul_23
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_2 = paddle._C_ops.relu(add_27)
+        del add_27
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_24 = paddle._C_ops.matmul(relu_2, parameter_4, False, False)
+        del relu_2
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_28 = paddle._C_ops.add(matmul_24, parameter_3)
+        del matmul_24
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_29 = paddle._C_ops.add(add_28, layer_norm_15)
+        del add_28, layer_norm_15
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_29, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_29
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_18, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_30 = paddle._C_ops.add(matmul_25, parameter_15)
+        del matmul_25
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_18, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_31 = paddle._C_ops.add(matmul_26, parameter_13)
+        del matmul_26
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_27 = paddle._C_ops.matmul(layer_norm_18, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_32 = paddle._C_ops.add(matmul_27, parameter_11)
+        del matmul_27
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(add_30, full_int_array_4)
+        del add_30
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_12 = paddle._C_ops.transpose(reshape_12, [0, 2, 1, 3])
+        del reshape_12
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(add_31, full_int_array_4)
+        del add_31
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(add_32, full_int_array_4)
+        del add_32
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_28 = paddle._C_ops.matmul(transpose_12, transpose_13, False, True)
+        del transpose_12, transpose_13
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(matmul_28, full_2, float("0"), True)
+        del matmul_28
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_33 = paddle._C_ops.add(scale_5, scale_1)
+        del scale_5
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_3 = paddle._C_ops.softmax(add_33, -1)
+        del add_33
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_29 = paddle._C_ops.matmul(softmax_3, transpose_14, False, False)
+        del softmax_3, transpose_14
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_15 = paddle._C_ops.transpose(matmul_29, [0, 2, 1, 3])
+        del matmul_29
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_15 = paddle._C_ops.reshape(transpose_15, full_int_array_5)
+        del transpose_15
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_30 = paddle._C_ops.matmul(reshape_15, parameter_10, False, False)
+        del reshape_15
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_34 = paddle._C_ops.add(matmul_30, parameter_9)
+        del matmul_30
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_35 = paddle._C_ops.add(layer_norm_18, add_34)
+        del add_34, layer_norm_18
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_35, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_35
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_21, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_36 = paddle._C_ops.add(matmul_31, parameter_5)
+        del matmul_31
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_3 = paddle._C_ops.relu(add_36)
+        del add_36
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_32 = paddle._C_ops.matmul(relu_3, parameter_4, False, False)
+        del relu_3
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_37 = paddle._C_ops.add(matmul_32, parameter_3)
+        del matmul_32
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_38 = paddle._C_ops.add(add_37, layer_norm_21)
+        del add_37, layer_norm_21
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_38, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_38
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_33 = paddle._C_ops.matmul(layer_norm_24, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_39 = paddle._C_ops.add(matmul_33, parameter_15)
+        del matmul_33
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_24, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_40 = paddle._C_ops.add(matmul_34, parameter_13)
+        del matmul_34
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_35 = paddle._C_ops.matmul(layer_norm_24, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_41 = paddle._C_ops.add(matmul_35, parameter_11)
+        del matmul_35
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(add_39, full_int_array_4)
+        del add_39
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_16 = paddle._C_ops.transpose(reshape_16, [0, 2, 1, 3])
+        del reshape_16
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(add_40, full_int_array_4)
+        del add_40
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_17 = paddle._C_ops.transpose(reshape_17, [0, 2, 1, 3])
+        del reshape_17
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(add_41, full_int_array_4)
+        del add_41
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_18 = paddle._C_ops.transpose(reshape_18, [0, 2, 1, 3])
+        del reshape_18
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_36 = paddle._C_ops.matmul(transpose_16, transpose_17, False, True)
+        del transpose_16, transpose_17
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(matmul_36, full_2, float("0"), True)
+        del matmul_36
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_42 = paddle._C_ops.add(scale_6, scale_1)
+        del scale_6
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_4 = paddle._C_ops.softmax(add_42, -1)
+        del add_42
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_37 = paddle._C_ops.matmul(softmax_4, transpose_18, False, False)
+        del softmax_4, transpose_18
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_19 = paddle._C_ops.transpose(matmul_37, [0, 2, 1, 3])
+        del matmul_37
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_19 = paddle._C_ops.reshape(transpose_19, full_int_array_5)
+        del transpose_19
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_38 = paddle._C_ops.matmul(reshape_19, parameter_10, False, False)
+        del reshape_19
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_43 = paddle._C_ops.add(matmul_38, parameter_9)
+        del matmul_38
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_44 = paddle._C_ops.add(layer_norm_24, add_43)
+        del add_43, layer_norm_24
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_44, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_44
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_39 = paddle._C_ops.matmul(layer_norm_27, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_45 = paddle._C_ops.add(matmul_39, parameter_5)
+        del matmul_39
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_4 = paddle._C_ops.relu(add_45)
+        del add_45
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_40 = paddle._C_ops.matmul(relu_4, parameter_4, False, False)
+        del relu_4
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_46 = paddle._C_ops.add(matmul_40, parameter_3)
+        del matmul_40
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_47 = paddle._C_ops.add(add_46, layer_norm_27)
+        del add_46, layer_norm_27
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_47, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_47
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_41 = paddle._C_ops.matmul(layer_norm_30, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_48 = paddle._C_ops.add(matmul_41, parameter_15)
+        del matmul_41
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_30, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_49 = paddle._C_ops.add(matmul_42, parameter_13)
+        del matmul_42
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_30, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_50 = paddle._C_ops.add(matmul_43, parameter_11)
+        del matmul_43
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_20 = paddle._C_ops.reshape(add_48, full_int_array_4)
+        del add_48
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_20 = paddle._C_ops.transpose(reshape_20, [0, 2, 1, 3])
+        del reshape_20
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(add_49, full_int_array_4)
+        del add_49
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_21 = paddle._C_ops.transpose(reshape_21, [0, 2, 1, 3])
+        del reshape_21
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(add_50, full_int_array_4)
+        del add_50
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_22 = paddle._C_ops.transpose(reshape_22, [0, 2, 1, 3])
+        del reshape_22
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_44 = paddle._C_ops.matmul(transpose_20, transpose_21, False, True)
+        del transpose_20, transpose_21
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(matmul_44, full_2, float("0"), True)
+        del matmul_44
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_51 = paddle._C_ops.add(scale_7, scale_1)
+        del scale_7
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_5 = paddle._C_ops.softmax(add_51, -1)
+        del add_51
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_45 = paddle._C_ops.matmul(softmax_5, transpose_22, False, False)
+        del softmax_5, transpose_22
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_23 = paddle._C_ops.transpose(matmul_45, [0, 2, 1, 3])
+        del matmul_45
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_23 = paddle._C_ops.reshape(transpose_23, full_int_array_5)
+        del transpose_23
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_46 = paddle._C_ops.matmul(reshape_23, parameter_10, False, False)
+        del reshape_23
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_52 = paddle._C_ops.add(matmul_46, parameter_9)
+        del matmul_46
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_53 = paddle._C_ops.add(layer_norm_30, add_52)
+        del add_52, layer_norm_30
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_53, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_53
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_47 = paddle._C_ops.matmul(layer_norm_33, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_54 = paddle._C_ops.add(matmul_47, parameter_5)
+        del matmul_47
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_5 = paddle._C_ops.relu(add_54)
+        del add_54
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_48 = paddle._C_ops.matmul(relu_5, parameter_4, False, False)
+        del relu_5
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_55 = paddle._C_ops.add(matmul_48, parameter_3)
+        del matmul_48
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_56 = paddle._C_ops.add(add_55, layer_norm_33)
+        del add_55, layer_norm_33
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_56, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_56
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_49 = paddle._C_ops.matmul(layer_norm_36, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_57 = paddle._C_ops.add(matmul_49, parameter_15)
+        del matmul_49
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_50 = paddle._C_ops.matmul(layer_norm_36, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_58 = paddle._C_ops.add(matmul_50, parameter_13)
+        del matmul_50
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_51 = paddle._C_ops.matmul(layer_norm_36, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_59 = paddle._C_ops.add(matmul_51, parameter_11)
+        del matmul_51
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(add_57, full_int_array_4)
+        del add_57
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_24 = paddle._C_ops.transpose(reshape_24, [0, 2, 1, 3])
+        del reshape_24
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(add_58, full_int_array_4)
+        del add_58
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_25 = paddle._C_ops.transpose(reshape_25, [0, 2, 1, 3])
+        del reshape_25
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(add_59, full_int_array_4)
+        del add_59
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_26 = paddle._C_ops.transpose(reshape_26, [0, 2, 1, 3])
+        del reshape_26
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_52 = paddle._C_ops.matmul(transpose_24, transpose_25, False, True)
+        del transpose_24, transpose_25
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_8 = paddle._C_ops.scale(matmul_52, full_2, float("0"), True)
+        del matmul_52
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_60 = paddle._C_ops.add(scale_8, scale_1)
+        del scale_8
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_6 = paddle._C_ops.softmax(add_60, -1)
+        del add_60
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_53 = paddle._C_ops.matmul(softmax_6, transpose_26, False, False)
+        del softmax_6, transpose_26
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_27 = paddle._C_ops.transpose(matmul_53, [0, 2, 1, 3])
+        del matmul_53
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(transpose_27, full_int_array_5)
+        del transpose_27
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_54 = paddle._C_ops.matmul(reshape_27, parameter_10, False, False)
+        del reshape_27
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_61 = paddle._C_ops.add(matmul_54, parameter_9)
+        del matmul_54
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_62 = paddle._C_ops.add(layer_norm_36, add_61)
+        del add_61, layer_norm_36
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_39, layer_norm_40, layer_norm_41 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_62, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_62
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_55 = paddle._C_ops.matmul(layer_norm_39, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_63 = paddle._C_ops.add(matmul_55, parameter_5)
+        del matmul_55
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_6 = paddle._C_ops.relu(add_63)
+        del add_63
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_56 = paddle._C_ops.matmul(relu_6, parameter_4, False, False)
+        del relu_6
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_64 = paddle._C_ops.add(matmul_56, parameter_3)
+        del matmul_56
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_65 = paddle._C_ops.add(add_64, layer_norm_39)
+        del add_64, layer_norm_39
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_42, layer_norm_43, layer_norm_44 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_65, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_65
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_57 = paddle._C_ops.matmul(layer_norm_42, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_66 = paddle._C_ops.add(matmul_57, parameter_15)
+        del matmul_57
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_58 = paddle._C_ops.matmul(layer_norm_42, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_67 = paddle._C_ops.add(matmul_58, parameter_13)
+        del matmul_58
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_59 = paddle._C_ops.matmul(layer_norm_42, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_68 = paddle._C_ops.add(matmul_59, parameter_11)
+        del matmul_59
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(add_66, full_int_array_4)
+        del add_66
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_28 = paddle._C_ops.transpose(reshape_28, [0, 2, 1, 3])
+        del reshape_28
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(add_67, full_int_array_4)
+        del add_67
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_29 = paddle._C_ops.transpose(reshape_29, [0, 2, 1, 3])
+        del reshape_29
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(add_68, full_int_array_4)
+        del add_68
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_30 = paddle._C_ops.transpose(reshape_30, [0, 2, 1, 3])
+        del reshape_30
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_60 = paddle._C_ops.matmul(transpose_28, transpose_29, False, True)
+        del transpose_28, transpose_29
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(matmul_60, full_2, float("0"), True)
+        del matmul_60
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_69 = paddle._C_ops.add(scale_9, scale_1)
+        del scale_9
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_7 = paddle._C_ops.softmax(add_69, -1)
+        del add_69
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_61 = paddle._C_ops.matmul(softmax_7, transpose_30, False, False)
+        del softmax_7, transpose_30
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_31 = paddle._C_ops.transpose(matmul_61, [0, 2, 1, 3])
+        del matmul_61
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_31 = paddle._C_ops.reshape(transpose_31, full_int_array_5)
+        del transpose_31
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_62 = paddle._C_ops.matmul(reshape_31, parameter_10, False, False)
+        del reshape_31
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_70 = paddle._C_ops.add(matmul_62, parameter_9)
+        del matmul_62
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_71 = paddle._C_ops.add(layer_norm_42, add_70)
+        del add_70, layer_norm_42
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_45, layer_norm_46, layer_norm_47 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_71, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_71
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_63 = paddle._C_ops.matmul(layer_norm_45, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_72 = paddle._C_ops.add(matmul_63, parameter_5)
+        del matmul_63
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_7 = paddle._C_ops.relu(add_72)
+        del add_72
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_64 = paddle._C_ops.matmul(relu_7, parameter_4, False, False)
+        del relu_7
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_73 = paddle._C_ops.add(matmul_64, parameter_3)
+        del matmul_64
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_74 = paddle._C_ops.add(add_73, layer_norm_45)
+        del add_73, layer_norm_45
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_48, layer_norm_49, layer_norm_50 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_74, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_74
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_65 = paddle._C_ops.matmul(layer_norm_48, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_75 = paddle._C_ops.add(matmul_65, parameter_15)
+        del matmul_65
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_66 = paddle._C_ops.matmul(layer_norm_48, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_76 = paddle._C_ops.add(matmul_66, parameter_13)
+        del matmul_66
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_67 = paddle._C_ops.matmul(layer_norm_48, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_77 = paddle._C_ops.add(matmul_67, parameter_11)
+        del matmul_67
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(add_75, full_int_array_4)
+        del add_75
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_32 = paddle._C_ops.transpose(reshape_32, [0, 2, 1, 3])
+        del reshape_32
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(add_76, full_int_array_4)
+        del add_76
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_33 = paddle._C_ops.transpose(reshape_33, [0, 2, 1, 3])
+        del reshape_33
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_34 = paddle._C_ops.reshape(add_77, full_int_array_4)
+        del add_77
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_34 = paddle._C_ops.transpose(reshape_34, [0, 2, 1, 3])
+        del reshape_34
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_68 = paddle._C_ops.matmul(transpose_32, transpose_33, False, True)
+        del transpose_32, transpose_33
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(matmul_68, full_2, float("0"), True)
+        del matmul_68
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_78 = paddle._C_ops.add(scale_10, scale_1)
+        del scale_10
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_8 = paddle._C_ops.softmax(add_78, -1)
+        del add_78
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_69 = paddle._C_ops.matmul(softmax_8, transpose_34, False, False)
+        del softmax_8, transpose_34
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_35 = paddle._C_ops.transpose(matmul_69, [0, 2, 1, 3])
+        del matmul_69
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_35 = paddle._C_ops.reshape(transpose_35, full_int_array_5)
+        del transpose_35
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_70 = paddle._C_ops.matmul(reshape_35, parameter_10, False, False)
+        del reshape_35
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_79 = paddle._C_ops.add(matmul_70, parameter_9)
+        del matmul_70
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_80 = paddle._C_ops.add(layer_norm_48, add_79)
+        del add_79, layer_norm_48
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_51, layer_norm_52, layer_norm_53 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_80, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_80
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_71 = paddle._C_ops.matmul(layer_norm_51, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_81 = paddle._C_ops.add(matmul_71, parameter_5)
+        del matmul_71
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_8 = paddle._C_ops.relu(add_81)
+        del add_81
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_72 = paddle._C_ops.matmul(relu_8, parameter_4, False, False)
+        del relu_8
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_82 = paddle._C_ops.add(matmul_72, parameter_3)
+        del matmul_72
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_83 = paddle._C_ops.add(add_82, layer_norm_51)
+        del add_82, layer_norm_51
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_54, layer_norm_55, layer_norm_56 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_83, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_83
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_73 = paddle._C_ops.matmul(layer_norm_54, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_84 = paddle._C_ops.add(matmul_73, parameter_15)
+        del matmul_73
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_74 = paddle._C_ops.matmul(layer_norm_54, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_85 = paddle._C_ops.add(matmul_74, parameter_13)
+        del matmul_74
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_75 = paddle._C_ops.matmul(layer_norm_54, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_86 = paddle._C_ops.add(matmul_75, parameter_11)
+        del matmul_75
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(add_84, full_int_array_4)
+        del add_84
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_36 = paddle._C_ops.transpose(reshape_36, [0, 2, 1, 3])
+        del reshape_36
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(add_85, full_int_array_4)
+        del add_85
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_37 = paddle._C_ops.transpose(reshape_37, [0, 2, 1, 3])
+        del reshape_37
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(add_86, full_int_array_4)
+        del add_86
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_38 = paddle._C_ops.transpose(reshape_38, [0, 2, 1, 3])
+        del reshape_38
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_76 = paddle._C_ops.matmul(transpose_36, transpose_37, False, True)
+        del transpose_36, transpose_37
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(matmul_76, full_2, float("0"), True)
+        del matmul_76
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_87 = paddle._C_ops.add(scale_11, scale_1)
+        del scale_11
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_9 = paddle._C_ops.softmax(add_87, -1)
+        del add_87
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_77 = paddle._C_ops.matmul(softmax_9, transpose_38, False, False)
+        del softmax_9, transpose_38
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_39 = paddle._C_ops.transpose(matmul_77, [0, 2, 1, 3])
+        del matmul_77
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_39 = paddle._C_ops.reshape(transpose_39, full_int_array_5)
+        del transpose_39
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_78 = paddle._C_ops.matmul(reshape_39, parameter_10, False, False)
+        del reshape_39
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_88 = paddle._C_ops.add(matmul_78, parameter_9)
+        del matmul_78
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_89 = paddle._C_ops.add(layer_norm_54, add_88)
+        del add_88, layer_norm_54
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_57, layer_norm_58, layer_norm_59 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_89, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_89
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_79 = paddle._C_ops.matmul(layer_norm_57, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_90 = paddle._C_ops.add(matmul_79, parameter_5)
+        del matmul_79
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_9 = paddle._C_ops.relu(add_90)
+        del add_90
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_80 = paddle._C_ops.matmul(relu_9, parameter_4, False, False)
+        del relu_9
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_91 = paddle._C_ops.add(matmul_80, parameter_3)
+        del matmul_80
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_92 = paddle._C_ops.add(add_91, layer_norm_57)
+        del add_91, layer_norm_57
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_60, layer_norm_61, layer_norm_62 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_92, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_92
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_81 = paddle._C_ops.matmul(layer_norm_60, parameter_16, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_93 = paddle._C_ops.add(matmul_81, parameter_15)
+        del matmul_81
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_82 = paddle._C_ops.matmul(layer_norm_60, parameter_14, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_94 = paddle._C_ops.add(matmul_82, parameter_13)
+        del matmul_82
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_83 = paddle._C_ops.matmul(layer_norm_60, parameter_12, False, False)
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_95 = paddle._C_ops.add(matmul_83, parameter_11)
+        del matmul_83
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(add_93, full_int_array_4)
+        del add_93
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_40 = paddle._C_ops.transpose(reshape_40, [0, 2, 1, 3])
+        del reshape_40
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_41 = paddle._C_ops.reshape(add_94, full_int_array_4)
+        del add_94
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_41 = paddle._C_ops.transpose(reshape_41, [0, 2, 1, 3])
+        del reshape_41
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(add_95, full_int_array_4)
+        del add_95
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_42 = paddle._C_ops.transpose(reshape_42, [0, 2, 1, 3])
+        del reshape_42
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_84 = paddle._C_ops.matmul(transpose_40, transpose_41, False, True)
+        del transpose_40, transpose_41
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(matmul_84, full_2, float("0"), True)
+        del matmul_84
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_96 = paddle._C_ops.add(scale_12, scale_1)
+        del scale_12
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_10 = paddle._C_ops.softmax(add_96, -1)
+        del add_96
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_85 = paddle._C_ops.matmul(softmax_10, transpose_42, False, False)
+        del softmax_10, transpose_42
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_43 = paddle._C_ops.transpose(matmul_85, [0, 2, 1, 3])
+        del matmul_85
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_43 = paddle._C_ops.reshape(transpose_43, full_int_array_5)
+        del transpose_43
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_86 = paddle._C_ops.matmul(reshape_43, parameter_10, False, False)
+        del reshape_43
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_97 = paddle._C_ops.add(matmul_86, parameter_9)
+        del matmul_86
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_98 = paddle._C_ops.add(layer_norm_60, add_97)
+        del add_97, layer_norm_60
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_63, layer_norm_64, layer_norm_65 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_98, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_98
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_87 = paddle._C_ops.matmul(layer_norm_63, parameter_6, False, False)
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_99 = paddle._C_ops.add(matmul_87, parameter_5)
+        del matmul_87
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_10 = paddle._C_ops.relu(add_99)
+        del add_99
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_88 = paddle._C_ops.matmul(relu_10, parameter_4, False, False)
+        del relu_10
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_100 = paddle._C_ops.add(matmul_88, parameter_3)
+        del matmul_88
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_101 = paddle._C_ops.add(add_100, layer_norm_63)
+        del add_100, layer_norm_63
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_66, layer_norm_67, layer_norm_68 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_101, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_101
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_89 = paddle._C_ops.matmul(layer_norm_66, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_102 = paddle._C_ops.add(matmul_89, parameter_15)
+        del matmul_89, parameter_15
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_90 = paddle._C_ops.matmul(layer_norm_66, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_103 = paddle._C_ops.add(matmul_90, parameter_13)
+        del matmul_90, parameter_13
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_91 = paddle._C_ops.matmul(layer_norm_66, parameter_12, False, False)
+        del parameter_12
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_104 = paddle._C_ops.add(matmul_91, parameter_11)
+        del matmul_91, parameter_11
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(add_102, full_int_array_4)
+        del add_102
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_44 = paddle._C_ops.transpose(reshape_44, [0, 2, 1, 3])
+        del reshape_44
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(add_103, full_int_array_4)
+        del add_103
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_45 = paddle._C_ops.transpose(reshape_45, [0, 2, 1, 3])
+        del reshape_45
+
+        # pd_op.reshape: (1x11x12x64xf32) <- (1x11x768xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(add_104, full_int_array_4)
+        del add_104, full_int_array_4
+
+        # pd_op.transpose: (1x12x11x64xf32) <- (1x11x12x64xf32)
+        transpose_46 = paddle._C_ops.transpose(reshape_46, [0, 2, 1, 3])
+        del reshape_46
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x64xf32, 1x12x11x64xf32)
+        matmul_92 = paddle._C_ops.matmul(transpose_44, transpose_45, False, True)
+        del transpose_44, transpose_45
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(matmul_92, full_2, float("0"), True)
+        del full_2, matmul_92
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_105 = paddle._C_ops.add(scale_13, scale_1)
+        del scale_1, scale_13
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_11 = paddle._C_ops.softmax(add_105, -1)
+        del add_105
+
+        # pd_op.matmul: (1x12x11x64xf32) <- (1x12x11x11xf32, 1x12x11x64xf32)
+        matmul_93 = paddle._C_ops.matmul(softmax_11, transpose_46, False, False)
+        del softmax_11, transpose_46
+
+        # pd_op.transpose: (1x11x12x64xf32) <- (1x12x11x64xf32)
+        transpose_47 = paddle._C_ops.transpose(matmul_93, [0, 2, 1, 3])
+        del matmul_93
+
+        # pd_op.reshape: (1x11x768xf32) <- (1x11x12x64xf32, 3xi64)
+        reshape_47 = paddle._C_ops.reshape(transpose_47, full_int_array_5)
+        del full_int_array_5, transpose_47
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x768xf32, 768x768xf32)
+        matmul_94 = paddle._C_ops.matmul(reshape_47, parameter_10, False, False)
+        del parameter_10, reshape_47
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_106 = paddle._C_ops.add(matmul_94, parameter_9)
+        del matmul_94, parameter_9
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_107 = paddle._C_ops.add(layer_norm_66, add_106)
+        del add_106, layer_norm_66
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_69, layer_norm_70, layer_norm_71 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_107, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_107, parameter_7, parameter_8
+
+        # pd_op.matmul: (1x11x3072xf32) <- (1x11x768xf32, 768x3072xf32)
+        matmul_95 = paddle._C_ops.matmul(layer_norm_69, parameter_6, False, False)
+        del parameter_6
+
+        # pd_op.add: (1x11x3072xf32) <- (1x11x3072xf32, 3072xf32)
+        add_108 = paddle._C_ops.add(matmul_95, parameter_5)
+        del matmul_95, parameter_5
+
+        # pd_op.relu: (1x11x3072xf32) <- (1x11x3072xf32)
+        relu_11 = paddle._C_ops.relu(add_108)
+        del add_108
+
+        # pd_op.matmul: (1x11x768xf32) <- (1x11x3072xf32, 3072x768xf32)
+        matmul_96 = paddle._C_ops.matmul(relu_11, parameter_4, False, False)
+        del parameter_4, relu_11
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 768xf32)
+        add_109 = paddle._C_ops.add(matmul_96, parameter_3)
+        del matmul_96, parameter_3
+
+        # pd_op.add: (1x11x768xf32) <- (1x11x768xf32, 1x11x768xf32)
+        add_110 = paddle._C_ops.add(add_109, layer_norm_69)
+        del add_109, layer_norm_69
+
+        # pd_op.layer_norm: (1x11x768xf32, 1x11xf32, 1x11xf32) <- (1x11x768xf32, 768xf32, 768xf32)
+        layer_norm_72, layer_norm_73, layer_norm_74 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_110, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_110, parameter_17, parameter_18
+
+        # pd_op.slice: (1x768xf32) <- (1x11x768xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            layer_norm_72, [1], full_int_array_2, full_int_array_0, [1], [1]
+        )
+        del full_int_array_0, full_int_array_2
+
+        # pd_op.matmul: (1x768xf32) <- (1x768xf32, 768x768xf32)
+        matmul_97 = paddle._C_ops.matmul(slice_1, parameter_2, False, False)
+        del parameter_2, slice_1
+
+        # pd_op.add: (1x768xf32) <- (1x768xf32, 768xf32)
+        add_111 = paddle._C_ops.add(matmul_97, parameter_1)
+        del matmul_97, parameter_1
+
+        # pd_op.tanh: (1x768xf32) <- (1x768xf32)
+        tanh_0 = paddle._C_ops.tanh(add_111)
+        del add_111, layer_norm_72
+
+        return tanh_0
diff --git a/paddle_samples/PaddleNLP/albert-chinese-base/weight_meta.py b/paddle_samples/PaddleNLP/albert-chinese-base/weight_meta.py
new file mode 100644
index 000000000..d9bd49e44
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-base/weight_meta.py
@@ -0,0 +1,235 @@
+class Program_weight_tensor_parameter_0:  # metadata record for the tensor named "parameter_0"
+    name = "parameter_0"
+    shape = [1, 512]
+    dtype = "int64"
+    min_val = 0  # NOTE(review): min_val/max_val are presumably the smallest/largest stored value (0..511 would fit an index range) - confirm against the generator
+    max_val = 511
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_1:  # metadata record for the tensor named "parameter_1"
+    name = "parameter_1"
+    shape = [768]  # NOTE(review): presumably the term added after the parameter_2 matmul, right before tanh, in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_2:  # metadata record for the tensor named "parameter_2"
+    name = "parameter_2"
+    shape = [768, 768]  # NOTE(review): presumably the matmul operand applied to the sliced layer_norm output before tanh in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.0892679")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0906134")
+    mean = float("-3.00388e-06")
+    std = float("0.0199841")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_3:  # metadata record for the tensor named "parameter_3"
+    name = "parameter_3"
+    shape = [768]  # NOTE(review): presumably the term added after the parameter_4 matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_4:  # metadata record for the tensor named "parameter_4"
+    name = "parameter_4"
+    shape = [3072, 768]  # NOTE(review): presumably the matmul operand applied to the relu output (3072 -> 768) in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.101152")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0999676")
+    mean = float("1.01817e-05")
+    std = float("0.019991")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_5:  # metadata record for the tensor named "parameter_5"
+    name = "parameter_5"
+    shape = [3072]  # NOTE(review): presumably the term added after the parameter_6 matmul, before relu, in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_6:  # metadata record for the tensor named "parameter_6"
+    name = "parameter_6"
+    shape = [768, 3072]  # NOTE(review): presumably the matmul operand applied to a layer_norm output (768 -> 3072) in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.0942874")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0995946")
+    mean = float("2.76684e-07")
+    std = float("0.0199973")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_7:  # metadata record for the tensor named "parameter_7"
+    name = "parameter_7"
+    shape = [768]  # NOTE(review): presumably the third positional argument (after parameter_8) of a layer_norm call in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_8:  # metadata record for the tensor named "parameter_8"
+    name = "parameter_8"
+    shape = [768]  # NOTE(review): presumably the second positional argument (before parameter_7) of a layer_norm call in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("1.0")  # min_val/max_val/mean are all 1.0 and no std field is recorded; presumably value statistics - confirm against the generator
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_9:  # metadata record for the tensor named "parameter_9"
+    name = "parameter_9"
+    shape = [768]  # NOTE(review): presumably the term added after the parameter_10 matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_10:  # metadata record for the tensor named "parameter_10"
+    name = "parameter_10"
+    shape = [768, 768]  # NOTE(review): presumably the matmul operand applied to the attention output after it is reshaped back to 1x11x768 in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.0992178")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0932706")
+    mean = float("2.48208e-05")
+    std = float("0.0200258")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_11:  # metadata record for the tensor named "parameter_11"
+    name = "parameter_11"
+    shape = [768]  # NOTE(review): presumably the term added after the parameter_12 matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_12:  # metadata record for the tensor named "parameter_12"
+    name = "parameter_12"
+    shape = [768, 768]  # NOTE(review): presumably the projection whose output becomes the second operand of the post-softmax matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.0932884")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0917139")
+    mean = float("1.4562e-05")
+    std = float("0.0200167")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_13:  # metadata record for the tensor named "parameter_13"
+    name = "parameter_13"
+    shape = [768]  # NOTE(review): presumably the term added after the parameter_14 matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_14:  # metadata record for the tensor named "parameter_14"
+    name = "parameter_14"
+    shape = [768, 768]  # NOTE(review): presumably the projection whose output is the transposed (second) operand of the attention-score matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.0930002")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0961642")
+    mean = float("-5.85622e-05")
+    std = float("0.019987")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_15:  # metadata record for the tensor named "parameter_15"
+    name = "parameter_15"
+    shape = [768]  # NOTE(review): presumably the term added after the parameter_16 matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_16:  # metadata record for the tensor named "parameter_16"
+    name = "parameter_16"
+    shape = [768, 768]  # NOTE(review): presumably the projection whose output is the first operand of the attention-score matmul in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("-0.0916871")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.089794")
+    mean = float("-3.99677e-05")
+    std = float("0.0200198")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_17:  # metadata record for the tensor named "parameter_17"
+    name = "parameter_17"
+    shape = [768]  # NOTE(review): presumably the third positional argument (after parameter_18) of a layer_norm call in the accompanying model.py - confirm
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_18:  # metadata record for the tensor named "parameter_18"
+    name = "parameter_18"
+    shape = [768]  # NOTE(review): presumably the second positional argument (before parameter_17) of a layer_norm call in the accompanying model.py - confirm
+    dtype = "float32"
+    min_val = float("1.0")  # min_val/max_val/mean are all 1.0 and no std field is recorded; presumably value statistics - confirm against the generator
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_19:  # metadata record for the tensor named "parameter_19"
+    name = "parameter_19"
+    shape = [768]
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_20:  # metadata record for the tensor named "parameter_20"
+    name = "parameter_20"
+    shape = [128, 768]
+    dtype = "float32"
+    min_val = float("-0.0860205")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0973591")
+    mean = float("2.12175e-05")
+    std = float("0.0200251")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_21:  # metadata record for the tensor named "parameter_21"
+    name = "parameter_21"
+    shape = [128]
+    dtype = "float32"
+    data = None  # no literal values are embedded; this record also carries no min/max/mean/std fields
+
+
+class Program_weight_tensor_parameter_22:  # metadata record for the tensor named "parameter_22"
+    name = "parameter_22"
+    shape = [128]
+    dtype = "float32"
+    min_val = float("1.0")  # min_val/max_val/mean are all 1.0 and no std field is recorded; presumably value statistics - confirm against the generator
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_23:  # metadata record for the tensor named "parameter_23"
+    name = "parameter_23"
+    shape = [2, 128]
+    dtype = "float32"
+    min_val = float("-0.0590662")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0505173")
+    mean = float("-0.000780354")
+    std = float("0.0203115")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_24:  # metadata record for the tensor named "parameter_24"
+    name = "parameter_24"
+    shape = [512, 128]
+    dtype = "float32"
+    min_val = float("-0.0905854")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0949802")
+    mean = float("1.62754e-05")
+    std = float("0.019975")
+    data = None  # no literal values are embedded in this record
+
+
+class Program_weight_tensor_parameter_25:  # metadata record for the tensor named "parameter_25"
+    name = "parameter_25"
+    shape = [21128, 128]
+    dtype = "float32"
+    min_val = float("-0.0995291")  # min_val/max_val/mean/std: presumably summary statistics of the tensor's values - confirm against the generator
+    max_val = float("0.0971798")
+    mean = float("-5.40105e-06")
+    std = float("0.0200164")
+    data = None  # no literal values are embedded in this record
diff --git a/paddle_samples/PaddleNLP/albert-chinese-small/graph_net.json b/paddle_samples/PaddleNLP/albert-chinese-small/graph_net.json
new file mode 100644
index 000000000..aac3cc2f4
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-small/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "albert-chinese-small",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/albert-chinese-small/input_meta.py b/paddle_samples/PaddleNLP/albert-chinese-small/input_meta.py
new file mode 100644
index 000000000..3708564f7
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-small/input_meta.py
@@ -0,0 +1,19 @@
+class Program_weight_tensor_data_0:  # metadata record for the graph input named "data_0"
+    name = "data_0"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [101, 3614, 6816, 886, 4500, 4636, 2428, 7607, 3444, 106, 102]  # 11 literal values listed flat; model.py in this directory passes data_0 as the index tensor of an embedding lookup into parameter_25
+
+
+class Program_weight_tensor_data_1:  # metadata record for the graph input named "data_1"
+    name = "data_1"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # 11 literal values listed flat; model.py in this directory unsqueezes data_1 twice, casts it to float32, scales it by -1 with offset 1, then by -10000, and adds the result to the attention scores
+
+
+class Program_weight_tensor_data_2:  # metadata record for the graph input named "data_2"
+    name = "data_2"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # 11 literal values listed flat; model.py in this directory passes data_2 as the index tensor of an embedding lookup into parameter_23
diff --git a/paddle_samples/PaddleNLP/albert-chinese-small/model.py b/paddle_samples/PaddleNLP/albert-chinese-small/model.py
new file mode 100644
index 000000000..aac4bc18d
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-small/model.py
@@ -0,0 +1,914 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):  # only runs the base-class initializer; no parameters or sub-layers are created here
+        super().__init__()  # forward() receives every parameter_* tensor as an explicit argument instead
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [1]
+
+        # pd_op.unsqueeze: (1x1x11xi64) <- (1x11xi64, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(data_1, full_int_array_0)
+        del data_1
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [2]
+
+        # pd_op.unsqueeze: (1x1x1x11xi64) <- (1x1x11xi64, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.cast: (1x1x1x11xf32) <- (1x1x1x11xi64)
+        cast_0 = paddle._C_ops.cast(unsqueeze_1, paddle.float32)
+        del unsqueeze_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x11xf32) <- (1x1x1x11xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0, full_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x11xf32) <- (1x1x1x11xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(scale_0, full_1, float("0"), True)
+        del full_1, scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_2 = [0]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [11]
+
+        # pd_op.slice: (1x11xi64) <- (1x512xi64, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            parameter_0, [1], full_int_array_2, full_int_array_3, [1], []
+        )
+        del full_int_array_3, parameter_0
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 21128x128xf32)
+        embedding_0 = paddle._C_ops.embedding(data_0, parameter_25, 0, False)
+        del data_0, parameter_25
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 2x128xf32)
+        embedding_1 = paddle._C_ops.embedding(data_2, parameter_23, -1, False)
+        del data_2, parameter_23
+
+        # pd_op.add: (1x11x128xf32) <- (1x11x128xf32, 1x11x128xf32)
+        add_0 = paddle._C_ops.add(embedding_0, embedding_1)
+        del embedding_0, embedding_1
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 512x128xf32)
+        embedding_2 = paddle._C_ops.embedding(slice_0, parameter_24, -1, False)
+        del parameter_24, slice_0
+
+        # pd_op.add: (1x11x128xf32) <- (1x11x128xf32, 1x11x128xf32)
+        add_1 = paddle._C_ops.add(add_0, embedding_2)
+        del add_0, embedding_2
+
+        # pd_op.layer_norm: (1x11x128xf32, 1x11xf32, 1x11xf32) <- (1x11x128xf32, 128xf32, 128xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_1, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_1, parameter_21, parameter_22
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x128xf32, 128x384xf32)
+        matmul_0 = paddle._C_ops.matmul(layer_norm_0, parameter_20, False, False)
+        del layer_norm_0, parameter_20
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_2 = paddle._C_ops.add(matmul_0, parameter_19)
+        del matmul_0, parameter_19
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_1 = paddle._C_ops.matmul(add_2, parameter_16, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_3 = paddle._C_ops.add(matmul_1, parameter_15)
+        del matmul_1
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_2 = paddle._C_ops.matmul(add_2, parameter_14, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_4 = paddle._C_ops.add(matmul_2, parameter_13)
+        del matmul_2
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_3 = paddle._C_ops.matmul(add_2, parameter_12, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_5 = paddle._C_ops.add(matmul_3, parameter_11)
+        del matmul_3
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_4 = [1, 11, 12, 32]
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(add_3, full_int_array_4)
+        del add_3
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_0, [0, 2, 1, 3])
+        del reshape_0
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(add_4, full_int_array_4)
+        del add_4
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(add_5, full_int_array_4)
+        del add_5
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x32xf32, 1x12x11x32xf32)
+        matmul_4 = paddle._C_ops.matmul(transpose_0, transpose_1, False, True)
+        del transpose_0, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("0.176777"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(matmul_4, full_2, float("0"), True)
+        del matmul_4
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_6 = paddle._C_ops.add(scale_2, scale_1)
+        del scale_2
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_0 = paddle._C_ops.softmax(add_6, -1)
+        del add_6
+
+        # pd_op.matmul: (1x12x11x32xf32) <- (1x12x11x11xf32, 1x12x11x32xf32)
+        matmul_5 = paddle._C_ops.matmul(softmax_0, transpose_2, False, False)
+        del softmax_0, transpose_2
+
+        # pd_op.transpose: (1x11x12x32xf32) <- (1x12x11x32xf32)
+        transpose_3 = paddle._C_ops.transpose(matmul_5, [0, 2, 1, 3])
+        del matmul_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_5 = [0, 0, -1]
+
+        # pd_op.reshape: (1x11x384xf32) <- (1x11x12x32xf32, 3xi64)
+        reshape_3 = paddle._C_ops.reshape(transpose_3, full_int_array_5)
+        del transpose_3
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_6 = paddle._C_ops.matmul(reshape_3, parameter_10, False, False)
+        del reshape_3
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_7 = paddle._C_ops.add(matmul_6, parameter_9)
+        del matmul_6
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_8 = paddle._C_ops.add(add_2, add_7)
+        del add_2, add_7
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_8, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_8
+
+        # pd_op.matmul: (1x11x1536xf32) <- (1x11x384xf32, 384x1536xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1536xf32) <- (1x11x1536xf32, 1536xf32)
+        add_9 = paddle._C_ops.add(matmul_7, parameter_5)
+        del matmul_7
+
+        # pd_op.gelu: (1x11x1536xf32) <- (1x11x1536xf32)
+        gelu_0 = paddle._C_ops.gelu(add_9, False)
+        del add_9
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x1536xf32, 1536x384xf32)
+        matmul_8 = paddle._C_ops.matmul(gelu_0, parameter_4, False, False)
+        del gelu_0
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_10 = paddle._C_ops.add(matmul_8, parameter_3)
+        del matmul_8
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_11 = paddle._C_ops.add(add_10, layer_norm_3)
+        del add_10, layer_norm_3
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_11, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_11
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_9 = paddle._C_ops.matmul(layer_norm_6, parameter_16, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_12 = paddle._C_ops.add(matmul_9, parameter_15)
+        del matmul_9
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_14, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_13 = paddle._C_ops.add(matmul_10, parameter_13)
+        del matmul_10
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_11 = paddle._C_ops.matmul(layer_norm_6, parameter_12, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_14 = paddle._C_ops.add(matmul_11, parameter_11)
+        del matmul_11
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(add_12, full_int_array_4)
+        del add_12
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_4 = paddle._C_ops.transpose(reshape_4, [0, 2, 1, 3])
+        del reshape_4
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(add_13, full_int_array_4)
+        del add_13
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(add_14, full_int_array_4)
+        del add_14
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x32xf32, 1x12x11x32xf32)
+        matmul_12 = paddle._C_ops.matmul(transpose_4, transpose_5, False, True)
+        del transpose_4, transpose_5
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(matmul_12, full_2, float("0"), True)
+        del matmul_12
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_15 = paddle._C_ops.add(scale_3, scale_1)
+        del scale_3
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_1 = paddle._C_ops.softmax(add_15, -1)
+        del add_15
+
+        # pd_op.matmul: (1x12x11x32xf32) <- (1x12x11x11xf32, 1x12x11x32xf32)
+        matmul_13 = paddle._C_ops.matmul(softmax_1, transpose_6, False, False)
+        del softmax_1, transpose_6
+
+        # pd_op.transpose: (1x11x12x32xf32) <- (1x12x11x32xf32)
+        transpose_7 = paddle._C_ops.transpose(matmul_13, [0, 2, 1, 3])
+        del matmul_13
+
+        # pd_op.reshape: (1x11x384xf32) <- (1x11x12x32xf32, 3xi64)
+        reshape_7 = paddle._C_ops.reshape(transpose_7, full_int_array_5)
+        del transpose_7
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_14 = paddle._C_ops.matmul(reshape_7, parameter_10, False, False)
+        del reshape_7
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_16 = paddle._C_ops.add(matmul_14, parameter_9)
+        del matmul_14
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_17 = paddle._C_ops.add(layer_norm_6, add_16)
+        del add_16, layer_norm_6
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_17, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_17
+
+        # pd_op.matmul: (1x11x1536xf32) <- (1x11x384xf32, 384x1536xf32)
+        matmul_15 = paddle._C_ops.matmul(layer_norm_9, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1536xf32) <- (1x11x1536xf32, 1536xf32)
+        add_18 = paddle._C_ops.add(matmul_15, parameter_5)
+        del matmul_15
+
+        # pd_op.gelu: (1x11x1536xf32) <- (1x11x1536xf32)
+        gelu_1 = paddle._C_ops.gelu(add_18, False)
+        del add_18
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x1536xf32, 1536x384xf32)
+        matmul_16 = paddle._C_ops.matmul(gelu_1, parameter_4, False, False)
+        del gelu_1
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_19 = paddle._C_ops.add(matmul_16, parameter_3)
+        del matmul_16
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_20 = paddle._C_ops.add(add_19, layer_norm_9)
+        del add_19, layer_norm_9
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_20, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_20
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_17 = paddle._C_ops.matmul(layer_norm_12, parameter_16, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_21 = paddle._C_ops.add(matmul_17, parameter_15)
+        del matmul_17
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_12, parameter_14, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_22 = paddle._C_ops.add(matmul_18, parameter_13)
+        del matmul_18
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_12, parameter_12, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_23 = paddle._C_ops.add(matmul_19, parameter_11)
+        del matmul_19
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(add_21, full_int_array_4)
+        del add_21
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_8 = paddle._C_ops.transpose(reshape_8, [0, 2, 1, 3])
+        del reshape_8
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(add_22, full_int_array_4)
+        del add_22
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(add_23, full_int_array_4)
+        del add_23
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x32xf32, 1x12x11x32xf32)
+        matmul_20 = paddle._C_ops.matmul(transpose_8, transpose_9, False, True)
+        del transpose_8, transpose_9
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(matmul_20, full_2, float("0"), True)
+        del matmul_20
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_24 = paddle._C_ops.add(scale_4, scale_1)
+        del scale_4
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_2 = paddle._C_ops.softmax(add_24, -1)
+        del add_24
+
+        # pd_op.matmul: (1x12x11x32xf32) <- (1x12x11x11xf32, 1x12x11x32xf32)
+        matmul_21 = paddle._C_ops.matmul(softmax_2, transpose_10, False, False)
+        del softmax_2, transpose_10
+
+        # pd_op.transpose: (1x11x12x32xf32) <- (1x12x11x32xf32)
+        transpose_11 = paddle._C_ops.transpose(matmul_21, [0, 2, 1, 3])
+        del matmul_21
+
+        # pd_op.reshape: (1x11x384xf32) <- (1x11x12x32xf32, 3xi64)
+        reshape_11 = paddle._C_ops.reshape(transpose_11, full_int_array_5)
+        del transpose_11
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_22 = paddle._C_ops.matmul(reshape_11, parameter_10, False, False)
+        del reshape_11
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_25 = paddle._C_ops.add(matmul_22, parameter_9)
+        del matmul_22
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_26 = paddle._C_ops.add(layer_norm_12, add_25)
+        del add_25, layer_norm_12
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_26, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_26
+
+        # pd_op.matmul: (1x11x1536xf32) <- (1x11x384xf32, 384x1536xf32)
+        matmul_23 = paddle._C_ops.matmul(layer_norm_15, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1536xf32) <- (1x11x1536xf32, 1536xf32)
+        add_27 = paddle._C_ops.add(matmul_23, parameter_5)
+        del matmul_23
+
+        # pd_op.gelu: (1x11x1536xf32) <- (1x11x1536xf32)
+        gelu_2 = paddle._C_ops.gelu(add_27, False)
+        del add_27
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x1536xf32, 1536x384xf32)
+        matmul_24 = paddle._C_ops.matmul(gelu_2, parameter_4, False, False)
+        del gelu_2
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_28 = paddle._C_ops.add(matmul_24, parameter_3)
+        del matmul_24
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_29 = paddle._C_ops.add(add_28, layer_norm_15)
+        del add_28, layer_norm_15
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_29, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_29
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_18, parameter_16, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_30 = paddle._C_ops.add(matmul_25, parameter_15)
+        del matmul_25
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_18, parameter_14, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_31 = paddle._C_ops.add(matmul_26, parameter_13)
+        del matmul_26
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_27 = paddle._C_ops.matmul(layer_norm_18, parameter_12, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_32 = paddle._C_ops.add(matmul_27, parameter_11)
+        del matmul_27
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(add_30, full_int_array_4)
+        del add_30
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_12 = paddle._C_ops.transpose(reshape_12, [0, 2, 1, 3])
+        del reshape_12
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(add_31, full_int_array_4)
+        del add_31
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(add_32, full_int_array_4)
+        del add_32
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x32xf32, 1x12x11x32xf32)
+        matmul_28 = paddle._C_ops.matmul(transpose_12, transpose_13, False, True)
+        del transpose_12, transpose_13
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(matmul_28, full_2, float("0"), True)
+        del matmul_28
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_33 = paddle._C_ops.add(scale_5, scale_1)
+        del scale_5
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_3 = paddle._C_ops.softmax(add_33, -1)
+        del add_33
+
+        # pd_op.matmul: (1x12x11x32xf32) <- (1x12x11x11xf32, 1x12x11x32xf32)
+        matmul_29 = paddle._C_ops.matmul(softmax_3, transpose_14, False, False)
+        del softmax_3, transpose_14
+
+        # pd_op.transpose: (1x11x12x32xf32) <- (1x12x11x32xf32)
+        transpose_15 = paddle._C_ops.transpose(matmul_29, [0, 2, 1, 3])
+        del matmul_29
+
+        # pd_op.reshape: (1x11x384xf32) <- (1x11x12x32xf32, 3xi64)
+        reshape_15 = paddle._C_ops.reshape(transpose_15, full_int_array_5)
+        del transpose_15
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_30 = paddle._C_ops.matmul(reshape_15, parameter_10, False, False)
+        del reshape_15
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_34 = paddle._C_ops.add(matmul_30, parameter_9)
+        del matmul_30
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_35 = paddle._C_ops.add(layer_norm_18, add_34)
+        del add_34, layer_norm_18
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_35, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_35
+
+        # pd_op.matmul: (1x11x1536xf32) <- (1x11x384xf32, 384x1536xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_21, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1536xf32) <- (1x11x1536xf32, 1536xf32)
+        add_36 = paddle._C_ops.add(matmul_31, parameter_5)
+        del matmul_31
+
+        # pd_op.gelu: (1x11x1536xf32) <- (1x11x1536xf32)
+        gelu_3 = paddle._C_ops.gelu(add_36, False)
+        del add_36
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x1536xf32, 1536x384xf32)
+        matmul_32 = paddle._C_ops.matmul(gelu_3, parameter_4, False, False)
+        del gelu_3
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_37 = paddle._C_ops.add(matmul_32, parameter_3)
+        del matmul_32
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_38 = paddle._C_ops.add(add_37, layer_norm_21)
+        del add_37, layer_norm_21
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_38, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_38
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_33 = paddle._C_ops.matmul(layer_norm_24, parameter_16, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_39 = paddle._C_ops.add(matmul_33, parameter_15)
+        del matmul_33
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_34 = paddle._C_ops.matmul(layer_norm_24, parameter_14, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_40 = paddle._C_ops.add(matmul_34, parameter_13)
+        del matmul_34
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_35 = paddle._C_ops.matmul(layer_norm_24, parameter_12, False, False)
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_41 = paddle._C_ops.add(matmul_35, parameter_11)
+        del matmul_35
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(add_39, full_int_array_4)
+        del add_39
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_16 = paddle._C_ops.transpose(reshape_16, [0, 2, 1, 3])
+        del reshape_16
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(add_40, full_int_array_4)
+        del add_40
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_17 = paddle._C_ops.transpose(reshape_17, [0, 2, 1, 3])
+        del reshape_17
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(add_41, full_int_array_4)
+        del add_41
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_18 = paddle._C_ops.transpose(reshape_18, [0, 2, 1, 3])
+        del reshape_18
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x32xf32, 1x12x11x32xf32)
+        matmul_36 = paddle._C_ops.matmul(transpose_16, transpose_17, False, True)
+        del transpose_16, transpose_17
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(matmul_36, full_2, float("0"), True)
+        del matmul_36
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_42 = paddle._C_ops.add(scale_6, scale_1)
+        del scale_6
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_4 = paddle._C_ops.softmax(add_42, -1)
+        del add_42
+
+        # pd_op.matmul: (1x12x11x32xf32) <- (1x12x11x11xf32, 1x12x11x32xf32)
+        matmul_37 = paddle._C_ops.matmul(softmax_4, transpose_18, False, False)
+        del softmax_4, transpose_18
+
+        # pd_op.transpose: (1x11x12x32xf32) <- (1x12x11x32xf32)
+        transpose_19 = paddle._C_ops.transpose(matmul_37, [0, 2, 1, 3])
+        del matmul_37
+
+        # pd_op.reshape: (1x11x384xf32) <- (1x11x12x32xf32, 3xi64)
+        reshape_19 = paddle._C_ops.reshape(transpose_19, full_int_array_5)
+        del transpose_19
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_38 = paddle._C_ops.matmul(reshape_19, parameter_10, False, False)
+        del reshape_19
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_43 = paddle._C_ops.add(matmul_38, parameter_9)
+        del matmul_38
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_44 = paddle._C_ops.add(layer_norm_24, add_43)
+        del add_43, layer_norm_24
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_27, layer_norm_28, layer_norm_29 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_44, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_44
+
+        # pd_op.matmul: (1x11x1536xf32) <- (1x11x384xf32, 384x1536xf32)
+        matmul_39 = paddle._C_ops.matmul(layer_norm_27, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1536xf32) <- (1x11x1536xf32, 1536xf32)
+        add_45 = paddle._C_ops.add(matmul_39, parameter_5)
+        del matmul_39
+
+        # pd_op.gelu: (1x11x1536xf32) <- (1x11x1536xf32)
+        gelu_4 = paddle._C_ops.gelu(add_45, False)
+        del add_45
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x1536xf32, 1536x384xf32)
+        matmul_40 = paddle._C_ops.matmul(gelu_4, parameter_4, False, False)
+        del gelu_4
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_46 = paddle._C_ops.add(matmul_40, parameter_3)
+        del matmul_40
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_47 = paddle._C_ops.add(add_46, layer_norm_27)
+        del add_46, layer_norm_27
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_30, layer_norm_31, layer_norm_32 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_47, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_47
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_41 = paddle._C_ops.matmul(layer_norm_30, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_48 = paddle._C_ops.add(matmul_41, parameter_15)
+        del matmul_41, parameter_15
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_42 = paddle._C_ops.matmul(layer_norm_30, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_49 = paddle._C_ops.add(matmul_42, parameter_13)
+        del matmul_42, parameter_13
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_43 = paddle._C_ops.matmul(layer_norm_30, parameter_12, False, False)
+        del parameter_12
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_50 = paddle._C_ops.add(matmul_43, parameter_11)
+        del matmul_43, parameter_11
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_20 = paddle._C_ops.reshape(add_48, full_int_array_4)
+        del add_48
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_20 = paddle._C_ops.transpose(reshape_20, [0, 2, 1, 3])
+        del reshape_20
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(add_49, full_int_array_4)
+        del add_49
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_21 = paddle._C_ops.transpose(reshape_21, [0, 2, 1, 3])
+        del reshape_21
+
+        # pd_op.reshape: (1x11x12x32xf32) <- (1x11x384xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(add_50, full_int_array_4)
+        del add_50, full_int_array_4
+
+        # pd_op.transpose: (1x12x11x32xf32) <- (1x11x12x32xf32)
+        transpose_22 = paddle._C_ops.transpose(reshape_22, [0, 2, 1, 3])
+        del reshape_22
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x32xf32, 1x12x11x32xf32)
+        matmul_44 = paddle._C_ops.matmul(transpose_20, transpose_21, False, True)
+        del transpose_20, transpose_21
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(matmul_44, full_2, float("0"), True)
+        del full_2, matmul_44
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_51 = paddle._C_ops.add(scale_7, scale_1)
+        del scale_1, scale_7
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_5 = paddle._C_ops.softmax(add_51, -1)
+        del add_51
+
+        # pd_op.matmul: (1x12x11x32xf32) <- (1x12x11x11xf32, 1x12x11x32xf32)
+        matmul_45 = paddle._C_ops.matmul(softmax_5, transpose_22, False, False)
+        del softmax_5, transpose_22
+
+        # pd_op.transpose: (1x11x12x32xf32) <- (1x12x11x32xf32)
+        transpose_23 = paddle._C_ops.transpose(matmul_45, [0, 2, 1, 3])
+        del matmul_45
+
+        # pd_op.reshape: (1x11x384xf32) <- (1x11x12x32xf32, 3xi64)
+        reshape_23 = paddle._C_ops.reshape(transpose_23, full_int_array_5)
+        del full_int_array_5, transpose_23
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x384xf32, 384x384xf32)
+        matmul_46 = paddle._C_ops.matmul(reshape_23, parameter_10, False, False)
+        del parameter_10, reshape_23
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_52 = paddle._C_ops.add(matmul_46, parameter_9)
+        del matmul_46, parameter_9
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_53 = paddle._C_ops.add(layer_norm_30, add_52)
+        del add_52, layer_norm_30
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_33, layer_norm_34, layer_norm_35 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_53, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_53, parameter_7, parameter_8
+
+        # pd_op.matmul: (1x11x1536xf32) <- (1x11x384xf32, 384x1536xf32)
+        matmul_47 = paddle._C_ops.matmul(layer_norm_33, parameter_6, False, False)
+        del parameter_6
+
+        # pd_op.add: (1x11x1536xf32) <- (1x11x1536xf32, 1536xf32)
+        add_54 = paddle._C_ops.add(matmul_47, parameter_5)
+        del matmul_47, parameter_5
+
+        # pd_op.gelu: (1x11x1536xf32) <- (1x11x1536xf32)
+        gelu_5 = paddle._C_ops.gelu(add_54, False)
+        del add_54
+
+        # pd_op.matmul: (1x11x384xf32) <- (1x11x1536xf32, 1536x384xf32)
+        matmul_48 = paddle._C_ops.matmul(gelu_5, parameter_4, False, False)
+        del gelu_5, parameter_4
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 384xf32)
+        add_55 = paddle._C_ops.add(matmul_48, parameter_3)
+        del matmul_48, parameter_3
+
+        # pd_op.add: (1x11x384xf32) <- (1x11x384xf32, 1x11x384xf32)
+        add_56 = paddle._C_ops.add(add_55, layer_norm_33)
+        del add_55, layer_norm_33
+
+        # pd_op.layer_norm: (1x11x384xf32, 1x11xf32, 1x11xf32) <- (1x11x384xf32, 384xf32, 384xf32)
+        layer_norm_36, layer_norm_37, layer_norm_38 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_56, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_56, parameter_17, parameter_18
+
+        # pd_op.slice: (1x384xf32) <- (1x11x384xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            layer_norm_36, [1], full_int_array_2, full_int_array_0, [1], [1]
+        )
+        del full_int_array_0, full_int_array_2
+
+        # pd_op.matmul: (1x384xf32) <- (1x384xf32, 384x384xf32)
+        matmul_49 = paddle._C_ops.matmul(slice_1, parameter_2, False, False)
+        del parameter_2, slice_1
+
+        # pd_op.add: (1x384xf32) <- (1x384xf32, 384xf32)
+        add_57 = paddle._C_ops.add(matmul_49, parameter_1)
+        del matmul_49, parameter_1
+
+        # pd_op.tanh: (1x384xf32) <- (1x384xf32)
+        tanh_0 = paddle._C_ops.tanh(add_57)
+        del add_57, layer_norm_36
+
+        return tanh_0
diff --git a/paddle_samples/PaddleNLP/albert-chinese-small/weight_meta.py b/paddle_samples/PaddleNLP/albert-chinese-small/weight_meta.py
new file mode 100644
index 000000000..a92a1fce1
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-small/weight_meta.py
@@ -0,0 +1,237 @@
+class Program_weight_tensor_parameter_0:  # Metadata record for the int64 tensor "parameter_0"; describes the tensor but stores no values.
+    name = "parameter_0"
+    shape = [1, 512]
+    dtype = "int64"
+    min_val = 0  # presumably the smallest element value — TODO confirm against the tool that generated this file
+    max_val = 511  # presumably the largest element value (equals shape[1] - 1); NOTE(review): looks like an index range such as position ids — confirm in model.py
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_1:  # Metadata record for the float32 vector "parameter_1"; no statistics or values are recorded.
+    name = "parameter_1"
+    shape = [384]  # NOTE(review): the forward() hunk preceding this file adds parameter_1 to the parameter_2 matmul result right before tanh — confirm that hunk is this model's model.py
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_2:  # Metadata record for the float32 matrix "parameter_2"; describes the tensor but stores no values.
+    name = "parameter_2"
+    shape = [384, 384]  # NOTE(review): the forward() hunk preceding this file matmuls a (1x384) slice with parameter_2, then adds parameter_1 and applies tanh — confirm same model
+    dtype = "float32"
+    min_val = float("-0.0920974")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0898767")
+    mean = float("2.2093e-05")
+    std = float("0.0200502")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_3:  # Metadata record for the float32 vector "parameter_3"; no statistics or values are recorded.
+    name = "parameter_3"
+    shape = [384]  # NOTE(review): the forward() hunk preceding this file adds parameter_3 to the result of the parameter_4 matmul — confirm same model
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_4:  # Metadata record for the float32 matrix "parameter_4"; describes the tensor but stores no values.
+    name = "parameter_4"
+    shape = [1536, 384]  # NOTE(review): the forward() hunk preceding this file matmuls the gelu output (1x11x1536) with parameter_4 — confirm same model
+    dtype = "float32"
+    min_val = float("-0.0904018")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0969833")
+    mean = float("-3.42421e-05")
+    std = float("0.0199846")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_5:  # Metadata record for the float32 vector "parameter_5"; no statistics or values are recorded.
+    name = "parameter_5"
+    shape = [1536]  # NOTE(review): the forward() hunk preceding this file adds parameter_5 to the parameter_6 matmul result before gelu — confirm same model
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_6:  # Metadata record for the float32 matrix "parameter_6"; describes the tensor but stores no values.
+    name = "parameter_6"
+    shape = [384, 1536]  # NOTE(review): the forward() hunk preceding this file matmuls a layer_norm output (1x11x384) with parameter_6 — confirm same model
+    dtype = "float32"
+    min_val = float("-0.101128")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0953021")
+    mean = float("-3.2029e-05")
+    std = float("0.0200196")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_7:  # Metadata record for the float32 vector "parameter_7"; no statistics or values are recorded.
+    name = "parameter_7"
+    shape = [384]  # NOTE(review): passed to layer_norm directly after parameter_8 in the forward() hunk preceding this file; presumably the bias term — confirm against the layer_norm signature
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_8:  # Metadata record for the float32 vector "parameter_8"; describes the tensor but stores no values.
+    name = "parameter_8"
+    shape = [384]  # NOTE(review): passed to layer_norm directly after the input tensor in the forward() hunk preceding this file; presumably the scale term — confirm against the layer_norm signature
+    dtype = "float32"
+    min_val = float("1.0")  # min_val, max_val and mean are all 1.0 here; presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("1.0")
+    mean = float("1.0")
+    std = float("5.96046e-08")  # tiny non-zero std alongside identical min/max; presumably float32 rounding in the statistic — TODO confirm
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_9:  # Metadata record for the float32 vector "parameter_9"; no statistics or values are recorded.
+    name = "parameter_9"
+    shape = [384]  # NOTE(review): the forward() hunk preceding this file adds parameter_9 to the result of the parameter_10 matmul — confirm same model
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_10:  # Metadata record for the float32 matrix "parameter_10"; describes the tensor but stores no values.
+    name = "parameter_10"
+    shape = [384, 384]  # NOTE(review): the forward() hunk preceding this file matmuls the attention output, reshaped to 1x11x384, with parameter_10 — confirm same model
+    dtype = "float32"
+    min_val = float("-0.090305")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0853635")
+    mean = float("-3.53809e-05")
+    std = float("0.0199774")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_11:  # Metadata record for the float32 vector "parameter_11"; no statistics or values are recorded.
+    name = "parameter_11"
+    shape = [384]  # NOTE(review): the forward() hunk preceding this file adds parameter_11 to the result of the parameter_12 matmul — confirm same model
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_12:  # Metadata record for the float32 matrix "parameter_12"; describes the tensor but stores no values.
+    name = "parameter_12"
+    shape = [384, 384]  # NOTE(review): in the forward() hunk preceding this file, this projection's output is the tensor multiplied by the softmax result (value-like role) — confirm same model
+    dtype = "float32"
+    min_val = float("-0.0886189")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0784958")
+    mean = float("6.89041e-05")
+    std = float("0.0199391")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_13:  # Metadata record for the float32 vector "parameter_13"; no statistics or values are recorded.
+    name = "parameter_13"
+    shape = [384]  # NOTE(review): the forward() hunk preceding this file adds parameter_13 to the result of the parameter_14 matmul — confirm same model
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_14:  # Metadata record for the float32 matrix "parameter_14"; describes the tensor but stores no values.
+    name = "parameter_14"
+    shape = [384, 384]  # NOTE(review): in the forward() hunk preceding this file, this projection's output is the transposed right operand of the attention-score matmul (key-like role) — confirm same model
+    dtype = "float32"
+    min_val = float("-0.085532")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0893973")
+    mean = float("1.21036e-05")
+    std = float("0.0199489")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_15:  # Metadata record for the float32 vector "parameter_15"; no statistics or values are recorded.
+    name = "parameter_15"
+    shape = [384]  # NOTE(review): the forward() hunk preceding this file adds parameter_15 to the result of the parameter_16 matmul — confirm same model
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_16:  # Metadata record for the float32 matrix "parameter_16"; describes the tensor but stores no values.
+    name = "parameter_16"
+    shape = [384, 384]  # NOTE(review): in the forward() hunk preceding this file, this projection's output is the left operand of the attention-score matmul (query-like role) — confirm same model
+    dtype = "float32"
+    min_val = float("-0.082137")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0837528")
+    mean = float("3.60444e-05")
+    std = float("0.0199884")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_17:  # Metadata record for the float32 vector "parameter_17"; no statistics or values are recorded.
+    name = "parameter_17"
+    shape = [384]  # NOTE(review): passed to layer_norm directly after parameter_18 in the forward() hunk preceding this file; presumably the bias term — confirm against the layer_norm signature
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_18:  # Metadata record for the float32 vector "parameter_18"; describes the tensor but stores no values.
+    name = "parameter_18"
+    shape = [384]  # NOTE(review): passed to layer_norm directly after the input tensor in the forward() hunk preceding this file; presumably the scale term — confirm against the layer_norm signature
+    dtype = "float32"
+    min_val = float("1.0")  # min_val, max_val and mean are all 1.0 here; presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("1.0")
+    mean = float("1.0")
+    std = float("5.96046e-08")  # tiny non-zero std alongside identical min/max; presumably float32 rounding in the statistic — TODO confirm
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_19:  # Metadata record for the float32 vector "parameter_19"; no statistics or values are recorded.
+    name = "parameter_19"
+    shape = [384]  # NOTE(review): same length as the second dimension of parameter_20 ([128, 384]); presumably its companion bias — confirm in model.py
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_20:  # Metadata record for the float32 matrix "parameter_20"; describes the tensor but stores no values.
+    name = "parameter_20"
+    shape = [128, 384]  # NOTE(review): presumably projects 128-wide embeddings to the 384-wide hidden size — confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0793836")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0805083")
+    mean = float("0.000188609")
+    std = float("0.0199533")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_21:  # Metadata record for the float32 vector "parameter_21"; no statistics or values are recorded.
+    name = "parameter_21"
+    shape = [128]  # NOTE(review): how forward() uses this tensor is not visible from this file — check model.py
+    dtype = "float32"
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_22:  # Metadata record for the float32 vector "parameter_22"; describes the tensor but stores no values.
+    name = "parameter_22"
+    shape = [128]
+    dtype = "float32"
+    min_val = float("1.0")  # min_val, max_val and mean are all 1.0 here; presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("1.0")
+    mean = float("1.0")  # NOTE(review): no std field follows, unlike parameter_8 and parameter_18 which record the same 1.0 min/max/mean — confirm the omission is intentional
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_23:  # Metadata record for the float32 matrix "parameter_23"; describes the tensor but stores no values.
+    name = "parameter_23"
+    shape = [2, 128]  # NOTE(review): presumably a 2-row embedding table (e.g. token types) — confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0491545")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0631497")
+    mean = float("0.000771939")
+    std = float("0.0196944")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_24:  # Metadata record for the float32 matrix "parameter_24"; describes the tensor but stores no values.
+    name = "parameter_24"
+    shape = [512, 128]  # NOTE(review): row count matches parameter_0's 0..511 value range; presumably an embedding table indexed by it — confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0755785")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0787668")
+    mean = float("9.04328e-06")
+    std = float("0.0199793")
+    data = None  # no literal contents are recorded for this tensor
+
+
+class Program_weight_tensor_parameter_25:  # Metadata record for the float32 matrix "parameter_25"; describes the tensor but stores no values.
+    name = "parameter_25"
+    shape = [21128, 128]  # NOTE(review): presumably the vocabulary embedding table (21128 rows) — confirm in model.py
+    dtype = "float32"
+    min_val = float("-0.0980633")  # min_val/max_val/mean/std: presumably summary statistics of the tensor values — TODO confirm
+    max_val = float("0.0982526")
+    mean = float("-7.01566e-06")
+    std = float("0.0200018")
+    data = None  # no literal contents are recorded for this tensor
diff --git a/paddle_samples/PaddleNLP/albert-chinese-tiny/graph_net.json b/paddle_samples/PaddleNLP/albert-chinese-tiny/graph_net.json
new file mode 100644
index 000000000..d83669f6d
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-tiny/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "albert-chinese-tiny",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/albert-chinese-tiny/input_meta.py b/paddle_samples/PaddleNLP/albert-chinese-tiny/input_meta.py
new file mode 100644
index 000000000..3708564f7
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-tiny/input_meta.py
@@ -0,0 +1,19 @@
+class Program_weight_tensor_data_0:  # Sample input record for forward() argument data_0, with literal values.
+    name = "data_0"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [101, 3614, 6816, 886, 4500, 4636, 2428, 7607, 3444, 106, 102]  # flat list of 11 values; model.py's forward() uses data_0 as indices into the parameter_25 embedding table
+
+
+class Program_weight_tensor_data_1:  # Sample input record for forward() argument data_1, with literal values.
+    name = "data_1"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # flat list of 11 values; model.py's forward() unsqueezes, casts to float32 and rescales data_1 into the term added to the attention scores
+
+
+class Program_weight_tensor_data_2:  # Sample input record for forward() argument data_2, with literal values.
+    name = "data_2"
+    shape = [1, 11]
+    dtype = "int64"
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  # flat list of 11 values; model.py's forward() uses data_2 as indices into the parameter_23 embedding table
diff --git a/paddle_samples/PaddleNLP/albert-chinese-tiny/model.py b/paddle_samples/PaddleNLP/albert-chinese-tiny/model.py
new file mode 100644
index 000000000..558af36b2
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-tiny/model.py
@@ -0,0 +1,662 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_0 = [1]
+
+        # pd_op.unsqueeze: (1x1x11xi64) <- (1x11xi64, 1xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(data_1, full_int_array_0)
+        del data_1
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [2]
+
+        # pd_op.unsqueeze: (1x1x1x11xi64) <- (1x1x11xi64, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(unsqueeze_0, full_int_array_1)
+        del full_int_array_1, unsqueeze_0
+
+        # pd_op.cast: (1x1x1x11xf32) <- (1x1x1x11xi64)
+        cast_0 = paddle._C_ops.cast(unsqueeze_1, paddle.float32)
+        del unsqueeze_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x11xf32) <- (1x1x1x11xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0, full_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x11xf32) <- (1x1x1x11xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(scale_0, full_1, float("0"), True)
+        del full_1, scale_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_2 = [0]
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [11]
+
+        # pd_op.slice: (1x11xi64) <- (1x512xi64, 1xi64, 1xi64)
+        slice_0 = paddle._C_ops.slice(
+            parameter_0, [1], full_int_array_2, full_int_array_3, [1], []
+        )
+        del full_int_array_3, parameter_0
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 21128x128xf32)
+        embedding_0 = paddle._C_ops.embedding(data_0, parameter_25, 0, False)
+        del data_0, parameter_25
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 2x128xf32)
+        embedding_1 = paddle._C_ops.embedding(data_2, parameter_23, -1, False)
+        del data_2, parameter_23
+
+        # pd_op.add: (1x11x128xf32) <- (1x11x128xf32, 1x11x128xf32)
+        add_0 = paddle._C_ops.add(embedding_0, embedding_1)
+        del embedding_0, embedding_1
+
+        # pd_op.embedding: (1x11x128xf32) <- (1x11xi64, 512x128xf32)
+        embedding_2 = paddle._C_ops.embedding(slice_0, parameter_24, -1, False)
+        del parameter_24, slice_0
+
+        # pd_op.add: (1x11x128xf32) <- (1x11x128xf32, 1x11x128xf32)
+        add_1 = paddle._C_ops.add(add_0, embedding_2)
+        del add_0, embedding_2
+
+        # pd_op.layer_norm: (1x11x128xf32, 1x11xf32, 1x11xf32) <- (1x11x128xf32, 128xf32, 128xf32)
+        layer_norm_0, layer_norm_1, layer_norm_2 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_1, parameter_22, parameter_21, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_1, parameter_21, parameter_22
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x128xf32, 128x312xf32)
+        matmul_0 = paddle._C_ops.matmul(layer_norm_0, parameter_20, False, False)
+        del layer_norm_0, parameter_20
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_2 = paddle._C_ops.add(matmul_0, parameter_19)
+        del matmul_0, parameter_19
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_1 = paddle._C_ops.matmul(add_2, parameter_16, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_3 = paddle._C_ops.add(matmul_1, parameter_15)
+        del matmul_1
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_2 = paddle._C_ops.matmul(add_2, parameter_14, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_4 = paddle._C_ops.add(matmul_2, parameter_13)
+        del matmul_2
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_3 = paddle._C_ops.matmul(add_2, parameter_12, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_5 = paddle._C_ops.add(matmul_3, parameter_11)
+        del matmul_3
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_4 = [1, 11, 12, 26]
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(add_3, full_int_array_4)
+        del add_3
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_0, [0, 2, 1, 3])
+        del reshape_0
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(add_4, full_int_array_4)
+        del add_4
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(add_5, full_int_array_4)
+        del add_5
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x26xf32, 1x12x11x26xf32)
+        matmul_4 = paddle._C_ops.matmul(transpose_0, transpose_1, False, True)
+        del transpose_0, transpose_1
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("0.196116"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(matmul_4, full_2, float("0"), True)
+        del matmul_4
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_6 = paddle._C_ops.add(scale_2, scale_1)
+        del scale_2
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_0 = paddle._C_ops.softmax(add_6, -1)
+        del add_6
+
+        # pd_op.matmul: (1x12x11x26xf32) <- (1x12x11x11xf32, 1x12x11x26xf32)
+        matmul_5 = paddle._C_ops.matmul(softmax_0, transpose_2, False, False)
+        del softmax_0, transpose_2
+
+        # pd_op.transpose: (1x11x12x26xf32) <- (1x12x11x26xf32)
+        transpose_3 = paddle._C_ops.transpose(matmul_5, [0, 2, 1, 3])
+        del matmul_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_5 = [0, 0, -1]
+
+        # pd_op.reshape: (1x11x312xf32) <- (1x11x12x26xf32, 3xi64)
+        reshape_3 = paddle._C_ops.reshape(transpose_3, full_int_array_5)
+        del transpose_3
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_6 = paddle._C_ops.matmul(reshape_3, parameter_10, False, False)
+        del reshape_3
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_7 = paddle._C_ops.add(matmul_6, parameter_9)
+        del matmul_6
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_8 = paddle._C_ops.add(add_2, add_7)
+        del add_2, add_7
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_3, layer_norm_4, layer_norm_5 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_8, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_8
+
+        # pd_op.matmul: (1x11x1248xf32) <- (1x11x312xf32, 312x1248xf32)
+        matmul_7 = paddle._C_ops.matmul(layer_norm_3, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1248xf32) <- (1x11x1248xf32, 1248xf32)
+        add_9 = paddle._C_ops.add(matmul_7, parameter_5)
+        del matmul_7
+
+        # pd_op.gelu: (1x11x1248xf32) <- (1x11x1248xf32)
+        gelu_0 = paddle._C_ops.gelu(add_9, False)
+        del add_9
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x1248xf32, 1248x312xf32)
+        matmul_8 = paddle._C_ops.matmul(gelu_0, parameter_4, False, False)
+        del gelu_0
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_10 = paddle._C_ops.add(matmul_8, parameter_3)
+        del matmul_8
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_11 = paddle._C_ops.add(add_10, layer_norm_3)
+        del add_10, layer_norm_3
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_6, layer_norm_7, layer_norm_8 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_11, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_11
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_9 = paddle._C_ops.matmul(layer_norm_6, parameter_16, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_12 = paddle._C_ops.add(matmul_9, parameter_15)
+        del matmul_9
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_10 = paddle._C_ops.matmul(layer_norm_6, parameter_14, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_13 = paddle._C_ops.add(matmul_10, parameter_13)
+        del matmul_10
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_11 = paddle._C_ops.matmul(layer_norm_6, parameter_12, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_14 = paddle._C_ops.add(matmul_11, parameter_11)
+        del matmul_11
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(add_12, full_int_array_4)
+        del add_12
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_4 = paddle._C_ops.transpose(reshape_4, [0, 2, 1, 3])
+        del reshape_4
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(add_13, full_int_array_4)
+        del add_13
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(add_14, full_int_array_4)
+        del add_14
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x26xf32, 1x12x11x26xf32)
+        matmul_12 = paddle._C_ops.matmul(transpose_4, transpose_5, False, True)
+        del transpose_4, transpose_5
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_3 = paddle._C_ops.scale(matmul_12, full_2, float("0"), True)
+        del matmul_12
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_15 = paddle._C_ops.add(scale_3, scale_1)
+        del scale_3
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_1 = paddle._C_ops.softmax(add_15, -1)
+        del add_15
+
+        # pd_op.matmul: (1x12x11x26xf32) <- (1x12x11x11xf32, 1x12x11x26xf32)
+        matmul_13 = paddle._C_ops.matmul(softmax_1, transpose_6, False, False)
+        del softmax_1, transpose_6
+
+        # pd_op.transpose: (1x11x12x26xf32) <- (1x12x11x26xf32)
+        transpose_7 = paddle._C_ops.transpose(matmul_13, [0, 2, 1, 3])
+        del matmul_13
+
+        # pd_op.reshape: (1x11x312xf32) <- (1x11x12x26xf32, 3xi64)
+        reshape_7 = paddle._C_ops.reshape(transpose_7, full_int_array_5)
+        del transpose_7
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_14 = paddle._C_ops.matmul(reshape_7, parameter_10, False, False)
+        del reshape_7
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_16 = paddle._C_ops.add(matmul_14, parameter_9)
+        del matmul_14
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_17 = paddle._C_ops.add(layer_norm_6, add_16)
+        del add_16, layer_norm_6
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_9, layer_norm_10, layer_norm_11 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_17, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_17
+
+        # pd_op.matmul: (1x11x1248xf32) <- (1x11x312xf32, 312x1248xf32)
+        matmul_15 = paddle._C_ops.matmul(layer_norm_9, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1248xf32) <- (1x11x1248xf32, 1248xf32)
+        add_18 = paddle._C_ops.add(matmul_15, parameter_5)
+        del matmul_15
+
+        # pd_op.gelu: (1x11x1248xf32) <- (1x11x1248xf32)
+        gelu_1 = paddle._C_ops.gelu(add_18, False)
+        del add_18
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x1248xf32, 1248x312xf32)
+        matmul_16 = paddle._C_ops.matmul(gelu_1, parameter_4, False, False)
+        del gelu_1
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_19 = paddle._C_ops.add(matmul_16, parameter_3)
+        del matmul_16
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_20 = paddle._C_ops.add(add_19, layer_norm_9)
+        del add_19, layer_norm_9
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_12, layer_norm_13, layer_norm_14 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_20, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_20
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_17 = paddle._C_ops.matmul(layer_norm_12, parameter_16, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_21 = paddle._C_ops.add(matmul_17, parameter_15)
+        del matmul_17
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_18 = paddle._C_ops.matmul(layer_norm_12, parameter_14, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_22 = paddle._C_ops.add(matmul_18, parameter_13)
+        del matmul_18
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_19 = paddle._C_ops.matmul(layer_norm_12, parameter_12, False, False)
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_23 = paddle._C_ops.add(matmul_19, parameter_11)
+        del matmul_19
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(add_21, full_int_array_4)
+        del add_21
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_8 = paddle._C_ops.transpose(reshape_8, [0, 2, 1, 3])
+        del reshape_8
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(add_22, full_int_array_4)
+        del add_22
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(add_23, full_int_array_4)
+        del add_23
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x26xf32, 1x12x11x26xf32)
+        matmul_20 = paddle._C_ops.matmul(transpose_8, transpose_9, False, True)
+        del transpose_8, transpose_9
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_4 = paddle._C_ops.scale(matmul_20, full_2, float("0"), True)
+        del matmul_20
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_24 = paddle._C_ops.add(scale_4, scale_1)
+        del scale_4
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_2 = paddle._C_ops.softmax(add_24, -1)
+        del add_24
+
+        # pd_op.matmul: (1x12x11x26xf32) <- (1x12x11x11xf32, 1x12x11x26xf32)
+        matmul_21 = paddle._C_ops.matmul(softmax_2, transpose_10, False, False)
+        del softmax_2, transpose_10
+
+        # pd_op.transpose: (1x11x12x26xf32) <- (1x12x11x26xf32)
+        transpose_11 = paddle._C_ops.transpose(matmul_21, [0, 2, 1, 3])
+        del matmul_21
+
+        # pd_op.reshape: (1x11x312xf32) <- (1x11x12x26xf32, 3xi64)
+        reshape_11 = paddle._C_ops.reshape(transpose_11, full_int_array_5)
+        del transpose_11
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_22 = paddle._C_ops.matmul(reshape_11, parameter_10, False, False)
+        del reshape_11
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_25 = paddle._C_ops.add(matmul_22, parameter_9)
+        del matmul_22
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_26 = paddle._C_ops.add(layer_norm_12, add_25)
+        del add_25, layer_norm_12
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_15, layer_norm_16, layer_norm_17 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_26, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_26
+
+        # pd_op.matmul: (1x11x1248xf32) <- (1x11x312xf32, 312x1248xf32)
+        matmul_23 = paddle._C_ops.matmul(layer_norm_15, parameter_6, False, False)
+
+        # pd_op.add: (1x11x1248xf32) <- (1x11x1248xf32, 1248xf32)
+        add_27 = paddle._C_ops.add(matmul_23, parameter_5)
+        del matmul_23
+
+        # pd_op.gelu: (1x11x1248xf32) <- (1x11x1248xf32)
+        gelu_2 = paddle._C_ops.gelu(add_27, False)
+        del add_27
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x1248xf32, 1248x312xf32)
+        matmul_24 = paddle._C_ops.matmul(gelu_2, parameter_4, False, False)
+        del gelu_2
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_28 = paddle._C_ops.add(matmul_24, parameter_3)
+        del matmul_24
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_29 = paddle._C_ops.add(add_28, layer_norm_15)
+        del add_28, layer_norm_15
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_18, layer_norm_19, layer_norm_20 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_29, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_29
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_25 = paddle._C_ops.matmul(layer_norm_18, parameter_16, False, False)
+        del parameter_16
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_30 = paddle._C_ops.add(matmul_25, parameter_15)
+        del matmul_25, parameter_15
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_26 = paddle._C_ops.matmul(layer_norm_18, parameter_14, False, False)
+        del parameter_14
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_31 = paddle._C_ops.add(matmul_26, parameter_13)
+        del matmul_26, parameter_13
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_27 = paddle._C_ops.matmul(layer_norm_18, parameter_12, False, False)
+        del parameter_12
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_32 = paddle._C_ops.add(matmul_27, parameter_11)
+        del matmul_27, parameter_11
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(add_30, full_int_array_4)
+        del add_30
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_12 = paddle._C_ops.transpose(reshape_12, [0, 2, 1, 3])
+        del reshape_12
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(add_31, full_int_array_4)
+        del add_31
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.reshape: (1x11x12x26xf32) <- (1x11x312xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(add_32, full_int_array_4)
+        del add_32, full_int_array_4
+
+        # pd_op.transpose: (1x12x11x26xf32) <- (1x11x12x26xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.matmul: (1x12x11x11xf32) <- (1x12x11x26xf32, 1x12x11x26xf32)
+        matmul_28 = paddle._C_ops.matmul(transpose_12, transpose_13, False, True)
+        del transpose_12, transpose_13
+
+        # pd_op.scale: (1x12x11x11xf32) <- (1x12x11x11xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(matmul_28, full_2, float("0"), True)
+        del full_2, matmul_28
+
+        # pd_op.add: (1x12x11x11xf32) <- (1x12x11x11xf32, 1x1x1x11xf32)
+        add_33 = paddle._C_ops.add(scale_5, scale_1)
+        del scale_1, scale_5
+
+        # pd_op.softmax: (1x12x11x11xf32) <- (1x12x11x11xf32)
+        softmax_3 = paddle._C_ops.softmax(add_33, -1)
+        del add_33
+
+        # pd_op.matmul: (1x12x11x26xf32) <- (1x12x11x11xf32, 1x12x11x26xf32)
+        matmul_29 = paddle._C_ops.matmul(softmax_3, transpose_14, False, False)
+        del softmax_3, transpose_14
+
+        # pd_op.transpose: (1x11x12x26xf32) <- (1x12x11x26xf32)
+        transpose_15 = paddle._C_ops.transpose(matmul_29, [0, 2, 1, 3])
+        del matmul_29
+
+        # pd_op.reshape: (1x11x312xf32) <- (1x11x12x26xf32, 3xi64)
+        reshape_15 = paddle._C_ops.reshape(transpose_15, full_int_array_5)
+        del full_int_array_5, transpose_15
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x312xf32, 312x312xf32)
+        matmul_30 = paddle._C_ops.matmul(reshape_15, parameter_10, False, False)
+        del parameter_10, reshape_15
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_34 = paddle._C_ops.add(matmul_30, parameter_9)
+        del matmul_30, parameter_9
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_35 = paddle._C_ops.add(layer_norm_18, add_34)
+        del add_34, layer_norm_18
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_21, layer_norm_22, layer_norm_23 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_35, parameter_8, parameter_7, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_35, parameter_7, parameter_8
+
+        # pd_op.matmul: (1x11x1248xf32) <- (1x11x312xf32, 312x1248xf32)
+        matmul_31 = paddle._C_ops.matmul(layer_norm_21, parameter_6, False, False)
+        del parameter_6
+
+        # pd_op.add: (1x11x1248xf32) <- (1x11x1248xf32, 1248xf32)
+        add_36 = paddle._C_ops.add(matmul_31, parameter_5)
+        del matmul_31, parameter_5
+
+        # pd_op.gelu: (1x11x1248xf32) <- (1x11x1248xf32)
+        gelu_3 = paddle._C_ops.gelu(add_36, False)
+        del add_36
+
+        # pd_op.matmul: (1x11x312xf32) <- (1x11x1248xf32, 1248x312xf32)
+        matmul_32 = paddle._C_ops.matmul(gelu_3, parameter_4, False, False)
+        del gelu_3, parameter_4
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 312xf32)
+        add_37 = paddle._C_ops.add(matmul_32, parameter_3)
+        del matmul_32, parameter_3
+
+        # pd_op.add: (1x11x312xf32) <- (1x11x312xf32, 1x11x312xf32)
+        add_38 = paddle._C_ops.add(add_37, layer_norm_21)
+        del add_37, layer_norm_21
+
+        # pd_op.layer_norm: (1x11x312xf32, 1x11xf32, 1x11xf32) <- (1x11x312xf32, 312xf32, 312xf32)
+        layer_norm_24, layer_norm_25, layer_norm_26 = (lambda x, f: f(x))(
+            paddle._C_ops.layer_norm(
+                add_38, parameter_18, parameter_17, float("1e-12"), 2
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None, None),
+        )
+        del add_38, parameter_17, parameter_18
+
+        # pd_op.slice: (1x312xf32) <- (1x11x312xf32, 1xi64, 1xi64)
+        slice_1 = paddle._C_ops.slice(
+            layer_norm_24, [1], full_int_array_2, full_int_array_0, [1], [1]
+        )
+        del full_int_array_0, full_int_array_2
+
+        # pd_op.matmul: (1x312xf32) <- (1x312xf32, 312x312xf32)
+        matmul_33 = paddle._C_ops.matmul(slice_1, parameter_2, False, False)
+        del parameter_2, slice_1
+
+        # pd_op.add: (1x312xf32) <- (1x312xf32, 312xf32)
+        add_39 = paddle._C_ops.add(matmul_33, parameter_1)
+        del matmul_33, parameter_1
+
+        # pd_op.tanh: (1x312xf32) <- (1x312xf32)
+        tanh_0 = paddle._C_ops.tanh(add_39)
+        del add_39, layer_norm_24
+
+        return tanh_0
diff --git a/paddle_samples/PaddleNLP/albert-chinese-tiny/weight_meta.py b/paddle_samples/PaddleNLP/albert-chinese-tiny/weight_meta.py
new file mode 100644
index 000000000..3bac8dae2
--- /dev/null
+++ b/paddle_samples/PaddleNLP/albert-chinese-tiny/weight_meta.py
@@ -0,0 +1,235 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [1, 512]
+    dtype = "int64"
+    min_val = 0
+    max_val = 511
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [312, 312]
+    dtype = "float32"
+    min_val = float("-0.0943927")
+    max_val = float("0.0805598")
+    mean = float("-5.04225e-05")
+    std = float("0.0199894")
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [1248, 312]
+    dtype = "float32"
+    min_val = float("-0.0882163")
+    max_val = float("0.0910547")
+    mean = float("-7.22546e-06")
+    std = float("0.0199967")
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [1248]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [312, 1248]
+    dtype = "float32"
+    min_val = float("-0.089193")
+    max_val = float("0.10013")
+    mean = float("-5.28496e-05")
+    std = float("0.0199953")
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [312]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [312, 312]
+    dtype = "float32"
+    min_val = float("-0.0864897")
+    max_val = float("0.0923653")
+    mean = float("6.8981e-05")
+    std = float("0.0200065")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [312, 312]
+    dtype = "float32"
+    min_val = float("-0.0912581")
+    max_val = float("0.0870574")
+    mean = float("-4.12729e-05")
+    std = float("0.0200247")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [312, 312]
+    dtype = "float32"
+    min_val = float("-0.0832851")
+    max_val = float("0.0934653")
+    mean = float("-2.10013e-05")
+    std = float("0.0200296")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [312, 312]
+    dtype = "float32"
+    min_val = float("-0.0901047")
+    max_val = float("0.0851487")
+    mean = float("-2.36235e-06")
+    std = float("0.020018")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [312]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [312]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [128, 312]
+    dtype = "float32"
+    min_val = float("-0.0820946")
+    max_val = float("0.0974006")
+    mean = float("-5.19528e-05")
+    std = float("0.0200838")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [128]
+    dtype = "float32"
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [128]
+    dtype = "float32"
+    min_val = float("1.0")
+    max_val = float("1.0")
+    mean = float("1.0")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [2, 128]
+    dtype = "float32"
+    min_val = float("-0.0599234")
+    max_val = float("0.066722")
+    mean = float("-0.00042005")
+    std = float("0.0224804")
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [512, 128]
+    dtype = "float32"
+    min_val = float("-0.0834669")
+    max_val = float("0.0840402")
+    mean = float("0.000142117")
+    std = float("0.0199409")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [21128, 128]
+    dtype = "float32"
+    min_val = float("-0.0938932")
+    max_val = float("0.105185")
+    mean = float("1.19757e-05")
+    std = float("0.0199975")
+    data = None
diff --git a/paddle_samples/PaddleNLP/t5-small/graph_net.json b/paddle_samples/PaddleNLP/t5-small/graph_net.json
new file mode 100644
index 000000000..6b649b3dd
--- /dev/null
+++ b/paddle_samples/PaddleNLP/t5-small/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "paddle",
+    "model_name": "t5-small",
+    "num_devices_required": 1,
+    "num_nodes_required": 1
+}
\ No newline at end of file
diff --git a/paddle_samples/PaddleNLP/t5-small/input_meta.py b/paddle_samples/PaddleNLP/t5-small/input_meta.py
new file mode 100644
index 000000000..846bab065
--- /dev/null
+++ b/paddle_samples/PaddleNLP/t5-small/input_meta.py
@@ -0,0 +1,40 @@
+class Program_weight_tensor_data_0:
+    name = "data_0"
+    shape = [1, 20]
+    dtype = "int64"
+    data = [
+        8774,
+        6,
+        82,
+        564,
+        19,
+        5762,
+        5,
+        27,
+        183,
+        1036,
+        81,
+        508,
+        1612,
+        2250,
+        11,
+        70,
+        4648,
+        7,
+        5,
+        1,
+    ]
+
+
+class Program_weight_tensor_data_1:
+    name = "data_1"
+    shape = [1, 20]
+    dtype = "int64"
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_data_2:
+    name = "data_2"
+    shape = [1, 1]
+    dtype = "int64"
+    data = [0]
diff --git a/paddle_samples/PaddleNLP/t5-small/model.py b/paddle_samples/PaddleNLP/t5-small/model.py
new file mode 100644
index 000000000..29476168f
--- /dev/null
+++ b/paddle_samples/PaddleNLP/t5-small/model.py
@@ -0,0 +1,3317 @@
+import paddle
+
+
+class GraphModule(paddle.nn.Layer):
+    def __init__(self):
+        super().__init__()
+
+    def forward(
+        self,
+        parameter_0,
+        parameter_1,
+        parameter_2,
+        parameter_3,
+        parameter_4,
+        parameter_5,
+        parameter_6,
+        parameter_7,
+        parameter_8,
+        parameter_9,
+        parameter_10,
+        parameter_11,
+        parameter_12,
+        parameter_13,
+        parameter_14,
+        parameter_15,
+        parameter_16,
+        parameter_17,
+        parameter_18,
+        parameter_19,
+        parameter_20,
+        parameter_21,
+        parameter_22,
+        parameter_23,
+        parameter_24,
+        parameter_25,
+        parameter_26,
+        parameter_27,
+        parameter_28,
+        parameter_29,
+        parameter_30,
+        parameter_31,
+        parameter_32,
+        parameter_33,
+        parameter_34,
+        parameter_35,
+        parameter_36,
+        parameter_37,
+        parameter_38,
+        parameter_39,
+        parameter_40,
+        parameter_41,
+        parameter_42,
+        parameter_43,
+        parameter_44,
+        parameter_45,
+        parameter_46,
+        parameter_47,
+        parameter_48,
+        parameter_49,
+        parameter_50,
+        parameter_51,
+        parameter_52,
+        parameter_53,
+        parameter_54,
+        parameter_55,
+        parameter_56,
+        parameter_57,
+        parameter_58,
+        parameter_59,
+        parameter_60,
+        parameter_61,
+        parameter_62,
+        parameter_63,
+        parameter_64,
+        parameter_65,
+        parameter_66,
+        parameter_67,
+        parameter_68,
+        parameter_69,
+        parameter_70,
+        parameter_71,
+        parameter_72,
+        parameter_73,
+        parameter_74,
+        parameter_75,
+        parameter_76,
+        parameter_77,
+        parameter_78,
+        parameter_79,
+        parameter_80,
+        parameter_81,
+        parameter_82,
+        parameter_83,
+        parameter_84,
+        parameter_85,
+        parameter_86,
+        parameter_87,
+        parameter_88,
+        parameter_89,
+        parameter_90,
+        parameter_91,
+        parameter_92,
+        parameter_93,
+        parameter_94,
+        parameter_95,
+        parameter_96,
+        parameter_97,
+        parameter_98,
+        parameter_99,
+        parameter_100,
+        parameter_101,
+        parameter_102,
+        parameter_103,
+        parameter_104,
+        parameter_105,
+        parameter_106,
+        parameter_107,
+        parameter_108,
+        parameter_109,
+        parameter_110,
+        parameter_111,
+        parameter_112,
+        parameter_113,
+        parameter_114,
+        parameter_115,
+        parameter_116,
+        parameter_117,
+        parameter_118,
+        parameter_119,
+        parameter_120,
+        parameter_121,
+        parameter_122,
+        parameter_123,
+        parameter_124,
+        parameter_125,
+        parameter_126,
+        parameter_127,
+        parameter_128,
+        parameter_129,
+        parameter_130,
+        data_0,
+        data_1,
+        data_2,
+    ):
+        # pd_op.embedding: (1x20x512xf32) <- (1x20xi64, 32128x512xf32)
+        embedding_0 = paddle._C_ops.embedding(data_0, parameter_130, -1, False)
+        del data_0
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_0 = [1, 2]
+
+        # pd_op.unsqueeze: (1x1x1x20xi64) <- (1x20xi64, 2xi64)
+        unsqueeze_0 = paddle._C_ops.unsqueeze(data_1, full_int_array_0)
+        del data_1
+
+        # pd_op.cast: (1x1x1x20xf32) <- (1x1x1x20xi64)
+        cast_0 = paddle._C_ops.cast(unsqueeze_0, paddle.float32)
+        del unsqueeze_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_0 = paddle._C_ops.full(
+            [1], float("-1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x20xf32) <- (1x1x1x20xf32, 1xf32)
+        scale_0 = paddle._C_ops.scale(cast_0, full_0, float("1"), True)
+        del cast_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_1 = paddle._C_ops.full(
+            [1], float("-10000"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x1x20xf32) <- (1x1x1x20xf32, 1xf32)
+        scale_1 = paddle._C_ops.scale(scale_0, full_1, float("0"), True)
+        del scale_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_2 = paddle._C_ops.full(
+            [1], float("0.1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_0, dropout_1 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_0, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_0
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_0 = paddle._C_ops.pow(dropout_0, float("2"))
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_1 = [-1]
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_0 = paddle._C_ops.mean(pow_0, full_int_array_1, True)
+        del pow_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_3 = paddle._C_ops.full(
+            [1], float("1"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_2 = paddle._C_ops.scale(mean_0, full_3, float("1e-06"), True)
+        del mean_0
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_0 = paddle._C_ops.rsqrt(scale_2)
+        del scale_2
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_0 = paddle._C_ops.multiply(dropout_0, rsqrt_0)
+        del rsqrt_0
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_1 = paddle._C_ops.multiply(parameter_124, multiply_0)
+        del multiply_0, parameter_124
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_1 = paddle._C_ops.matmul(multiply_1, parameter_129, False, False)
+        del parameter_129
+
+        # pd_op.full_int_array: (4xi64) <- ()
+        full_int_array_2 = [1, -1, 8, 64]
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_0 = paddle._C_ops.reshape(matmul_1, full_int_array_2)
+        del matmul_1
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_0 = paddle._C_ops.transpose(reshape_0, [0, 2, 1, 3])
+        del reshape_0
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_2 = paddle._C_ops.matmul(multiply_1, parameter_128, False, False)
+        del parameter_128
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_1 = paddle._C_ops.reshape(matmul_2, full_int_array_2)
+        del matmul_2
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_1 = paddle._C_ops.transpose(reshape_1, [0, 2, 1, 3])
+        del reshape_1
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_3 = paddle._C_ops.matmul(multiply_1, parameter_127, False, False)
+        del multiply_1, parameter_127
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_2 = paddle._C_ops.reshape(matmul_3, full_int_array_2)
+        del matmul_3
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_2 = paddle._C_ops.transpose(reshape_2, [0, 2, 1, 3])
+        del reshape_2
+
+        # pd_op.matmul: (1x8x20x20xf32) <- (1x8x20x64xf32, 1x8x20x64xf32)
+        matmul_4 = paddle._C_ops.matmul(transpose_0, transpose_1, False, True)
+        del transpose_0, transpose_1
+
+        # pd_op.full: (1xf64) <- ()
+        full_4 = paddle._C_ops.full(
+            [1], float("0"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_5 = paddle._C_ops.full(
+            [1], float("20"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full: (1xf64) <- ()
+        full_6 = paddle._C_ops.full(
+            [1], float("1"), paddle.float64, paddle.core.CPUPlace()
+        )
+
+        # pd_op.arange: (20xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_0 = paddle.arange(full_4, full_5, full_6, dtype="int64")
+        del full_5
+
+        # pd_op.unsqueeze: (20x1xi64) <- (20xi64, 1xi64)
+        unsqueeze_1 = paddle._C_ops.unsqueeze(arange_0, full_int_array_1)
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_3 = [0]
+
+        # pd_op.unsqueeze: (1x20xi64) <- (20xi64, 1xi64)
+        unsqueeze_2 = paddle._C_ops.unsqueeze(arange_0, full_int_array_3)
+        del arange_0
+
+        # pd_op.subtract: (20x20xi64) <- (1x20xi64, 20x1xi64)
+        subtract_0 = paddle._C_ops.subtract(unsqueeze_2, unsqueeze_1)
+        del unsqueeze_1, unsqueeze_2
+
+        # pd_op.full: (xi64) <- ()
+        full_7 = paddle._C_ops.full(
+            [], float("0"), paddle.int64, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.greater_than: (20x20xb) <- (20x20xi64, xi64)
+        greater_than_0 = paddle._C_ops.greater_than(subtract_0, full_7)
+        del full_7
+
+        # pd_op.cast: (20x20xi64) <- (20x20xb)
+        cast_1 = paddle._C_ops.cast(greater_than_0, paddle.int64)
+        del greater_than_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_8 = paddle._C_ops.full(
+            [1], float("16"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (20x20xi64) <- (20x20xi64, 1xf32)
+        scale_3 = paddle._C_ops.scale(cast_1, full_8, float("0"), True)
+        del cast_1
+
+        # pd_op.scale: (20x20xi64) <- (20x20xi64, 1xf32)
+        scale_4 = paddle._C_ops.scale(scale_3, full_3, float("0"), True)
+        del scale_3
+
+        # pd_op.abs: (20x20xi64) <- (20x20xi64)
+        abs_0 = paddle._C_ops.abs(subtract_0)
+        del subtract_0
+
+        # pd_op.full: (xi64) <- ()
+        full_9 = paddle._C_ops.full(
+            [], float("8"), paddle.int64, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.less_than: (20x20xb) <- (20x20xi64, xi64)
+        less_than_0 = paddle._C_ops.less_than(abs_0, full_9)
+        del full_9
+
+        # pd_op.cast: (20x20xf32) <- (20x20xi64)
+        cast_2 = paddle._C_ops.cast(abs_0, paddle.float32)
+
+        # pd_op.full: (1xf32) <- ()
+        full_10 = paddle._C_ops.full(
+            [1], float("0.125"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (20x20xf32) <- (20x20xf32, 1xf32)
+        scale_5 = paddle._C_ops.scale(cast_2, full_10, float("0"), True)
+        del cast_2, full_10
+
+        # pd_op.log: (20x20xf32) <- (20x20xf32)
+        log_0 = paddle._C_ops.log(scale_5)
+        del scale_5
+
+        # pd_op.full: (1xf32) <- ()
+        full_11 = paddle._C_ops.full(
+            [1], float("0.360674"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (20x20xf32) <- (20x20xf32, 1xf32)
+        scale_6 = paddle._C_ops.scale(log_0, full_11, float("0"), True)
+        del full_11, log_0
+
+        # pd_op.full: (1xf32) <- ()
+        full_12 = paddle._C_ops.full(
+            [1], float("8"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (20x20xf32) <- (20x20xf32, 1xf32)
+        scale_7 = paddle._C_ops.scale(scale_6, full_12, float("0"), True)
+        del full_12, scale_6
+
+        # pd_op.cast: (20x20xi64) <- (20x20xf32)
+        cast_3 = paddle._C_ops.cast(scale_7, paddle.int64)
+        del scale_7
+
+        # pd_op.scale: (20x20xi64) <- (20x20xi64, 1xf32)
+        scale_8 = paddle._C_ops.scale(cast_3, full_3, float("8"), True)
+        del cast_3
+
+        # pd_op.full: (1xf32) <- ()
+        full_13 = paddle._C_ops.full(
+            [1], float("15"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full_like: (20x20xi64) <- (20x20xi64, 1xf32)
+        full_like_0 = paddle._C_ops.full_like(
+            scale_8, full_13, paddle.int64, paddle.framework._current_expected_place()
+        )
+        del full_13
+
+        # pd_op.minimum: (20x20xi64) <- (20x20xi64, 20x20xi64)
+        minimum_0 = paddle._C_ops.minimum(scale_8, full_like_0)
+        del full_like_0, scale_8
+
+        # pd_op.where: (20x20xi64) <- (20x20xb, 20x20xi64, 20x20xi64)
+        where_0 = paddle._C_ops.where(less_than_0, abs_0, minimum_0)
+        del abs_0, less_than_0, minimum_0
+
+        # pd_op.add: (20x20xi64) <- (20x20xi64, 20x20xi64)
+        add_0 = paddle._C_ops.add(scale_4, where_0)
+        del scale_4, where_0
+
+        # pd_op.embedding: (20x20x8xf32) <- (20x20xi64, 32x8xf32)
+        embedding_1 = paddle._C_ops.embedding(add_0, parameter_125, -1, False)
+        del add_0, parameter_125
+
+        # pd_op.transpose: (8x20x20xf32) <- (20x20x8xf32)
+        transpose_3 = paddle._C_ops.transpose(embedding_1, [2, 0, 1])
+        del embedding_1
+
+        # pd_op.unsqueeze: (1x8x20x20xf32) <- (8x20x20xf32, 1xi64)
+        unsqueeze_3 = paddle._C_ops.unsqueeze(transpose_3, full_int_array_3)
+        del transpose_3
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x1x1x20xf32)
+        add_1 = paddle._C_ops.add(unsqueeze_3, scale_1)
+        del unsqueeze_3
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x8x20x20xf32)
+        add_2 = paddle._C_ops.add(matmul_4, add_1)
+        del matmul_4
+
+        # pd_op.softmax: (1x8x20x20xf32) <- (1x8x20x20xf32)
+        softmax_0 = paddle._C_ops.softmax(add_2, -1)
+        del add_2
+
+        # pd_op.dropout: (1x8x20x20xf32, 1x8x20x20xui8) <- (1x8x20x20xf32, None, 1xf32)
+        dropout_2, dropout_3 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_0, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_0
+
+        # pd_op.matmul: (1x8x20x64xf32) <- (1x8x20x20xf32, 1x8x20x64xf32)
+        matmul_5 = paddle._C_ops.matmul(dropout_2, transpose_2, False, False)
+        del dropout_2, transpose_2
+
+        # pd_op.transpose: (1x20x8x64xf32) <- (1x8x20x64xf32)
+        transpose_4 = paddle._C_ops.transpose(matmul_5, [0, 2, 1, 3])
+        del matmul_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_4 = [1, -1, 512]
+
+        # pd_op.reshape: (1x20x512xf32) <- (1x20x8x64xf32, 3xi64)
+        reshape_3 = paddle._C_ops.reshape(transpose_4, full_int_array_4)
+        del transpose_4
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_6 = paddle._C_ops.matmul(reshape_3, parameter_126, False, False)
+        del parameter_126, reshape_3
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_4, dropout_5 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_6, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_6
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_3 = paddle._C_ops.add(dropout_0, dropout_4)
+        del dropout_0, dropout_4
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_1 = paddle._C_ops.pow(add_3, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_1 = paddle._C_ops.mean(pow_1, full_int_array_1, True)
+        del pow_1
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_9 = paddle._C_ops.scale(mean_1, full_3, float("1e-06"), True)
+        del mean_1
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_1 = paddle._C_ops.rsqrt(scale_9)
+        del scale_9
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_2 = paddle._C_ops.multiply(add_3, rsqrt_1)
+        del rsqrt_1
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_3 = paddle._C_ops.multiply(parameter_121, multiply_2)
+        del multiply_2, parameter_121
+
+        # pd_op.matmul: (1x20x2048xf32) <- (1x20x512xf32, 512x2048xf32)
+        matmul_7 = paddle._C_ops.matmul(multiply_3, parameter_123, False, False)
+        del multiply_3, parameter_123
+
+        # pd_op.relu: (1x20x2048xf32) <- (1x20x2048xf32)
+        relu_0 = paddle._C_ops.relu(matmul_7)
+        del matmul_7
+
+        # pd_op.dropout: (1x20x2048xf32, 1x20x2048xui8) <- (1x20x2048xf32, None, 1xf32)
+        dropout_6, dropout_7 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_0, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_0
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x2048xf32, 2048x512xf32)
+        matmul_8 = paddle._C_ops.matmul(dropout_6, parameter_122, False, False)
+        del dropout_6, parameter_122
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_8, dropout_9 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_8, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_8
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_4 = paddle._C_ops.add(dropout_8, add_3)
+        del add_3, dropout_8
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_2 = paddle._C_ops.pow(add_4, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_2 = paddle._C_ops.mean(pow_2, full_int_array_1, True)
+        del pow_2
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_10 = paddle._C_ops.scale(mean_2, full_3, float("1e-06"), True)
+        del mean_2
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_2 = paddle._C_ops.rsqrt(scale_10)
+        del scale_10
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_4 = paddle._C_ops.multiply(add_4, rsqrt_2)
+        del rsqrt_2
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_5 = paddle._C_ops.multiply(parameter_116, multiply_4)
+        del multiply_4, parameter_116
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_9 = paddle._C_ops.matmul(multiply_5, parameter_120, False, False)
+        del parameter_120
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_4 = paddle._C_ops.reshape(matmul_9, full_int_array_2)
+        del matmul_9
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_5 = paddle._C_ops.transpose(reshape_4, [0, 2, 1, 3])
+        del reshape_4
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_10 = paddle._C_ops.matmul(multiply_5, parameter_119, False, False)
+        del parameter_119
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_5 = paddle._C_ops.reshape(matmul_10, full_int_array_2)
+        del matmul_10
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_6 = paddle._C_ops.transpose(reshape_5, [0, 2, 1, 3])
+        del reshape_5
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_11 = paddle._C_ops.matmul(multiply_5, parameter_118, False, False)
+        del multiply_5, parameter_118
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_6 = paddle._C_ops.reshape(matmul_11, full_int_array_2)
+        del matmul_11
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_7 = paddle._C_ops.transpose(reshape_6, [0, 2, 1, 3])
+        del reshape_6
+
+        # pd_op.matmul: (1x8x20x20xf32) <- (1x8x20x64xf32, 1x8x20x64xf32)
+        matmul_12 = paddle._C_ops.matmul(transpose_5, transpose_6, False, True)
+        del transpose_5, transpose_6
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x8x20x20xf32)
+        add_5 = paddle._C_ops.add(matmul_12, add_1)
+        del matmul_12
+
+        # pd_op.softmax: (1x8x20x20xf32) <- (1x8x20x20xf32)
+        softmax_1 = paddle._C_ops.softmax(add_5, -1)
+        del add_5
+
+        # pd_op.dropout: (1x8x20x20xf32, 1x8x20x20xui8) <- (1x8x20x20xf32, None, 1xf32)
+        dropout_10, dropout_11 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_1, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_1
+
+        # pd_op.matmul: (1x8x20x64xf32) <- (1x8x20x20xf32, 1x8x20x64xf32)
+        matmul_13 = paddle._C_ops.matmul(dropout_10, transpose_7, False, False)
+        del dropout_10, transpose_7
+
+        # pd_op.transpose: (1x20x8x64xf32) <- (1x8x20x64xf32)
+        transpose_8 = paddle._C_ops.transpose(matmul_13, [0, 2, 1, 3])
+        del matmul_13
+
+        # pd_op.reshape: (1x20x512xf32) <- (1x20x8x64xf32, 3xi64)
+        reshape_7 = paddle._C_ops.reshape(transpose_8, full_int_array_4)
+        del transpose_8
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_14 = paddle._C_ops.matmul(reshape_7, parameter_117, False, False)
+        del parameter_117, reshape_7
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_12, dropout_13 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_14, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_14
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_6 = paddle._C_ops.add(add_4, dropout_12)
+        del add_4, dropout_12
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_3 = paddle._C_ops.pow(add_6, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_3 = paddle._C_ops.mean(pow_3, full_int_array_1, True)
+        del pow_3
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_11 = paddle._C_ops.scale(mean_3, full_3, float("1e-06"), True)
+        del mean_3
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_3 = paddle._C_ops.rsqrt(scale_11)
+        del scale_11
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_6 = paddle._C_ops.multiply(add_6, rsqrt_3)
+        del rsqrt_3
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_7 = paddle._C_ops.multiply(parameter_113, multiply_6)
+        del multiply_6, parameter_113
+
+        # pd_op.matmul: (1x20x2048xf32) <- (1x20x512xf32, 512x2048xf32)
+        matmul_15 = paddle._C_ops.matmul(multiply_7, parameter_115, False, False)
+        del multiply_7, parameter_115
+
+        # pd_op.relu: (1x20x2048xf32) <- (1x20x2048xf32)
+        relu_1 = paddle._C_ops.relu(matmul_15)
+        del matmul_15
+
+        # pd_op.dropout: (1x20x2048xf32, 1x20x2048xui8) <- (1x20x2048xf32, None, 1xf32)
+        dropout_14, dropout_15 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_1, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_1
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x2048xf32, 2048x512xf32)
+        matmul_16 = paddle._C_ops.matmul(dropout_14, parameter_114, False, False)
+        del dropout_14, parameter_114
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_16, dropout_17 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_16, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_16
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_7 = paddle._C_ops.add(dropout_16, add_6)
+        del add_6, dropout_16
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_4 = paddle._C_ops.pow(add_7, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_4 = paddle._C_ops.mean(pow_4, full_int_array_1, True)
+        del pow_4
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_12 = paddle._C_ops.scale(mean_4, full_3, float("1e-06"), True)
+        del mean_4
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_4 = paddle._C_ops.rsqrt(scale_12)
+        del scale_12
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_8 = paddle._C_ops.multiply(add_7, rsqrt_4)
+        del rsqrt_4
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_9 = paddle._C_ops.multiply(parameter_108, multiply_8)
+        del multiply_8, parameter_108
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_17 = paddle._C_ops.matmul(multiply_9, parameter_112, False, False)
+        del parameter_112
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_8 = paddle._C_ops.reshape(matmul_17, full_int_array_2)
+        del matmul_17
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_9 = paddle._C_ops.transpose(reshape_8, [0, 2, 1, 3])
+        del reshape_8
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_18 = paddle._C_ops.matmul(multiply_9, parameter_111, False, False)
+        del parameter_111
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_9 = paddle._C_ops.reshape(matmul_18, full_int_array_2)
+        del matmul_18
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_10 = paddle._C_ops.transpose(reshape_9, [0, 2, 1, 3])
+        del reshape_9
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_19 = paddle._C_ops.matmul(multiply_9, parameter_110, False, False)
+        del multiply_9, parameter_110
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_10 = paddle._C_ops.reshape(matmul_19, full_int_array_2)
+        del matmul_19
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_11 = paddle._C_ops.transpose(reshape_10, [0, 2, 1, 3])
+        del reshape_10
+
+        # pd_op.matmul: (1x8x20x20xf32) <- (1x8x20x64xf32, 1x8x20x64xf32)
+        matmul_20 = paddle._C_ops.matmul(transpose_9, transpose_10, False, True)
+        del transpose_10, transpose_9
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x8x20x20xf32)
+        add_8 = paddle._C_ops.add(matmul_20, add_1)
+        del matmul_20
+
+        # pd_op.softmax: (1x8x20x20xf32) <- (1x8x20x20xf32)
+        softmax_2 = paddle._C_ops.softmax(add_8, -1)
+        del add_8
+
+        # pd_op.dropout: (1x8x20x20xf32, 1x8x20x20xui8) <- (1x8x20x20xf32, None, 1xf32)
+        dropout_18, dropout_19 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_2, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_2
+
+        # pd_op.matmul: (1x8x20x64xf32) <- (1x8x20x20xf32, 1x8x20x64xf32)
+        matmul_21 = paddle._C_ops.matmul(dropout_18, transpose_11, False, False)
+        del dropout_18, transpose_11
+
+        # pd_op.transpose: (1x20x8x64xf32) <- (1x8x20x64xf32)
+        transpose_12 = paddle._C_ops.transpose(matmul_21, [0, 2, 1, 3])
+        del matmul_21
+
+        # pd_op.reshape: (1x20x512xf32) <- (1x20x8x64xf32, 3xi64)
+        reshape_11 = paddle._C_ops.reshape(transpose_12, full_int_array_4)
+        del transpose_12
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_22 = paddle._C_ops.matmul(reshape_11, parameter_109, False, False)
+        del parameter_109, reshape_11
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_20, dropout_21 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_22, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_22
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_9 = paddle._C_ops.add(add_7, dropout_20)
+        del add_7, dropout_20
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_5 = paddle._C_ops.pow(add_9, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_5 = paddle._C_ops.mean(pow_5, full_int_array_1, True)
+        del pow_5
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_13 = paddle._C_ops.scale(mean_5, full_3, float("1e-06"), True)
+        del mean_5
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_5 = paddle._C_ops.rsqrt(scale_13)
+        del scale_13
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_10 = paddle._C_ops.multiply(add_9, rsqrt_5)
+        del rsqrt_5
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_11 = paddle._C_ops.multiply(parameter_105, multiply_10)
+        del multiply_10, parameter_105
+
+        # pd_op.matmul: (1x20x2048xf32) <- (1x20x512xf32, 512x2048xf32)
+        matmul_23 = paddle._C_ops.matmul(multiply_11, parameter_107, False, False)
+        del multiply_11, parameter_107
+
+        # pd_op.relu: (1x20x2048xf32) <- (1x20x2048xf32)
+        relu_2 = paddle._C_ops.relu(matmul_23)
+        del matmul_23
+
+        # pd_op.dropout: (1x20x2048xf32, 1x20x2048xui8) <- (1x20x2048xf32, None, 1xf32)
+        dropout_22, dropout_23 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_2, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_2
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x2048xf32, 2048x512xf32)
+        matmul_24 = paddle._C_ops.matmul(dropout_22, parameter_106, False, False)
+        del dropout_22, parameter_106
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_24, dropout_25 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_24, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_24
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_10 = paddle._C_ops.add(dropout_24, add_9)
+        del add_9, dropout_24
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_6 = paddle._C_ops.pow(add_10, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_6 = paddle._C_ops.mean(pow_6, full_int_array_1, True)
+        del pow_6
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_14 = paddle._C_ops.scale(mean_6, full_3, float("1e-06"), True)
+        del mean_6
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_6 = paddle._C_ops.rsqrt(scale_14)
+        del scale_14
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_12 = paddle._C_ops.multiply(add_10, rsqrt_6)
+        del rsqrt_6
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_13 = paddle._C_ops.multiply(parameter_100, multiply_12)
+        del multiply_12, parameter_100
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_25 = paddle._C_ops.matmul(multiply_13, parameter_104, False, False)
+        del parameter_104
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_12 = paddle._C_ops.reshape(matmul_25, full_int_array_2)
+        del matmul_25
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_13 = paddle._C_ops.transpose(reshape_12, [0, 2, 1, 3])
+        del reshape_12
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_26 = paddle._C_ops.matmul(multiply_13, parameter_103, False, False)
+        del parameter_103
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_13 = paddle._C_ops.reshape(matmul_26, full_int_array_2)
+        del matmul_26
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_14 = paddle._C_ops.transpose(reshape_13, [0, 2, 1, 3])
+        del reshape_13
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_27 = paddle._C_ops.matmul(multiply_13, parameter_102, False, False)
+        del multiply_13, parameter_102
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_14 = paddle._C_ops.reshape(matmul_27, full_int_array_2)
+        del matmul_27
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_15 = paddle._C_ops.transpose(reshape_14, [0, 2, 1, 3])
+        del reshape_14
+
+        # pd_op.matmul: (1x8x20x20xf32) <- (1x8x20x64xf32, 1x8x20x64xf32)
+        matmul_28 = paddle._C_ops.matmul(transpose_13, transpose_14, False, True)
+        del transpose_13, transpose_14
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x8x20x20xf32)
+        add_11 = paddle._C_ops.add(matmul_28, add_1)
+        del matmul_28
+
+        # pd_op.softmax: (1x8x20x20xf32) <- (1x8x20x20xf32)
+        softmax_3 = paddle._C_ops.softmax(add_11, -1)
+        del add_11
+
+        # pd_op.dropout: (1x8x20x20xf32, 1x8x20x20xui8) <- (1x8x20x20xf32, None, 1xf32)
+        dropout_26, dropout_27 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_3, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_3
+
+        # pd_op.matmul: (1x8x20x64xf32) <- (1x8x20x20xf32, 1x8x20x64xf32)
+        matmul_29 = paddle._C_ops.matmul(dropout_26, transpose_15, False, False)
+        del dropout_26, transpose_15
+
+        # pd_op.transpose: (1x20x8x64xf32) <- (1x8x20x64xf32)
+        transpose_16 = paddle._C_ops.transpose(matmul_29, [0, 2, 1, 3])
+        del matmul_29
+
+        # pd_op.reshape: (1x20x512xf32) <- (1x20x8x64xf32, 3xi64)
+        reshape_15 = paddle._C_ops.reshape(transpose_16, full_int_array_4)
+        del transpose_16
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_30 = paddle._C_ops.matmul(reshape_15, parameter_101, False, False)
+        del parameter_101, reshape_15
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_28, dropout_29 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_30, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_30
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_12 = paddle._C_ops.add(add_10, dropout_28)
+        del add_10, dropout_28
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_7 = paddle._C_ops.pow(add_12, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_7 = paddle._C_ops.mean(pow_7, full_int_array_1, True)
+        del pow_7
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_15 = paddle._C_ops.scale(mean_7, full_3, float("1e-06"), True)
+        del mean_7
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_7 = paddle._C_ops.rsqrt(scale_15)
+        del scale_15
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_14 = paddle._C_ops.multiply(add_12, rsqrt_7)
+        del rsqrt_7
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_15 = paddle._C_ops.multiply(parameter_97, multiply_14)
+        del multiply_14, parameter_97
+
+        # pd_op.matmul: (1x20x2048xf32) <- (1x20x512xf32, 512x2048xf32)
+        matmul_31 = paddle._C_ops.matmul(multiply_15, parameter_99, False, False)
+        del multiply_15, parameter_99
+
+        # pd_op.relu: (1x20x2048xf32) <- (1x20x2048xf32)
+        relu_3 = paddle._C_ops.relu(matmul_31)
+        del matmul_31
+
+        # pd_op.dropout: (1x20x2048xf32, 1x20x2048xui8) <- (1x20x2048xf32, None, 1xf32)
+        dropout_30, dropout_31 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_3, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_3
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x2048xf32, 2048x512xf32)
+        matmul_32 = paddle._C_ops.matmul(dropout_30, parameter_98, False, False)
+        del dropout_30, parameter_98
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_32, dropout_33 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_32, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_32
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_13 = paddle._C_ops.add(dropout_32, add_12)
+        del add_12, dropout_32
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_8 = paddle._C_ops.pow(add_13, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_8 = paddle._C_ops.mean(pow_8, full_int_array_1, True)
+        del pow_8
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_16 = paddle._C_ops.scale(mean_8, full_3, float("1e-06"), True)
+        del mean_8
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_8 = paddle._C_ops.rsqrt(scale_16)
+        del scale_16
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_16 = paddle._C_ops.multiply(add_13, rsqrt_8)
+        del rsqrt_8
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_17 = paddle._C_ops.multiply(parameter_92, multiply_16)
+        del multiply_16, parameter_92
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_33 = paddle._C_ops.matmul(multiply_17, parameter_96, False, False)
+        del parameter_96
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_16 = paddle._C_ops.reshape(matmul_33, full_int_array_2)
+        del matmul_33
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_17 = paddle._C_ops.transpose(reshape_16, [0, 2, 1, 3])
+        del reshape_16
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_34 = paddle._C_ops.matmul(multiply_17, parameter_95, False, False)
+        del parameter_95
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_17 = paddle._C_ops.reshape(matmul_34, full_int_array_2)
+        del matmul_34
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_18 = paddle._C_ops.transpose(reshape_17, [0, 2, 1, 3])
+        del reshape_17
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_35 = paddle._C_ops.matmul(multiply_17, parameter_94, False, False)
+        del multiply_17, parameter_94
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_18 = paddle._C_ops.reshape(matmul_35, full_int_array_2)
+        del matmul_35
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_19 = paddle._C_ops.transpose(reshape_18, [0, 2, 1, 3])
+        del reshape_18
+
+        # pd_op.matmul: (1x8x20x20xf32) <- (1x8x20x64xf32, 1x8x20x64xf32)
+        matmul_36 = paddle._C_ops.matmul(transpose_17, transpose_18, False, True)
+        del transpose_17, transpose_18
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x8x20x20xf32)
+        add_14 = paddle._C_ops.add(matmul_36, add_1)
+        del matmul_36
+
+        # pd_op.softmax: (1x8x20x20xf32) <- (1x8x20x20xf32)
+        softmax_4 = paddle._C_ops.softmax(add_14, -1)
+        del add_14
+
+        # pd_op.dropout: (1x8x20x20xf32, 1x8x20x20xui8) <- (1x8x20x20xf32, None, 1xf32)
+        dropout_34, dropout_35 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_4, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_4
+
+        # pd_op.matmul: (1x8x20x64xf32) <- (1x8x20x20xf32, 1x8x20x64xf32)
+        matmul_37 = paddle._C_ops.matmul(dropout_34, transpose_19, False, False)
+        del dropout_34, transpose_19
+
+        # pd_op.transpose: (1x20x8x64xf32) <- (1x8x20x64xf32)
+        transpose_20 = paddle._C_ops.transpose(matmul_37, [0, 2, 1, 3])
+        del matmul_37
+
+        # pd_op.reshape: (1x20x512xf32) <- (1x20x8x64xf32, 3xi64)
+        reshape_19 = paddle._C_ops.reshape(transpose_20, full_int_array_4)
+        del transpose_20
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_38 = paddle._C_ops.matmul(reshape_19, parameter_93, False, False)
+        del parameter_93, reshape_19
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_36, dropout_37 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_38, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_38
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_15 = paddle._C_ops.add(add_13, dropout_36)
+        del add_13, dropout_36
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_9 = paddle._C_ops.pow(add_15, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_9 = paddle._C_ops.mean(pow_9, full_int_array_1, True)
+        del pow_9
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_17 = paddle._C_ops.scale(mean_9, full_3, float("1e-06"), True)
+        del mean_9
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_9 = paddle._C_ops.rsqrt(scale_17)
+        del scale_17
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_18 = paddle._C_ops.multiply(add_15, rsqrt_9)
+        del rsqrt_9
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_19 = paddle._C_ops.multiply(parameter_89, multiply_18)
+        del multiply_18, parameter_89
+
+        # pd_op.matmul: (1x20x2048xf32) <- (1x20x512xf32, 512x2048xf32)
+        matmul_39 = paddle._C_ops.matmul(multiply_19, parameter_91, False, False)
+        del multiply_19, parameter_91
+
+        # pd_op.relu: (1x20x2048xf32) <- (1x20x2048xf32)
+        relu_4 = paddle._C_ops.relu(matmul_39)
+        del matmul_39
+
+        # pd_op.dropout: (1x20x2048xf32, 1x20x2048xui8) <- (1x20x2048xf32, None, 1xf32)
+        dropout_38, dropout_39 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_4, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_4
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x2048xf32, 2048x512xf32)
+        matmul_40 = paddle._C_ops.matmul(dropout_38, parameter_90, False, False)
+        del dropout_38, parameter_90
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_40, dropout_41 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_40, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_40
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_16 = paddle._C_ops.add(dropout_40, add_15)
+        del add_15, dropout_40
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_10 = paddle._C_ops.pow(add_16, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_10 = paddle._C_ops.mean(pow_10, full_int_array_1, True)
+        del pow_10
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_18 = paddle._C_ops.scale(mean_10, full_3, float("1e-06"), True)
+        del mean_10
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_10 = paddle._C_ops.rsqrt(scale_18)
+        del scale_18
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_20 = paddle._C_ops.multiply(add_16, rsqrt_10)
+        del rsqrt_10
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_21 = paddle._C_ops.multiply(parameter_84, multiply_20)
+        del multiply_20, parameter_84
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_41 = paddle._C_ops.matmul(multiply_21, parameter_88, False, False)
+        del parameter_88
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_20 = paddle._C_ops.reshape(matmul_41, full_int_array_2)
+        del matmul_41
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_21 = paddle._C_ops.transpose(reshape_20, [0, 2, 1, 3])
+        del reshape_20
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_42 = paddle._C_ops.matmul(multiply_21, parameter_87, False, False)
+        del parameter_87
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_21 = paddle._C_ops.reshape(matmul_42, full_int_array_2)
+        del matmul_42
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_22 = paddle._C_ops.transpose(reshape_21, [0, 2, 1, 3])
+        del reshape_21
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_43 = paddle._C_ops.matmul(multiply_21, parameter_86, False, False)
+        del multiply_21, parameter_86
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_22 = paddle._C_ops.reshape(matmul_43, full_int_array_2)
+        del matmul_43
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_23 = paddle._C_ops.transpose(reshape_22, [0, 2, 1, 3])
+        del reshape_22
+
+        # pd_op.matmul: (1x8x20x20xf32) <- (1x8x20x64xf32, 1x8x20x64xf32)
+        matmul_44 = paddle._C_ops.matmul(transpose_21, transpose_22, False, True)
+        del transpose_21, transpose_22
+
+        # pd_op.add: (1x8x20x20xf32) <- (1x8x20x20xf32, 1x8x20x20xf32)
+        add_17 = paddle._C_ops.add(matmul_44, add_1)
+        del add_1, matmul_44
+
+        # pd_op.softmax: (1x8x20x20xf32) <- (1x8x20x20xf32)
+        softmax_5 = paddle._C_ops.softmax(add_17, -1)
+        del add_17
+
+        # pd_op.dropout: (1x8x20x20xf32, 1x8x20x20xui8) <- (1x8x20x20xf32, None, 1xf32)
+        dropout_42, dropout_43 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_5, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_5
+
+        # pd_op.matmul: (1x8x20x64xf32) <- (1x8x20x20xf32, 1x8x20x64xf32)
+        matmul_45 = paddle._C_ops.matmul(dropout_42, transpose_23, False, False)
+        del dropout_42, transpose_23
+
+        # pd_op.transpose: (1x20x8x64xf32) <- (1x8x20x64xf32)
+        transpose_24 = paddle._C_ops.transpose(matmul_45, [0, 2, 1, 3])
+        del matmul_45
+
+        # pd_op.reshape: (1x20x512xf32) <- (1x20x8x64xf32, 3xi64)
+        reshape_23 = paddle._C_ops.reshape(transpose_24, full_int_array_4)
+        del transpose_24
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_46 = paddle._C_ops.matmul(reshape_23, parameter_85, False, False)
+        del parameter_85, reshape_23
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_44, dropout_45 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_46, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_46
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_18 = paddle._C_ops.add(add_16, dropout_44)
+        del add_16, dropout_44
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_11 = paddle._C_ops.pow(add_18, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_11 = paddle._C_ops.mean(pow_11, full_int_array_1, True)
+        del pow_11
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_19 = paddle._C_ops.scale(mean_11, full_3, float("1e-06"), True)
+        del mean_11
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_11 = paddle._C_ops.rsqrt(scale_19)
+        del scale_19
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_22 = paddle._C_ops.multiply(add_18, rsqrt_11)
+        del rsqrt_11
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_23 = paddle._C_ops.multiply(parameter_81, multiply_22)
+        del multiply_22, parameter_81
+
+        # pd_op.matmul: (1x20x2048xf32) <- (1x20x512xf32, 512x2048xf32)
+        matmul_47 = paddle._C_ops.matmul(multiply_23, parameter_83, False, False)
+        del multiply_23, parameter_83
+
+        # pd_op.relu: (1x20x2048xf32) <- (1x20x2048xf32)
+        relu_5 = paddle._C_ops.relu(matmul_47)
+        del matmul_47
+
+        # pd_op.dropout: (1x20x2048xf32, 1x20x2048xui8) <- (1x20x2048xf32, None, 1xf32)
+        dropout_46, dropout_47 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_5, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_5
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x2048xf32, 2048x512xf32)
+        matmul_48 = paddle._C_ops.matmul(dropout_46, parameter_82, False, False)
+        del dropout_46, parameter_82
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_48, dropout_49 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_48, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_48
+
+        # pd_op.add: (1x20x512xf32) <- (1x20x512xf32, 1x20x512xf32)
+        add_19 = paddle._C_ops.add(dropout_48, add_18)
+        del add_18, dropout_48
+
+        # pd_op.pow: (1x20x512xf32) <- (1x20x512xf32)
+        pow_12 = paddle._C_ops.pow(add_19, float("2"))
+
+        # pd_op.mean: (1x20x1xf32) <- (1x20x512xf32, 1xi64)
+        mean_12 = paddle._C_ops.mean(pow_12, full_int_array_1, True)
+        del pow_12
+
+        # pd_op.scale: (1x20x1xf32) <- (1x20x1xf32, 1xf32)
+        scale_20 = paddle._C_ops.scale(mean_12, full_3, float("1e-06"), True)
+        del mean_12
+
+        # pd_op.rsqrt: (1x20x1xf32) <- (1x20x1xf32)
+        rsqrt_12 = paddle._C_ops.rsqrt(scale_20)
+        del scale_20
+
+        # pd_op.multiply: (1x20x512xf32) <- (1x20x512xf32, 1x20x1xf32)
+        multiply_24 = paddle._C_ops.multiply(add_19, rsqrt_12)
+        del add_19, rsqrt_12
+
+        # pd_op.multiply: (1x20x512xf32) <- (512xf32, 1x20x512xf32)
+        multiply_25 = paddle._C_ops.multiply(parameter_80, multiply_24)
+        del multiply_24, parameter_80
+
+        # pd_op.dropout: (1x20x512xf32, 1x20x512xui8) <- (1x20x512xf32, None, 1xf32)
+        dropout_50, dropout_51 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                multiply_25, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del multiply_25
+
+        # pd_op.embedding: (1x1x512xf32) <- (1x1xi64, 32128x512xf32)
+        embedding_2 = paddle._C_ops.embedding(data_2, parameter_130, -1, False)
+        del data_2
+
+        # pd_op.full: (1x1xf32) <- ()
+        full_14 = paddle._C_ops.full(
+            [1, 1],
+            float("1"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.arange: (1xi64) <- (1xf64, 1xf64, 1xf64)
+        arange_1 = paddle.arange(full_4, full_6, full_6, dtype="int64")
+        del full_4, full_6
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_5 = [0, 1]
+
+        # pd_op.unsqueeze: (1x1x1xi64) <- (1xi64, 2xi64)
+        unsqueeze_4 = paddle._C_ops.unsqueeze(arange_1, full_int_array_5)
+        del full_int_array_5
+
+        # pd_op.full_int_array: (3xi64) <- ()
+        full_int_array_6 = [1, 1, 1]
+
+        # pd_op.tile: (1x1x1xi64) <- (1x1x1xi64, 3xi64)
+        tile_0 = paddle._C_ops.tile(unsqueeze_4, full_int_array_6)
+        del full_int_array_6, unsqueeze_4
+
+        # pd_op.full_int_array: (2xi64) <- ()
+        full_int_array_7 = [0, 2]
+
+        # pd_op.unsqueeze: (1x1x1xi64) <- (1xi64, 2xi64)
+        unsqueeze_5 = paddle._C_ops.unsqueeze(arange_1, full_int_array_7)
+        del full_int_array_7
+
+        # pd_op.less_equal: (1x1x1xb) <- (1x1x1xi64, 1x1x1xi64)
+        less_equal_0 = paddle._C_ops.less_equal(tile_0, unsqueeze_5)
+        del tile_0, unsqueeze_5
+
+        # pd_op.cast: (1x1x1xf32) <- (1x1x1xb)
+        cast_4 = paddle._C_ops.cast(less_equal_0, paddle.float32)
+        del less_equal_0
+
+        # pd_op.full_int_array: (1xi64) <- ()
+        full_int_array_8 = [1]
+
+        # pd_op.unsqueeze: (1x1x1x1xf32) <- (1x1x1xf32, 1xi64)
+        unsqueeze_6 = paddle._C_ops.unsqueeze(cast_4, full_int_array_8)
+        del cast_4, full_int_array_8
+
+        # pd_op.unsqueeze: (1x1x1x1xf32) <- (1x1xf32, 2xi64)
+        unsqueeze_7 = paddle._C_ops.unsqueeze(full_14, full_int_array_0)
+        del full_14, full_int_array_0
+
+        # pd_op.multiply: (1x1x1x1xf32) <- (1x1x1x1xf32, 1x1x1x1xf32)
+        multiply_26 = paddle._C_ops.multiply(unsqueeze_6, unsqueeze_7)
+        del unsqueeze_6, unsqueeze_7
+
+        # pd_op.scale: (1x1x1x1xf32) <- (1x1x1x1xf32, 1xf32)
+        scale_21 = paddle._C_ops.scale(multiply_26, full_0, float("1"), True)
+        del multiply_26
+
+        # pd_op.scale: (1x1x1x1xf32) <- (1x1x1x1xf32, 1xf32)
+        scale_22 = paddle._C_ops.scale(scale_21, full_1, float("0"), True)
+        del full_1, scale_21
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_52, dropout_53 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                embedding_2, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del embedding_2
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_13 = paddle._C_ops.pow(dropout_52, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_13 = paddle._C_ops.mean(pow_13, full_int_array_1, True)
+        del pow_13
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_23 = paddle._C_ops.scale(mean_13, full_3, float("1e-06"), True)
+        del mean_13
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_13 = paddle._C_ops.rsqrt(scale_23)
+        del scale_23
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_27 = paddle._C_ops.multiply(dropout_52, rsqrt_13)
+        del rsqrt_13
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_28 = paddle._C_ops.multiply(parameter_74, multiply_27)
+        del multiply_27, parameter_74
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_49 = paddle._C_ops.matmul(multiply_28, parameter_79, False, False)
+        del parameter_79
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_24 = paddle._C_ops.reshape(matmul_49, full_int_array_2)
+        del matmul_49
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_25 = paddle._C_ops.transpose(reshape_24, [0, 2, 1, 3])
+        del reshape_24
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_50 = paddle._C_ops.matmul(multiply_28, parameter_78, False, False)
+        del parameter_78
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_25 = paddle._C_ops.reshape(matmul_50, full_int_array_2)
+        del matmul_50
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_26 = paddle._C_ops.transpose(reshape_25, [0, 2, 1, 3])
+        del reshape_25
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_51 = paddle._C_ops.matmul(multiply_28, parameter_77, False, False)
+        del multiply_28, parameter_77
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_26 = paddle._C_ops.reshape(matmul_51, full_int_array_2)
+        del matmul_51
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_27 = paddle._C_ops.transpose(reshape_26, [0, 2, 1, 3])
+        del reshape_26
+
+        # pd_op.matmul: (1x8x1x1xf32) <- (1x8x1x64xf32, 1x8x1x64xf32)
+        matmul_52 = paddle._C_ops.matmul(transpose_25, transpose_26, False, True)
+        del transpose_25
+
+        # pd_op.unsqueeze: (1x1xi64) <- (1xi64, 1xi64)
+        unsqueeze_8 = paddle._C_ops.unsqueeze(arange_1, full_int_array_1)
+
+        # pd_op.unsqueeze: (1x1xi64) <- (1xi64, 1xi64)
+        unsqueeze_9 = paddle._C_ops.unsqueeze(arange_1, full_int_array_3)
+        del arange_1
+
+        # pd_op.subtract: (1x1xi64) <- (1x1xi64, 1x1xi64)
+        subtract_1 = paddle._C_ops.subtract(unsqueeze_9, unsqueeze_8)
+        del unsqueeze_8, unsqueeze_9
+
+        # pd_op.full: (1xf32) <- ()
+        full_15 = paddle._C_ops.full(
+            [1], float("0"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full_like: (1x1xi64) <- (1x1xi64, 1xf32)
+        full_like_1 = paddle._C_ops.full_like(
+            subtract_1,
+            full_15,
+            paddle.int64,
+            paddle.framework._current_expected_place(),
+        )
+        del full_15
+
+        # pd_op.minimum: (1x1xi64) <- (1x1xi64, 1x1xi64)
+        minimum_1 = paddle._C_ops.minimum(subtract_1, full_like_1)
+        del full_like_1, subtract_1
+
+        # pd_op.scale: (1x1xi64) <- (1x1xi64, 1xf32)
+        scale_24 = paddle._C_ops.scale(minimum_1, full_0, float("0"), True)
+        del full_0, minimum_1
+
+        # pd_op.full: (xi64) <- ()
+        full_16 = paddle._C_ops.full(
+            [], float("16"), paddle.int64, paddle.framework._current_expected_place()
+        )
+
+        # pd_op.less_than: (1x1xb) <- (1x1xi64, xi64)
+        less_than_1 = paddle._C_ops.less_than(scale_24, full_16)
+        del full_16
+
+        # pd_op.cast: (1x1xf32) <- (1x1xi64)
+        cast_5 = paddle._C_ops.cast(scale_24, paddle.float32)
+
+        # pd_op.full: (1xf32) <- ()
+        full_17 = paddle._C_ops.full(
+            [1], float("0.0625"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1xf32) <- (1x1xf32, 1xf32)
+        scale_25 = paddle._C_ops.scale(cast_5, full_17, float("0"), True)
+        del cast_5, full_17
+
+        # pd_op.log: (1x1xf32) <- (1x1xf32)
+        log_1 = paddle._C_ops.log(scale_25)
+        del scale_25
+
+        # pd_op.full: (1xf32) <- ()
+        full_18 = paddle._C_ops.full(
+            [1], float("0.480898"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1xf32) <- (1x1xf32, 1xf32)
+        scale_26 = paddle._C_ops.scale(log_1, full_18, float("0"), True)
+        del full_18, log_1
+
+        # pd_op.scale: (1x1xf32) <- (1x1xf32, 1xf32)
+        scale_27 = paddle._C_ops.scale(scale_26, full_8, float("0"), True)
+        del full_8, scale_26
+
+        # pd_op.cast: (1x1xi64) <- (1x1xf32)
+        cast_6 = paddle._C_ops.cast(scale_27, paddle.int64)
+        del scale_27
+
+        # pd_op.scale: (1x1xi64) <- (1x1xi64, 1xf32)
+        scale_28 = paddle._C_ops.scale(cast_6, full_3, float("16"), True)
+        del cast_6
+
+        # pd_op.full: (1xf32) <- ()
+        full_19 = paddle._C_ops.full(
+            [1], float("31"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.full_like: (1x1xi64) <- (1x1xi64, 1xf32)
+        full_like_2 = paddle._C_ops.full_like(
+            scale_28, full_19, paddle.int64, paddle.framework._current_expected_place()
+        )
+        del full_19
+
+        # pd_op.minimum: (1x1xi64) <- (1x1xi64, 1x1xi64)
+        minimum_2 = paddle._C_ops.minimum(scale_28, full_like_2)
+        del full_like_2, scale_28
+
+        # pd_op.where: (1x1xi64) <- (1x1xb, 1x1xi64, 1x1xi64)
+        where_1 = paddle._C_ops.where(less_than_1, scale_24, minimum_2)
+        del less_than_1, minimum_2, scale_24
+
+        # pd_op.scale: (1x1xi64) <- (1x1xi64, 1xf32)
+        scale_29 = paddle._C_ops.scale(where_1, full_3, float("0"), True)
+        del where_1
+
+        # pd_op.embedding: (1x1x8xf32) <- (1x1xi64, 32x8xf32)
+        embedding_3 = paddle._C_ops.embedding(scale_29, parameter_75, -1, False)
+        del parameter_75, scale_29
+
+        # pd_op.transpose: (8x1x1xf32) <- (1x1x8xf32)
+        transpose_28 = paddle._C_ops.transpose(embedding_3, [2, 0, 1])
+        del embedding_3
+
+        # pd_op.unsqueeze: (1x8x1x1xf32) <- (8x1x1xf32, 1xi64)
+        unsqueeze_10 = paddle._C_ops.unsqueeze(transpose_28, full_int_array_3)
+        del full_int_array_3, transpose_28
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x1x1x1xf32)
+        add_20 = paddle._C_ops.add(unsqueeze_10, scale_22)
+        del scale_22, unsqueeze_10
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x8x1x1xf32)
+        add_21 = paddle._C_ops.add(matmul_52, add_20)
+        del matmul_52
+
+        # pd_op.softmax: (1x8x1x1xf32) <- (1x8x1x1xf32)
+        softmax_6 = paddle._C_ops.softmax(add_21, -1)
+        del add_21
+
+        # pd_op.dropout: (1x8x1x1xf32, 1x8x1x1xui8) <- (1x8x1x1xf32, None, 1xf32)
+        dropout_54, dropout_55 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_6, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_6
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x1xf32, 1x8x1x64xf32)
+        matmul_53 = paddle._C_ops.matmul(dropout_54, transpose_27, False, False)
+        del dropout_54
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_29 = paddle._C_ops.transpose(matmul_53, [0, 2, 1, 3])
+        del matmul_53
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_27 = paddle._C_ops.reshape(transpose_29, full_int_array_4)
+        del transpose_29
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_54 = paddle._C_ops.matmul(reshape_27, parameter_76, False, False)
+        del parameter_76, reshape_27
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_56, dropout_57 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_54, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_54
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_22 = paddle._C_ops.add(dropout_52, dropout_56)
+        del dropout_52, dropout_56
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_14 = paddle._C_ops.pow(add_22, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_14 = paddle._C_ops.mean(pow_14, full_int_array_1, True)
+        del pow_14
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_30 = paddle._C_ops.scale(mean_14, full_3, float("1e-06"), True)
+        del mean_14
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_14 = paddle._C_ops.rsqrt(scale_30)
+        del scale_30
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_29 = paddle._C_ops.multiply(add_22, rsqrt_14)
+        del rsqrt_14
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_30 = paddle._C_ops.multiply(parameter_69, multiply_29)
+        del multiply_29, parameter_69
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_55 = paddle._C_ops.matmul(multiply_30, parameter_73, False, False)
+        del multiply_30, parameter_73
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_28 = paddle._C_ops.reshape(matmul_55, full_int_array_2)
+        del matmul_55
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_30 = paddle._C_ops.transpose(reshape_28, [0, 2, 1, 3])
+        del reshape_28
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_56 = paddle._C_ops.matmul(dropout_50, parameter_72, False, False)
+        del parameter_72
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_29 = paddle._C_ops.reshape(matmul_56, full_int_array_2)
+        del matmul_56
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_31 = paddle._C_ops.transpose(reshape_29, [0, 2, 1, 3])
+        del reshape_29
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_57 = paddle._C_ops.matmul(dropout_50, parameter_71, False, False)
+        del parameter_71
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_30 = paddle._C_ops.reshape(matmul_57, full_int_array_2)
+        del matmul_57
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_32 = paddle._C_ops.transpose(reshape_30, [0, 2, 1, 3])
+        del reshape_30
+
+        # pd_op.matmul: (1x8x1x20xf32) <- (1x8x1x64xf32, 1x8x20x64xf32)
+        matmul_58 = paddle._C_ops.matmul(transpose_30, transpose_31, False, True)
+        del transpose_30
+
+        # pd_op.full: (1x8x1x20xf32) <- ()
+        full_20 = paddle._C_ops.full(
+            [1, 8, 1, 20],
+            float("0"),
+            paddle.float32,
+            paddle.framework._current_expected_place(),
+        )
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x1x1x20xf32)
+        add_23 = paddle._C_ops.add(full_20, scale_1)
+        del full_20, scale_1
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x8x1x20xf32)
+        add_24 = paddle._C_ops.add(matmul_58, add_23)
+        del matmul_58
+
+        # pd_op.softmax: (1x8x1x20xf32) <- (1x8x1x20xf32)
+        softmax_7 = paddle._C_ops.softmax(add_24, -1)
+        del add_24
+
+        # pd_op.dropout: (1x8x1x20xf32, 1x8x1x20xui8) <- (1x8x1x20xf32, None, 1xf32)
+        dropout_58, dropout_59 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_7, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_7
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x20xf32, 1x8x20x64xf32)
+        matmul_59 = paddle._C_ops.matmul(dropout_58, transpose_32, False, False)
+        del dropout_58
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_33 = paddle._C_ops.transpose(matmul_59, [0, 2, 1, 3])
+        del matmul_59
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_31 = paddle._C_ops.reshape(transpose_33, full_int_array_4)
+        del transpose_33
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_60 = paddle._C_ops.matmul(reshape_31, parameter_70, False, False)
+        del parameter_70, reshape_31
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_60, dropout_61 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_60, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_60
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_25 = paddle._C_ops.add(add_22, dropout_60)
+        del add_22, dropout_60
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_15 = paddle._C_ops.pow(add_25, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_15 = paddle._C_ops.mean(pow_15, full_int_array_1, True)
+        del pow_15
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_31 = paddle._C_ops.scale(mean_15, full_3, float("1e-06"), True)
+        del mean_15
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_15 = paddle._C_ops.rsqrt(scale_31)
+        del scale_31
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_31 = paddle._C_ops.multiply(add_25, rsqrt_15)
+        del rsqrt_15
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_32 = paddle._C_ops.multiply(parameter_66, multiply_31)
+        del multiply_31, parameter_66
+
+        # pd_op.matmul: (1x1x2048xf32) <- (1x1x512xf32, 512x2048xf32)
+        matmul_61 = paddle._C_ops.matmul(multiply_32, parameter_68, False, False)
+        del multiply_32, parameter_68
+
+        # pd_op.relu: (1x1x2048xf32) <- (1x1x2048xf32)
+        relu_6 = paddle._C_ops.relu(matmul_61)
+        del matmul_61
+
+        # pd_op.dropout: (1x1x2048xf32, 1x1x2048xui8) <- (1x1x2048xf32, None, 1xf32)
+        dropout_62, dropout_63 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_6, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_6
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x2048xf32, 2048x512xf32)
+        matmul_62 = paddle._C_ops.matmul(dropout_62, parameter_67, False, False)
+        del dropout_62, parameter_67
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_64, dropout_65 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_62, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_62
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_26 = paddle._C_ops.add(dropout_64, add_25)
+        del add_25, dropout_64
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_16 = paddle._C_ops.pow(add_26, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_16 = paddle._C_ops.mean(pow_16, full_int_array_1, True)
+        del pow_16
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_32 = paddle._C_ops.scale(mean_16, full_3, float("1e-06"), True)
+        del mean_16
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_16 = paddle._C_ops.rsqrt(scale_32)
+        del scale_32
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_33 = paddle._C_ops.multiply(add_26, rsqrt_16)
+        del rsqrt_16
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_34 = paddle._C_ops.multiply(parameter_61, multiply_33)
+        del multiply_33, parameter_61
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_63 = paddle._C_ops.matmul(multiply_34, parameter_65, False, False)
+        del parameter_65
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_32 = paddle._C_ops.reshape(matmul_63, full_int_array_2)
+        del matmul_63
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_34 = paddle._C_ops.transpose(reshape_32, [0, 2, 1, 3])
+        del reshape_32
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_64 = paddle._C_ops.matmul(multiply_34, parameter_64, False, False)
+        del parameter_64
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_33 = paddle._C_ops.reshape(matmul_64, full_int_array_2)
+        del matmul_64
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_35 = paddle._C_ops.transpose(reshape_33, [0, 2, 1, 3])
+        del reshape_33
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_65 = paddle._C_ops.matmul(multiply_34, parameter_63, False, False)
+        del multiply_34, parameter_63
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_34 = paddle._C_ops.reshape(matmul_65, full_int_array_2)
+        del matmul_65
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_36 = paddle._C_ops.transpose(reshape_34, [0, 2, 1, 3])
+        del reshape_34
+
+        # pd_op.matmul: (1x8x1x1xf32) <- (1x8x1x64xf32, 1x8x1x64xf32)
+        matmul_66 = paddle._C_ops.matmul(transpose_34, transpose_35, False, True)
+        del transpose_34
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x8x1x1xf32)
+        add_27 = paddle._C_ops.add(matmul_66, add_20)
+        del matmul_66
+
+        # pd_op.softmax: (1x8x1x1xf32) <- (1x8x1x1xf32)
+        softmax_8 = paddle._C_ops.softmax(add_27, -1)
+        del add_27
+
+        # pd_op.dropout: (1x8x1x1xf32, 1x8x1x1xui8) <- (1x8x1x1xf32, None, 1xf32)
+        dropout_66, dropout_67 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_8, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_8
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x1xf32, 1x8x1x64xf32)
+        matmul_67 = paddle._C_ops.matmul(dropout_66, transpose_36, False, False)
+        del dropout_66
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_37 = paddle._C_ops.transpose(matmul_67, [0, 2, 1, 3])
+        del matmul_67
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_35 = paddle._C_ops.reshape(transpose_37, full_int_array_4)
+        del transpose_37
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_68 = paddle._C_ops.matmul(reshape_35, parameter_62, False, False)
+        del parameter_62, reshape_35
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_68, dropout_69 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_68, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_68
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_28 = paddle._C_ops.add(add_26, dropout_68)
+        del add_26, dropout_68
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_17 = paddle._C_ops.pow(add_28, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_17 = paddle._C_ops.mean(pow_17, full_int_array_1, True)
+        del pow_17
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_33 = paddle._C_ops.scale(mean_17, full_3, float("1e-06"), True)
+        del mean_17
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_17 = paddle._C_ops.rsqrt(scale_33)
+        del scale_33
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_35 = paddle._C_ops.multiply(add_28, rsqrt_17)
+        del rsqrt_17
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_36 = paddle._C_ops.multiply(parameter_56, multiply_35)
+        del multiply_35, parameter_56
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_69 = paddle._C_ops.matmul(multiply_36, parameter_60, False, False)
+        del multiply_36, parameter_60
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_36 = paddle._C_ops.reshape(matmul_69, full_int_array_2)
+        del matmul_69
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_38 = paddle._C_ops.transpose(reshape_36, [0, 2, 1, 3])
+        del reshape_36
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_70 = paddle._C_ops.matmul(dropout_50, parameter_59, False, False)
+        del parameter_59
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_37 = paddle._C_ops.reshape(matmul_70, full_int_array_2)
+        del matmul_70
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_39 = paddle._C_ops.transpose(reshape_37, [0, 2, 1, 3])
+        del reshape_37
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_71 = paddle._C_ops.matmul(dropout_50, parameter_58, False, False)
+        del parameter_58
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_38 = paddle._C_ops.reshape(matmul_71, full_int_array_2)
+        del matmul_71
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_40 = paddle._C_ops.transpose(reshape_38, [0, 2, 1, 3])
+        del reshape_38
+
+        # pd_op.matmul: (1x8x1x20xf32) <- (1x8x1x64xf32, 1x8x20x64xf32)
+        matmul_72 = paddle._C_ops.matmul(transpose_38, transpose_39, False, True)
+        del transpose_38
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x8x1x20xf32)
+        add_29 = paddle._C_ops.add(matmul_72, add_23)
+        del matmul_72
+
+        # pd_op.softmax: (1x8x1x20xf32) <- (1x8x1x20xf32)
+        softmax_9 = paddle._C_ops.softmax(add_29, -1)
+        del add_29
+
+        # pd_op.dropout: (1x8x1x20xf32, 1x8x1x20xui8) <- (1x8x1x20xf32, None, 1xf32)
+        dropout_70, dropout_71 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_9, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_9
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x20xf32, 1x8x20x64xf32)
+        matmul_73 = paddle._C_ops.matmul(dropout_70, transpose_40, False, False)
+        del dropout_70
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_41 = paddle._C_ops.transpose(matmul_73, [0, 2, 1, 3])
+        del matmul_73
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_39 = paddle._C_ops.reshape(transpose_41, full_int_array_4)
+        del transpose_41
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_74 = paddle._C_ops.matmul(reshape_39, parameter_57, False, False)
+        del parameter_57, reshape_39
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_72, dropout_73 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_74, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_74
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_30 = paddle._C_ops.add(add_28, dropout_72)
+        del add_28, dropout_72
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_18 = paddle._C_ops.pow(add_30, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_18 = paddle._C_ops.mean(pow_18, full_int_array_1, True)
+        del pow_18
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_34 = paddle._C_ops.scale(mean_18, full_3, float("1e-06"), True)
+        del mean_18
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_18 = paddle._C_ops.rsqrt(scale_34)
+        del scale_34
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_37 = paddle._C_ops.multiply(add_30, rsqrt_18)
+        del rsqrt_18
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_38 = paddle._C_ops.multiply(parameter_53, multiply_37)
+        del multiply_37, parameter_53
+
+        # pd_op.matmul: (1x1x2048xf32) <- (1x1x512xf32, 512x2048xf32)
+        matmul_75 = paddle._C_ops.matmul(multiply_38, parameter_55, False, False)
+        del multiply_38, parameter_55
+
+        # pd_op.relu: (1x1x2048xf32) <- (1x1x2048xf32)
+        relu_7 = paddle._C_ops.relu(matmul_75)
+        del matmul_75
+
+        # pd_op.dropout: (1x1x2048xf32, 1x1x2048xui8) <- (1x1x2048xf32, None, 1xf32)
+        dropout_74, dropout_75 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_7, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_7
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x2048xf32, 2048x512xf32)
+        matmul_76 = paddle._C_ops.matmul(dropout_74, parameter_54, False, False)
+        del dropout_74, parameter_54
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_76, dropout_77 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_76, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_76
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_31 = paddle._C_ops.add(dropout_76, add_30)
+        del add_30, dropout_76
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_19 = paddle._C_ops.pow(add_31, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_19 = paddle._C_ops.mean(pow_19, full_int_array_1, True)
+        del pow_19
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_35 = paddle._C_ops.scale(mean_19, full_3, float("1e-06"), True)
+        del mean_19
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_19 = paddle._C_ops.rsqrt(scale_35)
+        del scale_35
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_39 = paddle._C_ops.multiply(add_31, rsqrt_19)
+        del rsqrt_19
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_40 = paddle._C_ops.multiply(parameter_48, multiply_39)
+        del multiply_39, parameter_48
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_77 = paddle._C_ops.matmul(multiply_40, parameter_52, False, False)
+        del parameter_52
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_40 = paddle._C_ops.reshape(matmul_77, full_int_array_2)
+        del matmul_77
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_42 = paddle._C_ops.transpose(reshape_40, [0, 2, 1, 3])
+        del reshape_40
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_78 = paddle._C_ops.matmul(multiply_40, parameter_51, False, False)
+        del parameter_51
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_41 = paddle._C_ops.reshape(matmul_78, full_int_array_2)
+        del matmul_78
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_43 = paddle._C_ops.transpose(reshape_41, [0, 2, 1, 3])
+        del reshape_41
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_79 = paddle._C_ops.matmul(multiply_40, parameter_50, False, False)
+        del multiply_40, parameter_50
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_42 = paddle._C_ops.reshape(matmul_79, full_int_array_2)
+        del matmul_79
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_44 = paddle._C_ops.transpose(reshape_42, [0, 2, 1, 3])
+        del reshape_42
+
+        # pd_op.matmul: (1x8x1x1xf32) <- (1x8x1x64xf32, 1x8x1x64xf32)
+        matmul_80 = paddle._C_ops.matmul(transpose_42, transpose_43, False, True)
+        del transpose_42
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x8x1x1xf32)
+        add_32 = paddle._C_ops.add(matmul_80, add_20)
+        del matmul_80
+
+        # pd_op.softmax: (1x8x1x1xf32) <- (1x8x1x1xf32)
+        softmax_10 = paddle._C_ops.softmax(add_32, -1)
+        del add_32
+
+        # pd_op.dropout: (1x8x1x1xf32, 1x8x1x1xui8) <- (1x8x1x1xf32, None, 1xf32)
+        dropout_78, dropout_79 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_10, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_10
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x1xf32, 1x8x1x64xf32)
+        matmul_81 = paddle._C_ops.matmul(dropout_78, transpose_44, False, False)
+        del dropout_78
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_45 = paddle._C_ops.transpose(matmul_81, [0, 2, 1, 3])
+        del matmul_81
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_43 = paddle._C_ops.reshape(transpose_45, full_int_array_4)
+        del transpose_45
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_82 = paddle._C_ops.matmul(reshape_43, parameter_49, False, False)
+        del parameter_49, reshape_43
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_80, dropout_81 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_82, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_82
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_33 = paddle._C_ops.add(add_31, dropout_80)
+        del add_31, dropout_80
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_20 = paddle._C_ops.pow(add_33, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_20 = paddle._C_ops.mean(pow_20, full_int_array_1, True)
+        del pow_20
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_36 = paddle._C_ops.scale(mean_20, full_3, float("1e-06"), True)
+        del mean_20
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_20 = paddle._C_ops.rsqrt(scale_36)
+        del scale_36
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_41 = paddle._C_ops.multiply(add_33, rsqrt_20)
+        del rsqrt_20
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_42 = paddle._C_ops.multiply(parameter_43, multiply_41)
+        del multiply_41, parameter_43
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_83 = paddle._C_ops.matmul(multiply_42, parameter_47, False, False)
+        del multiply_42, parameter_47
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_44 = paddle._C_ops.reshape(matmul_83, full_int_array_2)
+        del matmul_83
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_46 = paddle._C_ops.transpose(reshape_44, [0, 2, 1, 3])
+        del reshape_44
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_84 = paddle._C_ops.matmul(dropout_50, parameter_46, False, False)
+        del parameter_46
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_45 = paddle._C_ops.reshape(matmul_84, full_int_array_2)
+        del matmul_84
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_47 = paddle._C_ops.transpose(reshape_45, [0, 2, 1, 3])
+        del reshape_45
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_85 = paddle._C_ops.matmul(dropout_50, parameter_45, False, False)
+        del parameter_45
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_46 = paddle._C_ops.reshape(matmul_85, full_int_array_2)
+        del matmul_85
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_48 = paddle._C_ops.transpose(reshape_46, [0, 2, 1, 3])
+        del reshape_46
+
+        # pd_op.matmul: (1x8x1x20xf32) <- (1x8x1x64xf32, 1x8x20x64xf32)
+        matmul_86 = paddle._C_ops.matmul(transpose_46, transpose_47, False, True)
+        del transpose_46
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x8x1x20xf32)
+        add_34 = paddle._C_ops.add(matmul_86, add_23)
+        del matmul_86
+
+        # pd_op.softmax: (1x8x1x20xf32) <- (1x8x1x20xf32)
+        softmax_11 = paddle._C_ops.softmax(add_34, -1)
+        del add_34
+
+        # pd_op.dropout: (1x8x1x20xf32, 1x8x1x20xui8) <- (1x8x1x20xf32, None, 1xf32)
+        dropout_82, dropout_83 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_11, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_11
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x20xf32, 1x8x20x64xf32)
+        matmul_87 = paddle._C_ops.matmul(dropout_82, transpose_48, False, False)
+        del dropout_82
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_49 = paddle._C_ops.transpose(matmul_87, [0, 2, 1, 3])
+        del matmul_87
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_47 = paddle._C_ops.reshape(transpose_49, full_int_array_4)
+        del transpose_49
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_88 = paddle._C_ops.matmul(reshape_47, parameter_44, False, False)
+        del parameter_44, reshape_47
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_84, dropout_85 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_88, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_88
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_35 = paddle._C_ops.add(add_33, dropout_84)
+        del add_33, dropout_84
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_21 = paddle._C_ops.pow(add_35, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_21 = paddle._C_ops.mean(pow_21, full_int_array_1, True)
+        del pow_21
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_37 = paddle._C_ops.scale(mean_21, full_3, float("1e-06"), True)
+        del mean_21
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_21 = paddle._C_ops.rsqrt(scale_37)
+        del scale_37
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_43 = paddle._C_ops.multiply(add_35, rsqrt_21)
+        del rsqrt_21
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_44 = paddle._C_ops.multiply(parameter_40, multiply_43)
+        del multiply_43, parameter_40
+
+        # pd_op.matmul: (1x1x2048xf32) <- (1x1x512xf32, 512x2048xf32)
+        matmul_89 = paddle._C_ops.matmul(multiply_44, parameter_42, False, False)
+        del multiply_44, parameter_42
+
+        # pd_op.relu: (1x1x2048xf32) <- (1x1x2048xf32)
+        relu_8 = paddle._C_ops.relu(matmul_89)
+        del matmul_89
+
+        # pd_op.dropout: (1x1x2048xf32, 1x1x2048xui8) <- (1x1x2048xf32, None, 1xf32)
+        dropout_86, dropout_87 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_8, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_8
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x2048xf32, 2048x512xf32)
+        matmul_90 = paddle._C_ops.matmul(dropout_86, parameter_41, False, False)
+        del dropout_86, parameter_41
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_88, dropout_89 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_90, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_90
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_36 = paddle._C_ops.add(dropout_88, add_35)
+        del add_35, dropout_88
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_22 = paddle._C_ops.pow(add_36, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_22 = paddle._C_ops.mean(pow_22, full_int_array_1, True)
+        del pow_22
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_38 = paddle._C_ops.scale(mean_22, full_3, float("1e-06"), True)
+        del mean_22
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_22 = paddle._C_ops.rsqrt(scale_38)
+        del scale_38
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_45 = paddle._C_ops.multiply(add_36, rsqrt_22)
+        del rsqrt_22
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_46 = paddle._C_ops.multiply(parameter_35, multiply_45)
+        del multiply_45, parameter_35
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_91 = paddle._C_ops.matmul(multiply_46, parameter_39, False, False)
+        del parameter_39
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_48 = paddle._C_ops.reshape(matmul_91, full_int_array_2)
+        del matmul_91
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_50 = paddle._C_ops.transpose(reshape_48, [0, 2, 1, 3])
+        del reshape_48
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_92 = paddle._C_ops.matmul(multiply_46, parameter_38, False, False)
+        del parameter_38
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_49 = paddle._C_ops.reshape(matmul_92, full_int_array_2)
+        del matmul_92
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_51 = paddle._C_ops.transpose(reshape_49, [0, 2, 1, 3])
+        del reshape_49
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_93 = paddle._C_ops.matmul(multiply_46, parameter_37, False, False)
+        del multiply_46, parameter_37
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_50 = paddle._C_ops.reshape(matmul_93, full_int_array_2)
+        del matmul_93
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_52 = paddle._C_ops.transpose(reshape_50, [0, 2, 1, 3])
+        del reshape_50
+
+        # pd_op.matmul: (1x8x1x1xf32) <- (1x8x1x64xf32, 1x8x1x64xf32)
+        matmul_94 = paddle._C_ops.matmul(transpose_50, transpose_51, False, True)
+        del transpose_50
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x8x1x1xf32)
+        add_37 = paddle._C_ops.add(matmul_94, add_20)
+        del matmul_94
+
+        # pd_op.softmax: (1x8x1x1xf32) <- (1x8x1x1xf32)
+        softmax_12 = paddle._C_ops.softmax(add_37, -1)
+        del add_37
+
+        # pd_op.dropout: (1x8x1x1xf32, 1x8x1x1xui8) <- (1x8x1x1xf32, None, 1xf32)
+        dropout_90, dropout_91 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_12, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_12
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x1xf32, 1x8x1x64xf32)
+        matmul_95 = paddle._C_ops.matmul(dropout_90, transpose_52, False, False)
+        del dropout_90
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_53 = paddle._C_ops.transpose(matmul_95, [0, 2, 1, 3])
+        del matmul_95
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_51 = paddle._C_ops.reshape(transpose_53, full_int_array_4)
+        del transpose_53
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_96 = paddle._C_ops.matmul(reshape_51, parameter_36, False, False)
+        del parameter_36, reshape_51
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_92, dropout_93 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_96, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_96
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_38 = paddle._C_ops.add(add_36, dropout_92)
+        del add_36, dropout_92
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_23 = paddle._C_ops.pow(add_38, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_23 = paddle._C_ops.mean(pow_23, full_int_array_1, True)
+        del pow_23
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_39 = paddle._C_ops.scale(mean_23, full_3, float("1e-06"), True)
+        del mean_23
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_23 = paddle._C_ops.rsqrt(scale_39)
+        del scale_39
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_47 = paddle._C_ops.multiply(add_38, rsqrt_23)
+        del rsqrt_23
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_48 = paddle._C_ops.multiply(parameter_30, multiply_47)
+        del multiply_47, parameter_30
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_97 = paddle._C_ops.matmul(multiply_48, parameter_34, False, False)
+        del multiply_48, parameter_34
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_52 = paddle._C_ops.reshape(matmul_97, full_int_array_2)
+        del matmul_97
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_54 = paddle._C_ops.transpose(reshape_52, [0, 2, 1, 3])
+        del reshape_52
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_98 = paddle._C_ops.matmul(dropout_50, parameter_33, False, False)
+        del parameter_33
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_53 = paddle._C_ops.reshape(matmul_98, full_int_array_2)
+        del matmul_98
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_55 = paddle._C_ops.transpose(reshape_53, [0, 2, 1, 3])
+        del reshape_53
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_99 = paddle._C_ops.matmul(dropout_50, parameter_32, False, False)
+        del parameter_32
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_54 = paddle._C_ops.reshape(matmul_99, full_int_array_2)
+        del matmul_99
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_56 = paddle._C_ops.transpose(reshape_54, [0, 2, 1, 3])
+        del reshape_54
+
+        # pd_op.matmul: (1x8x1x20xf32) <- (1x8x1x64xf32, 1x8x20x64xf32)
+        matmul_100 = paddle._C_ops.matmul(transpose_54, transpose_55, False, True)
+        del transpose_54
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x8x1x20xf32)
+        add_39 = paddle._C_ops.add(matmul_100, add_23)
+        del matmul_100
+
+        # pd_op.softmax: (1x8x1x20xf32) <- (1x8x1x20xf32)
+        softmax_13 = paddle._C_ops.softmax(add_39, -1)
+        del add_39
+
+        # pd_op.dropout: (1x8x1x20xf32, 1x8x1x20xui8) <- (1x8x1x20xf32, None, 1xf32)
+        dropout_94, dropout_95 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_13, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_13
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x20xf32, 1x8x20x64xf32)
+        matmul_101 = paddle._C_ops.matmul(dropout_94, transpose_56, False, False)
+        del dropout_94
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_57 = paddle._C_ops.transpose(matmul_101, [0, 2, 1, 3])
+        del matmul_101
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_55 = paddle._C_ops.reshape(transpose_57, full_int_array_4)
+        del transpose_57
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_102 = paddle._C_ops.matmul(reshape_55, parameter_31, False, False)
+        del parameter_31, reshape_55
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_96, dropout_97 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_102, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_102
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_40 = paddle._C_ops.add(add_38, dropout_96)
+        del add_38, dropout_96
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_24 = paddle._C_ops.pow(add_40, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_24 = paddle._C_ops.mean(pow_24, full_int_array_1, True)
+        del pow_24
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_40 = paddle._C_ops.scale(mean_24, full_3, float("1e-06"), True)
+        del mean_24
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_24 = paddle._C_ops.rsqrt(scale_40)
+        del scale_40
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_49 = paddle._C_ops.multiply(add_40, rsqrt_24)
+        del rsqrt_24
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_50 = paddle._C_ops.multiply(parameter_27, multiply_49)
+        del multiply_49, parameter_27
+
+        # pd_op.matmul: (1x1x2048xf32) <- (1x1x512xf32, 512x2048xf32)
+        matmul_103 = paddle._C_ops.matmul(multiply_50, parameter_29, False, False)
+        del multiply_50, parameter_29
+
+        # pd_op.relu: (1x1x2048xf32) <- (1x1x2048xf32)
+        relu_9 = paddle._C_ops.relu(matmul_103)
+        del matmul_103
+
+        # pd_op.dropout: (1x1x2048xf32, 1x1x2048xui8) <- (1x1x2048xf32, None, 1xf32)
+        dropout_98, dropout_99 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_9, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_9
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x2048xf32, 2048x512xf32)
+        matmul_104 = paddle._C_ops.matmul(dropout_98, parameter_28, False, False)
+        del dropout_98, parameter_28
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_100, dropout_101 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_104, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_104
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_41 = paddle._C_ops.add(dropout_100, add_40)
+        del add_40, dropout_100
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_25 = paddle._C_ops.pow(add_41, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_25 = paddle._C_ops.mean(pow_25, full_int_array_1, True)
+        del pow_25
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_41 = paddle._C_ops.scale(mean_25, full_3, float("1e-06"), True)
+        del mean_25
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_25 = paddle._C_ops.rsqrt(scale_41)
+        del scale_41
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_51 = paddle._C_ops.multiply(add_41, rsqrt_25)
+        del rsqrt_25
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_52 = paddle._C_ops.multiply(parameter_22, multiply_51)
+        del multiply_51, parameter_22
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_105 = paddle._C_ops.matmul(multiply_52, parameter_26, False, False)
+        del parameter_26
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_56 = paddle._C_ops.reshape(matmul_105, full_int_array_2)
+        del matmul_105
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_58 = paddle._C_ops.transpose(reshape_56, [0, 2, 1, 3])
+        del reshape_56
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_106 = paddle._C_ops.matmul(multiply_52, parameter_25, False, False)
+        del parameter_25
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_57 = paddle._C_ops.reshape(matmul_106, full_int_array_2)
+        del matmul_106
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_59 = paddle._C_ops.transpose(reshape_57, [0, 2, 1, 3])
+        del reshape_57
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_107 = paddle._C_ops.matmul(multiply_52, parameter_24, False, False)
+        del multiply_52, parameter_24
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_58 = paddle._C_ops.reshape(matmul_107, full_int_array_2)
+        del matmul_107
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_60 = paddle._C_ops.transpose(reshape_58, [0, 2, 1, 3])
+        del reshape_58
+
+        # pd_op.matmul: (1x8x1x1xf32) <- (1x8x1x64xf32, 1x8x1x64xf32)
+        matmul_108 = paddle._C_ops.matmul(transpose_58, transpose_59, False, True)
+        del transpose_58
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x8x1x1xf32)
+        add_42 = paddle._C_ops.add(matmul_108, add_20)
+        del matmul_108
+
+        # pd_op.softmax: (1x8x1x1xf32) <- (1x8x1x1xf32)
+        softmax_14 = paddle._C_ops.softmax(add_42, -1)
+        del add_42
+
+        # pd_op.dropout: (1x8x1x1xf32, 1x8x1x1xui8) <- (1x8x1x1xf32, None, 1xf32)
+        dropout_102, dropout_103 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_14, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_14
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x1xf32, 1x8x1x64xf32)
+        matmul_109 = paddle._C_ops.matmul(dropout_102, transpose_60, False, False)
+        del dropout_102
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_61 = paddle._C_ops.transpose(matmul_109, [0, 2, 1, 3])
+        del matmul_109
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_59 = paddle._C_ops.reshape(transpose_61, full_int_array_4)
+        del transpose_61
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_110 = paddle._C_ops.matmul(reshape_59, parameter_23, False, False)
+        del parameter_23, reshape_59
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_104, dropout_105 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_110, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_110
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_43 = paddle._C_ops.add(add_41, dropout_104)
+        del add_41, dropout_104
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_26 = paddle._C_ops.pow(add_43, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_26 = paddle._C_ops.mean(pow_26, full_int_array_1, True)
+        del pow_26
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_42 = paddle._C_ops.scale(mean_26, full_3, float("1e-06"), True)
+        del mean_26
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_26 = paddle._C_ops.rsqrt(scale_42)
+        del scale_42
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_53 = paddle._C_ops.multiply(add_43, rsqrt_26)
+        del rsqrt_26
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_54 = paddle._C_ops.multiply(parameter_17, multiply_53)
+        del multiply_53, parameter_17
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_111 = paddle._C_ops.matmul(multiply_54, parameter_21, False, False)
+        del multiply_54, parameter_21
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_60 = paddle._C_ops.reshape(matmul_111, full_int_array_2)
+        del matmul_111
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_62 = paddle._C_ops.transpose(reshape_60, [0, 2, 1, 3])
+        del reshape_60
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_112 = paddle._C_ops.matmul(dropout_50, parameter_20, False, False)
+        del parameter_20
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_61 = paddle._C_ops.reshape(matmul_112, full_int_array_2)
+        del matmul_112
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_63 = paddle._C_ops.transpose(reshape_61, [0, 2, 1, 3])
+        del reshape_61
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_113 = paddle._C_ops.matmul(dropout_50, parameter_19, False, False)
+        del parameter_19
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_62 = paddle._C_ops.reshape(matmul_113, full_int_array_2)
+        del matmul_113
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_64 = paddle._C_ops.transpose(reshape_62, [0, 2, 1, 3])
+        del reshape_62
+
+        # pd_op.matmul: (1x8x1x20xf32) <- (1x8x1x64xf32, 1x8x20x64xf32)
+        matmul_114 = paddle._C_ops.matmul(transpose_62, transpose_63, False, True)
+        del transpose_62
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x8x1x20xf32)
+        add_44 = paddle._C_ops.add(matmul_114, add_23)
+        del matmul_114
+
+        # pd_op.softmax: (1x8x1x20xf32) <- (1x8x1x20xf32)
+        softmax_15 = paddle._C_ops.softmax(add_44, -1)
+        del add_44
+
+        # pd_op.dropout: (1x8x1x20xf32, 1x8x1x20xui8) <- (1x8x1x20xf32, None, 1xf32)
+        dropout_106, dropout_107 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_15, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_15
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x20xf32, 1x8x20x64xf32)
+        matmul_115 = paddle._C_ops.matmul(dropout_106, transpose_64, False, False)
+        del dropout_106
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_65 = paddle._C_ops.transpose(matmul_115, [0, 2, 1, 3])
+        del matmul_115
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_63 = paddle._C_ops.reshape(transpose_65, full_int_array_4)
+        del transpose_65
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_116 = paddle._C_ops.matmul(reshape_63, parameter_18, False, False)
+        del parameter_18, reshape_63
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_108, dropout_109 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_116, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_116
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_45 = paddle._C_ops.add(add_43, dropout_108)
+        del add_43, dropout_108
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_27 = paddle._C_ops.pow(add_45, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_27 = paddle._C_ops.mean(pow_27, full_int_array_1, True)
+        del pow_27
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_43 = paddle._C_ops.scale(mean_27, full_3, float("1e-06"), True)
+        del mean_27
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_27 = paddle._C_ops.rsqrt(scale_43)
+        del scale_43
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_55 = paddle._C_ops.multiply(add_45, rsqrt_27)
+        del rsqrt_27
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_56 = paddle._C_ops.multiply(parameter_14, multiply_55)
+        del multiply_55, parameter_14
+
+        # pd_op.matmul: (1x1x2048xf32) <- (1x1x512xf32, 512x2048xf32)
+        matmul_117 = paddle._C_ops.matmul(multiply_56, parameter_16, False, False)
+        del multiply_56, parameter_16
+
+        # pd_op.relu: (1x1x2048xf32) <- (1x1x2048xf32)
+        relu_10 = paddle._C_ops.relu(matmul_117)
+        del matmul_117
+
+        # pd_op.dropout: (1x1x2048xf32, 1x1x2048xui8) <- (1x1x2048xf32, None, 1xf32)
+        dropout_110, dropout_111 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_10, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_10
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x2048xf32, 2048x512xf32)
+        matmul_118 = paddle._C_ops.matmul(dropout_110, parameter_15, False, False)
+        del dropout_110, parameter_15
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_112, dropout_113 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_118, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_118
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_46 = paddle._C_ops.add(dropout_112, add_45)
+        del add_45, dropout_112
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_28 = paddle._C_ops.pow(add_46, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_28 = paddle._C_ops.mean(pow_28, full_int_array_1, True)
+        del pow_28
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_44 = paddle._C_ops.scale(mean_28, full_3, float("1e-06"), True)
+        del mean_28
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_28 = paddle._C_ops.rsqrt(scale_44)
+        del scale_44
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_57 = paddle._C_ops.multiply(add_46, rsqrt_28)
+        del rsqrt_28
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_58 = paddle._C_ops.multiply(parameter_9, multiply_57)
+        del multiply_57, parameter_9
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_119 = paddle._C_ops.matmul(multiply_58, parameter_13, False, False)
+        del parameter_13
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_64 = paddle._C_ops.reshape(matmul_119, full_int_array_2)
+        del matmul_119
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_66 = paddle._C_ops.transpose(reshape_64, [0, 2, 1, 3])
+        del reshape_64
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_120 = paddle._C_ops.matmul(multiply_58, parameter_12, False, False)
+        del parameter_12
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_65 = paddle._C_ops.reshape(matmul_120, full_int_array_2)
+        del matmul_120
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_67 = paddle._C_ops.transpose(reshape_65, [0, 2, 1, 3])
+        del reshape_65
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_121 = paddle._C_ops.matmul(multiply_58, parameter_11, False, False)
+        del multiply_58, parameter_11
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_66 = paddle._C_ops.reshape(matmul_121, full_int_array_2)
+        del matmul_121
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_68 = paddle._C_ops.transpose(reshape_66, [0, 2, 1, 3])
+        del reshape_66
+
+        # pd_op.matmul: (1x8x1x1xf32) <- (1x8x1x64xf32, 1x8x1x64xf32)
+        matmul_122 = paddle._C_ops.matmul(transpose_66, transpose_67, False, True)
+        del transpose_66
+
+        # pd_op.add: (1x8x1x1xf32) <- (1x8x1x1xf32, 1x8x1x1xf32)
+        add_47 = paddle._C_ops.add(matmul_122, add_20)
+        del add_20, matmul_122
+
+        # pd_op.softmax: (1x8x1x1xf32) <- (1x8x1x1xf32)
+        softmax_16 = paddle._C_ops.softmax(add_47, -1)
+        del add_47
+
+        # pd_op.dropout: (1x8x1x1xf32, 1x8x1x1xui8) <- (1x8x1x1xf32, None, 1xf32)
+        dropout_114, dropout_115 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_16, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_16
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x1xf32, 1x8x1x64xf32)
+        matmul_123 = paddle._C_ops.matmul(dropout_114, transpose_68, False, False)
+        del dropout_114
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_69 = paddle._C_ops.transpose(matmul_123, [0, 2, 1, 3])
+        del matmul_123
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_67 = paddle._C_ops.reshape(transpose_69, full_int_array_4)
+        del transpose_69
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_124 = paddle._C_ops.matmul(reshape_67, parameter_10, False, False)
+        del parameter_10, reshape_67
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_116, dropout_117 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_124, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_124
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_48 = paddle._C_ops.add(add_46, dropout_116)
+        del add_46, dropout_116
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_29 = paddle._C_ops.pow(add_48, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_29 = paddle._C_ops.mean(pow_29, full_int_array_1, True)
+        del pow_29
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_45 = paddle._C_ops.scale(mean_29, full_3, float("1e-06"), True)
+        del mean_29
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_29 = paddle._C_ops.rsqrt(scale_45)
+        del scale_45
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_59 = paddle._C_ops.multiply(add_48, rsqrt_29)
+        del rsqrt_29
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_60 = paddle._C_ops.multiply(parameter_4, multiply_59)
+        del multiply_59, parameter_4
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_125 = paddle._C_ops.matmul(multiply_60, parameter_8, False, False)
+        del multiply_60, parameter_8
+
+        # pd_op.reshape: (1x1x8x64xf32) <- (1x1x512xf32, 4xi64)
+        reshape_68 = paddle._C_ops.reshape(matmul_125, full_int_array_2)
+        del matmul_125
+
+        # pd_op.transpose: (1x8x1x64xf32) <- (1x1x8x64xf32)
+        transpose_70 = paddle._C_ops.transpose(reshape_68, [0, 2, 1, 3])
+        del reshape_68
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_126 = paddle._C_ops.matmul(dropout_50, parameter_7, False, False)
+        del parameter_7
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_69 = paddle._C_ops.reshape(matmul_126, full_int_array_2)
+        del matmul_126
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_71 = paddle._C_ops.transpose(reshape_69, [0, 2, 1, 3])
+        del reshape_69
+
+        # pd_op.matmul: (1x20x512xf32) <- (1x20x512xf32, 512x512xf32)
+        matmul_127 = paddle._C_ops.matmul(dropout_50, parameter_6, False, False)
+        del parameter_6
+
+        # pd_op.reshape: (1x20x8x64xf32) <- (1x20x512xf32, 4xi64)
+        reshape_70 = paddle._C_ops.reshape(matmul_127, full_int_array_2)
+        del full_int_array_2, matmul_127
+
+        # pd_op.transpose: (1x8x20x64xf32) <- (1x20x8x64xf32)
+        transpose_72 = paddle._C_ops.transpose(reshape_70, [0, 2, 1, 3])
+        del reshape_70
+
+        # pd_op.matmul: (1x8x1x20xf32) <- (1x8x1x64xf32, 1x8x20x64xf32)
+        matmul_128 = paddle._C_ops.matmul(transpose_70, transpose_71, False, True)
+        del transpose_70
+
+        # pd_op.add: (1x8x1x20xf32) <- (1x8x1x20xf32, 1x8x1x20xf32)
+        add_49 = paddle._C_ops.add(matmul_128, add_23)
+        del add_23, matmul_128
+
+        # pd_op.softmax: (1x8x1x20xf32) <- (1x8x1x20xf32)
+        softmax_17 = paddle._C_ops.softmax(add_49, -1)
+        del add_49
+
+        # pd_op.dropout: (1x8x1x20xf32, 1x8x1x20xui8) <- (1x8x1x20xf32, None, 1xf32)
+        dropout_118, dropout_119 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                softmax_17, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del softmax_17
+
+        # pd_op.matmul: (1x8x1x64xf32) <- (1x8x1x20xf32, 1x8x20x64xf32)
+        matmul_129 = paddle._C_ops.matmul(dropout_118, transpose_72, False, False)
+        del dropout_118
+
+        # pd_op.transpose: (1x1x8x64xf32) <- (1x8x1x64xf32)
+        transpose_73 = paddle._C_ops.transpose(matmul_129, [0, 2, 1, 3])
+        del matmul_129
+
+        # pd_op.reshape: (1x1x512xf32) <- (1x1x8x64xf32, 3xi64)
+        reshape_71 = paddle._C_ops.reshape(transpose_73, full_int_array_4)
+        del full_int_array_4, transpose_73
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x512xf32, 512x512xf32)
+        matmul_130 = paddle._C_ops.matmul(reshape_71, parameter_5, False, False)
+        del parameter_5, reshape_71
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_120, dropout_121 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_130, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_130
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_50 = paddle._C_ops.add(add_48, dropout_120)
+        del add_48, dropout_120
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_30 = paddle._C_ops.pow(add_50, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_30 = paddle._C_ops.mean(pow_30, full_int_array_1, True)
+        del pow_30
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_46 = paddle._C_ops.scale(mean_30, full_3, float("1e-06"), True)
+        del mean_30
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_30 = paddle._C_ops.rsqrt(scale_46)
+        del scale_46
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_61 = paddle._C_ops.multiply(add_50, rsqrt_30)
+        del rsqrt_30
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_62 = paddle._C_ops.multiply(parameter_1, multiply_61)
+        del multiply_61, parameter_1
+
+        # pd_op.matmul: (1x1x2048xf32) <- (1x1x512xf32, 512x2048xf32)
+        matmul_131 = paddle._C_ops.matmul(multiply_62, parameter_3, False, False)
+        del multiply_62, parameter_3
+
+        # pd_op.relu: (1x1x2048xf32) <- (1x1x2048xf32)
+        relu_11 = paddle._C_ops.relu(matmul_131)
+        del matmul_131
+
+        # pd_op.dropout: (1x1x2048xf32, 1x1x2048xui8) <- (1x1x2048xf32, None, 1xf32)
+        dropout_122, dropout_123 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                relu_11, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del relu_11
+
+        # pd_op.matmul: (1x1x512xf32) <- (1x1x2048xf32, 2048x512xf32)
+        matmul_132 = paddle._C_ops.matmul(dropout_122, parameter_2, False, False)
+        del dropout_122, parameter_2
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_124, dropout_125 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                matmul_132, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del matmul_132
+
+        # pd_op.add: (1x1x512xf32) <- (1x1x512xf32, 1x1x512xf32)
+        add_51 = paddle._C_ops.add(dropout_124, add_50)
+        del add_50, dropout_124
+
+        # pd_op.pow: (1x1x512xf32) <- (1x1x512xf32)
+        pow_31 = paddle._C_ops.pow(add_51, float("2"))
+
+        # pd_op.mean: (1x1x1xf32) <- (1x1x512xf32, 1xi64)
+        mean_31 = paddle._C_ops.mean(pow_31, full_int_array_1, True)
+        del full_int_array_1, pow_31
+
+        # pd_op.scale: (1x1x1xf32) <- (1x1x1xf32, 1xf32)
+        scale_47 = paddle._C_ops.scale(mean_31, full_3, float("1e-06"), True)
+        del full_3, mean_31
+
+        # pd_op.rsqrt: (1x1x1xf32) <- (1x1x1xf32)
+        rsqrt_31 = paddle._C_ops.rsqrt(scale_47)
+        del scale_47
+
+        # pd_op.multiply: (1x1x512xf32) <- (1x1x512xf32, 1x1x1xf32)
+        multiply_63 = paddle._C_ops.multiply(add_51, rsqrt_31)
+        del add_51, rsqrt_31
+
+        # pd_op.multiply: (1x1x512xf32) <- (512xf32, 1x1x512xf32)
+        multiply_64 = paddle._C_ops.multiply(parameter_0, multiply_63)
+        del multiply_63, parameter_0
+
+        # pd_op.dropout: (1x1x512xf32, 1x1x512xui8) <- (1x1x512xf32, None, 1xf32)
+        dropout_126, dropout_127 = (lambda x, f: f(x))(
+            paddle._C_ops.dropout(
+                multiply_64, None, full_2, True, "upscale_in_train", 0, False
+            ),
+            lambda out: out if isinstance(out, (list, tuple)) else (out, None),
+        )
+        del full_2, multiply_64
+
+        # pd_op.full: (1xf32) <- ()
+        full_21 = paddle._C_ops.full(
+            [1], float("0.0441942"), paddle.float32, paddle.core.CPUPlace()
+        )
+
+        # pd_op.scale: (1x1x512xf32) <- (1x1x512xf32, 1xf32)
+        scale_48 = paddle._C_ops.scale(dropout_126, full_21, float("0"), True)
+        del dropout_126, full_21
+
+        # pd_op.matmul: (1x1x32128xf32) <- (1x1x512xf32, 32128x512xf32)
+        matmul_0 = paddle._C_ops.matmul(scale_48, parameter_130, False, True)
+        del (
+            dropout_50,
+            parameter_130,
+            scale_48,
+            transpose_26,
+            transpose_27,
+            transpose_31,
+            transpose_32,
+            transpose_35,
+            transpose_36,
+            transpose_39,
+            transpose_40,
+            transpose_43,
+            transpose_44,
+            transpose_47,
+            transpose_48,
+            transpose_51,
+            transpose_52,
+            transpose_55,
+            transpose_56,
+            transpose_59,
+            transpose_60,
+            transpose_63,
+            transpose_64,
+            transpose_67,
+            transpose_68,
+            transpose_71,
+            transpose_72,
+        )
+
+        return matmul_0
diff --git a/paddle_samples/PaddleNLP/t5-small/weight_meta.py b/paddle_samples/PaddleNLP/t5-small/weight_meta.py
new file mode 100644
index 000000000..a5a523c26
--- /dev/null
+++ b/paddle_samples/PaddleNLP/t5-small/weight_meta.py
@@ -0,0 +1,1439 @@
+class Program_weight_tensor_parameter_0:
+    name = "parameter_0"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.000406265")
+    max_val = float("9.5625")
+    mean = float("0.238993")
+    std = float("0.528551")
+    data = None
+
+
+class Program_weight_tensor_parameter_1:
+    name = "parameter_1"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-1.03125")
+    max_val = float("12.125")
+    mean = float("3.0563")
+    std = float("1.25819")
+    data = None
+
+
+class Program_weight_tensor_parameter_2:
+    name = "parameter_2"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-45.25")
+    max_val = float("24.5")
+    mean = float("0.000320316")
+    std = float("0.469186")
+    data = None
+
+
+class Program_weight_tensor_parameter_3:
+    name = "parameter_3"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-13.6875")
+    max_val = float("10.25")
+    mean = float("0.00696439")
+    std = float("0.82078")
+    data = None
+
+
+class Program_weight_tensor_parameter_4:
+    name = "parameter_4"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.164062")
+    max_val = float("1.52344")
+    mean = float("0.142325")
+    std = float("0.0974383")
+    data = None
+
+
+class Program_weight_tensor_parameter_5:
+    name = "parameter_5"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-20.5")
+    max_val = float("19.25")
+    mean = float("-0.00232446")
+    std = float("1.47252")
+    data = None
+
+
+class Program_weight_tensor_parameter_6:
+    name = "parameter_6"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-12.3125")
+    max_val = float("10.8125")
+    mean = float("-0.00313481")
+    std = float("1.79793")
+    data = None
+
+
+class Program_weight_tensor_parameter_7:
+    name = "parameter_7"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.03125")
+    max_val = float("4.71875")
+    mean = float("0.000654718")
+    std = float("0.387374")
+    data = None
+
+
+class Program_weight_tensor_parameter_8:
+    name = "parameter_8"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.233398")
+    max_val = float("0.21875")
+    mean = float("-3.2276e-05")
+    std = float("0.0427395")
+    data = None
+
+
+class Program_weight_tensor_parameter_9:
+    name = "parameter_9"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.341797")
+    max_val = float("0.789062")
+    mean = float("0.221493")
+    std = float("0.079524")
+    data = None
+
+
+class Program_weight_tensor_parameter_10:
+    name = "parameter_10"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-42.75")
+    max_val = float("45.75")
+    mean = float("-0.00122536")
+    std = float("1.53928")
+    data = None
+
+
+class Program_weight_tensor_parameter_11:
+    name = "parameter_11"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-4.53125")
+    max_val = float("5.96875")
+    mean = float("0.00298593")
+    std = float("0.953516")
+    data = None
+
+
+class Program_weight_tensor_parameter_12:
+    name = "parameter_12"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.5")
+    max_val = float("2.32812")
+    mean = float("0.000342449")
+    std = float("0.394908")
+    data = None
+
+
+class Program_weight_tensor_parameter_13:
+    name = "parameter_13"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.261719")
+    max_val = float("0.285156")
+    mean = float("0.000135268")
+    std = float("0.0427585")
+    data = None
+
+
+class Program_weight_tensor_parameter_14:
+    name = "parameter_14"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.375")
+    max_val = float("7.65625")
+    mean = float("2.4388")
+    std = float("0.69339")
+    data = None
+
+
+class Program_weight_tensor_parameter_15:
+    name = "parameter_15"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-16.25")
+    max_val = float("10.5")
+    mean = float("-0.000124255")
+    std = float("0.419801")
+    data = None
+
+
+class Program_weight_tensor_parameter_16:
+    name = "parameter_16"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-10.625")
+    max_val = float("10.5")
+    mean = float("0.0074399")
+    std = float("0.694377")
+    data = None
+
+
+class Program_weight_tensor_parameter_17:
+    name = "parameter_17"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.0810547")
+    max_val = float("0.464844")
+    mean = float("0.0716004")
+    std = float("0.0325611")
+    data = None
+
+
+class Program_weight_tensor_parameter_18:
+    name = "parameter_18"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-25.5")
+    max_val = float("21.0")
+    mean = float("0.000646527")
+    std = float("1.26897")
+    data = None
+
+
+class Program_weight_tensor_parameter_19:
+    name = "parameter_19"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-8.25")
+    max_val = float("7.75")
+    mean = float("-0.00116448")
+    std = float("1.22014")
+    data = None
+
+
+class Program_weight_tensor_parameter_20:
+    name = "parameter_20"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.65625")
+    max_val = float("6.34375")
+    mean = float("0.000291407")
+    std = float("0.478891")
+    data = None
+
+
+class Program_weight_tensor_parameter_21:
+    name = "parameter_21"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.255859")
+    max_val = float("0.271484")
+    mean = float("-0.000160606")
+    std = float("0.0556473")
+    data = None
+
+
+class Program_weight_tensor_parameter_22:
+    name = "parameter_22"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.0688477")
+    max_val = float("0.53125")
+    mean = float("0.1733")
+    std = float("0.0362951")
+    data = None
+
+
+class Program_weight_tensor_parameter_23:
+    name = "parameter_23"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-48.5")
+    max_val = float("37.5")
+    mean = float("-0.00320207")
+    std = float("2.31977")
+    data = None
+
+
+class Program_weight_tensor_parameter_24:
+    name = "parameter_24"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.875")
+    max_val = float("4.40625")
+    mean = float("0.0014279")
+    std = float("0.891554")
+    data = None
+
+
+class Program_weight_tensor_parameter_25:
+    name = "parameter_25"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-2.70312")
+    max_val = float("2.4375")
+    mean = float("-0.000709436")
+    std = float("0.410992")
+    data = None
+
+
+class Program_weight_tensor_parameter_26:
+    name = "parameter_26"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.306641")
+    max_val = float("0.324219")
+    mean = float("-1.23639e-05")
+    std = float("0.0434743")
+    data = None
+
+
+class Program_weight_tensor_parameter_27:
+    name = "parameter_27"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.332031")
+    max_val = float("4.71875")
+    mean = float("1.9234")
+    std = float("0.349066")
+    data = None
+
+
+class Program_weight_tensor_parameter_28:
+    name = "parameter_28"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-10.0625")
+    max_val = float("10.6875")
+    mean = float("-0.00030282")
+    std = float("0.399087")
+    data = None
+
+
+class Program_weight_tensor_parameter_29:
+    name = "parameter_29"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-8.6875")
+    max_val = float("8.5625")
+    mean = float("0.0142616")
+    std = float("0.680091")
+    data = None
+
+
+class Program_weight_tensor_parameter_30:
+    name = "parameter_30"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.0791016")
+    max_val = float("0.435547")
+    mean = float("0.119905")
+    std = float("0.0375838")
+    data = None
+
+
+class Program_weight_tensor_parameter_31:
+    name = "parameter_31"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-10.0625")
+    max_val = float("13.5625")
+    mean = float("9.86211e-05")
+    std = float("0.894709")
+    data = None
+
+
+class Program_weight_tensor_parameter_32:
+    name = "parameter_32"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.0625")
+    max_val = float("6.59375")
+    mean = float("-0.00323875")
+    std = float("0.92144")
+    data = None
+
+
+class Program_weight_tensor_parameter_33:
+    name = "parameter_33"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-4.65625")
+    max_val = float("4.5625")
+    mean = float("-0.00133923")
+    std = float("0.410349")
+    data = None
+
+
+class Program_weight_tensor_parameter_34:
+    name = "parameter_34"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.279297")
+    max_val = float("0.263672")
+    mean = float("-2.56622e-05")
+    std = float("0.0439826")
+    data = None
+
+
+class Program_weight_tensor_parameter_35:
+    name = "parameter_35"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.103516")
+    max_val = float("0.378906")
+    mean = float("0.154994")
+    std = float("0.0354193")
+    data = None
+
+
+class Program_weight_tensor_parameter_36:
+    name = "parameter_36"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-21.75")
+    max_val = float("22.875")
+    mean = float("0.00349601")
+    std = float("1.43415")
+    data = None
+
+
+class Program_weight_tensor_parameter_37:
+    name = "parameter_37"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.0")
+    max_val = float("4.90625")
+    mean = float("-0.00221994")
+    std = float("0.958329")
+    data = None
+
+
+class Program_weight_tensor_parameter_38:
+    name = "parameter_38"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.17188")
+    max_val = float("3.07812")
+    mean = float("-0.000142329")
+    std = float("0.418875")
+    data = None
+
+
+class Program_weight_tensor_parameter_39:
+    name = "parameter_39"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.339844")
+    max_val = float("0.439453")
+    mean = float("3.85278e-05")
+    std = float("0.0463164")
+    data = None
+
+
+class Program_weight_tensor_parameter_40:
+    name = "parameter_40"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.310547")
+    max_val = float("3.09375")
+    mean = float("1.45117")
+    std = float("0.241399")
+    data = None
+
+
+class Program_weight_tensor_parameter_41:
+    name = "parameter_41"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-14.1875")
+    max_val = float("9.0625")
+    mean = float("-0.000422012")
+    std = float("0.38754")
+    data = None
+
+
+class Program_weight_tensor_parameter_42:
+    name = "parameter_42"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-21.375")
+    max_val = float("18.0")
+    mean = float("0.019711")
+    std = float("0.736918")
+    data = None
+
+
+class Program_weight_tensor_parameter_43:
+    name = "parameter_43"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.0698242")
+    max_val = float("0.257812")
+    mean = float("0.0878949")
+    std = float("0.020732")
+    data = None
+
+
+class Program_weight_tensor_parameter_44:
+    name = "parameter_44"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-9.3125")
+    max_val = float("9.375")
+    mean = float("-0.00120116")
+    std = float("0.733678")
+    data = None
+
+
+class Program_weight_tensor_parameter_45:
+    name = "parameter_45"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.75")
+    max_val = float("3.73438")
+    mean = float("-0.000331075")
+    std = float("0.73921")
+    data = None
+
+
+class Program_weight_tensor_parameter_46:
+    name = "parameter_46"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.8125")
+    max_val = float("5.46875")
+    mean = float("0.000311156")
+    std = float("0.392459")
+    data = None
+
+
+class Program_weight_tensor_parameter_47:
+    name = "parameter_47"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.365234")
+    max_val = float("0.460938")
+    mean = float("-8.85896e-05")
+    std = float("0.0677922")
+    data = None
+
+
+class Program_weight_tensor_parameter_48:
+    name = "parameter_48"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.0869141")
+    max_val = float("0.304688")
+    mean = float("0.14316")
+    std = float("0.0355998")
+    data = None
+
+
+class Program_weight_tensor_parameter_49:
+    name = "parameter_49"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-19.125")
+    max_val = float("23.375")
+    mean = float("0.000395767")
+    std = float("1.07214")
+    data = None
+
+
+class Program_weight_tensor_parameter_50:
+    name = "parameter_50"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.40625")
+    max_val = float("4.71875")
+    mean = float("-0.00047506")
+    std = float("0.908469")
+    data = None
+
+
+class Program_weight_tensor_parameter_51:
+    name = "parameter_51"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.25")
+    max_val = float("3.0625")
+    mean = float("-0.000608685")
+    std = float("0.406653")
+    data = None
+
+
+class Program_weight_tensor_parameter_52:
+    name = "parameter_52"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.410156")
+    max_val = float("0.5")
+    mean = float("0.000177197")
+    std = float("0.0492003")
+    data = None
+
+
+class Program_weight_tensor_parameter_53:
+    name = "parameter_53"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.304688")
+    max_val = float("2.26562")
+    mean = float("1.15984")
+    std = float("0.228704")
+    data = None
+
+
+class Program_weight_tensor_parameter_54:
+    name = "parameter_54"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-15.8125")
+    max_val = float("14.625")
+    mean = float("-0.00204277")
+    std = float("0.358594")
+    data = None
+
+
+class Program_weight_tensor_parameter_55:
+    name = "parameter_55"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-18.25")
+    max_val = float("25.25")
+    mean = float("0.0252502")
+    std = float("0.713721")
+    data = None
+
+
+class Program_weight_tensor_parameter_56:
+    name = "parameter_56"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.0786133")
+    max_val = float("0.198242")
+    mean = float("0.0967789")
+    std = float("0.0251307")
+    data = None
+
+
+class Program_weight_tensor_parameter_57:
+    name = "parameter_57"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-7.5")
+    max_val = float("6.96875")
+    mean = float("-0.000955252")
+    std = float("0.651063")
+    data = None
+
+
+class Program_weight_tensor_parameter_58:
+    name = "parameter_58"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.1875")
+    max_val = float("3.53125")
+    mean = float("-0.000889366")
+    std = float("0.636495")
+    data = None
+
+
+class Program_weight_tensor_parameter_59:
+    name = "parameter_59"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.125")
+    max_val = float("4.84375")
+    mean = float("0.00154598")
+    std = float("0.351185")
+    data = None
+
+
+class Program_weight_tensor_parameter_60:
+    name = "parameter_60"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.4375")
+    max_val = float("0.419922")
+    mean = float("-9.81584e-05")
+    std = float("0.0735168")
+    data = None
+
+
+class Program_weight_tensor_parameter_61:
+    name = "parameter_61"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0354004")
+    max_val = float("0.298828")
+    mean = float("0.131269")
+    std = float("0.0354128")
+    data = None
+
+
+class Program_weight_tensor_parameter_62:
+    name = "parameter_62"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-17.0")
+    max_val = float("14.6875")
+    mean = float("0.00124212")
+    std = float("0.91165")
+    data = None
+
+
+class Program_weight_tensor_parameter_63:
+    name = "parameter_63"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.98438")
+    max_val = float("3.8125")
+    mean = float("0.000928635")
+    std = float("0.724313")
+    data = None
+
+
+class Program_weight_tensor_parameter_64:
+    name = "parameter_64"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-2.625")
+    max_val = float("2.48438")
+    mean = float("0.00132067")
+    std = float("0.429071")
+    data = None
+
+
+class Program_weight_tensor_parameter_65:
+    name = "parameter_65"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.328125")
+    max_val = float("0.337891")
+    mean = float("-1.0113e-05")
+    std = float("0.0557238")
+    data = None
+
+
+class Program_weight_tensor_parameter_66:
+    name = "parameter_66"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.113281")
+    max_val = float("5.25")
+    mean = float("0.67453")
+    std = float("0.326679")
+    data = None
+
+
+class Program_weight_tensor_parameter_67:
+    name = "parameter_67"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-10.1875")
+    max_val = float("7.375")
+    mean = float("-0.00203279")
+    std = float("0.335252")
+    data = None
+
+
+class Program_weight_tensor_parameter_68:
+    name = "parameter_68"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-15.75")
+    max_val = float("14.5625")
+    mean = float("0.014664")
+    std = float("0.619637")
+    data = None
+
+
+class Program_weight_tensor_parameter_69:
+    name = "parameter_69"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.022583")
+    max_val = float("0.71875")
+    mean = float("0.0890827")
+    std = float("0.0493023")
+    data = None
+
+
+class Program_weight_tensor_parameter_70:
+    name = "parameter_70"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-12.125")
+    max_val = float("14.0625")
+    mean = float("-0.000343404")
+    std = float("0.716759")
+    data = None
+
+
+class Program_weight_tensor_parameter_71:
+    name = "parameter_71"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-2.5625")
+    max_val = float("2.28125")
+    mean = float("0.000567753")
+    std = float("0.469188")
+    data = None
+
+
+class Program_weight_tensor_parameter_72:
+    name = "parameter_72"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-4.03125")
+    max_val = float("2.92188")
+    mean = float("-0.000232129")
+    std = float("0.392837")
+    data = None
+
+
+class Program_weight_tensor_parameter_73:
+    name = "parameter_73"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.34375")
+    max_val = float("0.306641")
+    mean = float("-0.000139535")
+    std = float("0.0599122")
+    data = None
+
+
+class Program_weight_tensor_parameter_74:
+    name = "parameter_74"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0385742")
+    max_val = float("0.476562")
+    mean = float("0.0893378")
+    std = float("0.0319224")
+    data = None
+
+
+class Program_weight_tensor_parameter_75:
+    name = "parameter_75"
+    shape = [32, 8]
+    dtype = "float32"
+    min_val = float("-34.5")
+    max_val = float("48.0")
+    mean = float("-1.32047")
+    std = float("7.05675")
+    data = None
+
+
+class Program_weight_tensor_parameter_76:
+    name = "parameter_76"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-12.5625")
+    max_val = float("13.75")
+    mean = float("0.000351022")
+    std = float("0.857384")
+    data = None
+
+
+class Program_weight_tensor_parameter_77:
+    name = "parameter_77"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-2.45312")
+    max_val = float("2.39062")
+    mean = float("0.000828562")
+    std = float("0.489659")
+    data = None
+
+
+class Program_weight_tensor_parameter_78:
+    name = "parameter_78"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.98438")
+    max_val = float("3.67188")
+    mean = float("0.00128421")
+    std = float("0.559587")
+    data = None
+
+
+class Program_weight_tensor_parameter_79:
+    name = "parameter_79"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.511719")
+    max_val = float("0.53125")
+    mean = float("-2.00758e-05")
+    std = float("0.0698939")
+    data = None
+
+
+class Program_weight_tensor_parameter_80:
+    name = "parameter_80"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0319824")
+    max_val = float("0.355469")
+    mean = float("0.241696")
+    std = float("0.0792719")
+    data = None
+
+
+class Program_weight_tensor_parameter_81:
+    name = "parameter_81"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.239258")
+    max_val = float("3.54688")
+    mean = float("0.612829")
+    std = float("0.2203")
+    data = None
+
+
+class Program_weight_tensor_parameter_82:
+    name = "parameter_82"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-12.5")
+    max_val = float("11.1875")
+    mean = float("0.00199515")
+    std = float("0.555518")
+    data = None
+
+
+class Program_weight_tensor_parameter_83:
+    name = "parameter_83"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-68.5")
+    max_val = float("51.75")
+    mean = float("-0.00369408")
+    std = float("0.930026")
+    data = None
+
+
+class Program_weight_tensor_parameter_84:
+    name = "parameter_84"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0234375")
+    max_val = float("0.15625")
+    mean = float("0.119119")
+    std = float("0.0214491")
+    data = None
+
+
+class Program_weight_tensor_parameter_85:
+    name = "parameter_85"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-20.875")
+    max_val = float("22.875")
+    mean = float("0.00219397")
+    std = float("1.72905")
+    data = None
+
+
+class Program_weight_tensor_parameter_86:
+    name = "parameter_86"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-5.0625")
+    max_val = float("5.25")
+    mean = float("0.0022758")
+    std = float("0.896258")
+    data = None
+
+
+class Program_weight_tensor_parameter_87:
+    name = "parameter_87"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.4375")
+    max_val = float("3.79688")
+    mean = float("-0.00106126")
+    std = float("0.436514")
+    data = None
+
+
+class Program_weight_tensor_parameter_88:
+    name = "parameter_88"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.482422")
+    max_val = float("0.554688")
+    mean = float("-1.9731e-05")
+    std = float("0.0540952")
+    data = None
+
+
+class Program_weight_tensor_parameter_89:
+    name = "parameter_89"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.246094")
+    max_val = float("4.75")
+    mean = float("0.756424")
+    std = float("0.255841")
+    data = None
+
+
+class Program_weight_tensor_parameter_90:
+    name = "parameter_90"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-13.4375")
+    max_val = float("16.75")
+    mean = float("0.00097069")
+    std = float("0.486232")
+    data = None
+
+
+class Program_weight_tensor_parameter_91:
+    name = "parameter_91"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-27.125")
+    max_val = float("30.25")
+    mean = float("-0.00128162")
+    std = float("0.900071")
+    data = None
+
+
+class Program_weight_tensor_parameter_92:
+    name = "parameter_92"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0275879")
+    max_val = float("0.177734")
+    mean = float("0.117132")
+    std = float("0.0188376")
+    data = None
+
+
+class Program_weight_tensor_parameter_93:
+    name = "parameter_93"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-18.625")
+    max_val = float("14.6875")
+    mean = float("0.00229923")
+    std = float("1.33635")
+    data = None
+
+
+class Program_weight_tensor_parameter_94:
+    name = "parameter_94"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-4.1875")
+    max_val = float("4.0625")
+    mean = float("0.000259673")
+    std = float("0.769611")
+    data = None
+
+
+class Program_weight_tensor_parameter_95:
+    name = "parameter_95"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-4.375")
+    max_val = float("4.375")
+    mean = float("-0.0004771")
+    std = float("0.450516")
+    data = None
+
+
+class Program_weight_tensor_parameter_96:
+    name = "parameter_96"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.558594")
+    max_val = float("0.648438")
+    mean = float("-0.000177306")
+    std = float("0.054454")
+    data = None
+
+
+class Program_weight_tensor_parameter_97:
+    name = "parameter_97"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.226562")
+    max_val = float("4.09375")
+    mean = float("0.753399")
+    std = float("0.21234")
+    data = None
+
+
+class Program_weight_tensor_parameter_98:
+    name = "parameter_98"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-14.5625")
+    max_val = float("14.75")
+    mean = float("0.00074136")
+    std = float("0.439012")
+    data = None
+
+
+class Program_weight_tensor_parameter_99:
+    name = "parameter_99"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-15.3125")
+    max_val = float("18.875")
+    mean = float("0.00268636")
+    std = float("0.815878")
+    data = None
+
+
+class Program_weight_tensor_parameter_100:
+    name = "parameter_100"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("-0.032959")
+    max_val = float("0.201172")
+    mean = float("0.11948")
+    std = float("0.0203831")
+    data = None
+
+
+class Program_weight_tensor_parameter_101:
+    name = "parameter_101"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-16.375")
+    max_val = float("13.8125")
+    mean = float("0.000326929")
+    std = float("1.02065")
+    data = None
+
+
+class Program_weight_tensor_parameter_102:
+    name = "parameter_102"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.45312")
+    max_val = float("3.75")
+    mean = float("0.00176018")
+    std = float("0.680957")
+    data = None
+
+
+class Program_weight_tensor_parameter_103:
+    name = "parameter_103"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-2.92188")
+    max_val = float("2.78125")
+    mean = float("-6.96151e-05")
+    std = float("0.44855")
+    data = None
+
+
+class Program_weight_tensor_parameter_104:
+    name = "parameter_104"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.390625")
+    max_val = float("0.375")
+    mean = float("-2.9371e-05")
+    std = float("0.0547455")
+    data = None
+
+
+class Program_weight_tensor_parameter_105:
+    name = "parameter_105"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.203125")
+    max_val = float("2.5")
+    mean = float("0.613131")
+    std = float("0.150112")
+    data = None
+
+
+class Program_weight_tensor_parameter_106:
+    name = "parameter_106"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-11.5625")
+    max_val = float("7.3125")
+    mean = float("0.000362642")
+    std = float("0.406834")
+    data = None
+
+
+class Program_weight_tensor_parameter_107:
+    name = "parameter_107"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-25.0")
+    max_val = float("34.5")
+    mean = float("0.00201512")
+    std = float("0.786157")
+    data = None
+
+
+class Program_weight_tensor_parameter_108:
+    name = "parameter_108"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0332031")
+    max_val = float("0.298828")
+    mean = float("0.136475")
+    std = float("0.0285545")
+    data = None
+
+
+class Program_weight_tensor_parameter_109:
+    name = "parameter_109"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-9.8125")
+    max_val = float("11.125")
+    mean = float("0.00299521")
+    std = float("0.807871")
+    data = None
+
+
+class Program_weight_tensor_parameter_110:
+    name = "parameter_110"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-4.125")
+    max_val = float("5.1875")
+    mean = float("-0.000725675")
+    std = float("0.625774")
+    data = None
+
+
+class Program_weight_tensor_parameter_111:
+    name = "parameter_111"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.76562")
+    max_val = float("2.57812")
+    mean = float("0.000150539")
+    std = float("0.378036")
+    data = None
+
+
+class Program_weight_tensor_parameter_112:
+    name = "parameter_112"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.402344")
+    max_val = float("0.375")
+    mean = float("5.61201e-05")
+    std = float("0.0449669")
+    data = None
+
+
+class Program_weight_tensor_parameter_113:
+    name = "parameter_113"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.137695")
+    max_val = float("1.57031")
+    mean = float("0.405777")
+    std = float("0.124192")
+    data = None
+
+
+class Program_weight_tensor_parameter_114:
+    name = "parameter_114"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-9.5")
+    max_val = float("15.0")
+    mean = float("-2.43034e-05")
+    std = float("0.360743")
+    data = None
+
+
+class Program_weight_tensor_parameter_115:
+    name = "parameter_115"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-16.75")
+    max_val = float("19.625")
+    mean = float("0.000799844")
+    std = float("0.758569")
+    data = None
+
+
+class Program_weight_tensor_parameter_116:
+    name = "parameter_116"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0291748")
+    max_val = float("0.361328")
+    mean = float("0.102077")
+    std = float("0.0278308")
+    data = None
+
+
+class Program_weight_tensor_parameter_117:
+    name = "parameter_117"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-12.875")
+    max_val = float("12.875")
+    mean = float("0.00135863")
+    std = float("0.811788")
+    data = None
+
+
+class Program_weight_tensor_parameter_118:
+    name = "parameter_118"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.5")
+    max_val = float("3.28125")
+    mean = float("-3.46814e-05")
+    std = float("0.590862")
+    data = None
+
+
+class Program_weight_tensor_parameter_119:
+    name = "parameter_119"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.17188")
+    max_val = float("2.71875")
+    mean = float("-0.000842045")
+    std = float("0.46493")
+    data = None
+
+
+class Program_weight_tensor_parameter_120:
+    name = "parameter_120"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.392578")
+    max_val = float("0.492188")
+    mean = float("-3.10072e-05")
+    std = float("0.0599718")
+    data = None
+
+
+class Program_weight_tensor_parameter_121:
+    name = "parameter_121"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0737305")
+    max_val = float("3.6875")
+    mean = float("0.30554")
+    std = float("0.221261")
+    data = None
+
+
+class Program_weight_tensor_parameter_122:
+    name = "parameter_122"
+    shape = [2048, 512]
+    dtype = "float32"
+    min_val = float("-8.1875")
+    max_val = float("8.9375")
+    mean = float("0.000126899")
+    std = float("0.305008")
+    data = None
+
+
+class Program_weight_tensor_parameter_123:
+    name = "parameter_123"
+    shape = [512, 2048]
+    dtype = "float32"
+    min_val = float("-16.625")
+    max_val = float("14.5")
+    mean = float("-0.000737352")
+    std = float("0.615552")
+    data = None
+
+
+class Program_weight_tensor_parameter_124:
+    name = "parameter_124"
+    shape = [512]
+    dtype = "float32"
+    min_val = float("0.0388184")
+    max_val = float("0.380859")
+    mean = float("0.0923548")
+    std = float("0.0335667")
+    data = None
+
+
+class Program_weight_tensor_parameter_125:
+    name = "parameter_125"
+    shape = [32, 8]
+    dtype = "float32"
+    min_val = float("-10.8125")
+    max_val = float("6.125")
+    mean = float("-0.748865")
+    std = float("2.72737")
+    data = None
+
+
+class Program_weight_tensor_parameter_126:
+    name = "parameter_126"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-9.6875")
+    max_val = float("12.5625")
+    mean = float("-0.00112105")
+    std = float("0.637433")
+    data = None
+
+
+class Program_weight_tensor_parameter_127:
+    name = "parameter_127"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-2.875")
+    max_val = float("2.32812")
+    mean = float("0.000568047")
+    std = float("0.449698")
+    data = None
+
+
+class Program_weight_tensor_parameter_128:
+    name = "parameter_128"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-3.5")
+    max_val = float("4.0")
+    mean = float("-6.72496e-06")
+    std = float("0.4878")
+    data = None
+
+
+class Program_weight_tensor_parameter_129:
+    name = "parameter_129"
+    shape = [512, 512]
+    dtype = "float32"
+    min_val = float("-0.376953")
+    max_val = float("0.439453")
+    mean = float("0.000162044")
+    std = float("0.0597395")
+    data = None
+
+
+class Program_weight_tensor_parameter_130:
+    name = "parameter_130"
+    shape = [32128, 512]
+    dtype = "float32"
+    min_val = float("-792.0")
+    max_val = float("348.0")
+    mean = float("0.122392")
+    std = float("23.1937")
+    data = None

From e7151531c22a51da5bbb463a3a887dc423324480 Mon Sep 17 00:00:00 2001
From: RbRe145 <czheng12399@outlook.com>
Date: Fri, 26 Sep 2025 02:55:01 +0000
Subject: [PATCH 4/4] fix nlp_getter f format

---
 graph_net/test/nlp_model_getter.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/graph_net/test/nlp_model_getter.py b/graph_net/test/nlp_model_getter.py
index d795f7e30..151863137 100644
--- a/graph_net/test/nlp_model_getter.py
+++ b/graph_net/test/nlp_model_getter.py
@@ -229,7 +229,6 @@ def get_albert_model_and_inputs(model_name, text, dtype):
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.unk_token
 
-    # 4) 构造输入（支持 str 或 List[str]）
     enc = tokenizer(
         text,
         return_tensors="pd",
@@ -238,7 +237,6 @@ def get_albert_model_and_inputs(model_name, text, dtype):
         max_length=512,
     )
 
-    # 显式补 attention_mask（pad 处为 0）
     if "attention_mask" not in enc:
         input_ids = enc["input_ids"]
         enc["attention_mask"] = (input_ids != tokenizer.pad_token_id).astype("int64")