Skip to content

Commit ec6ccbc

Browse files
committed
Fix model unittest CI
1 parent da34e8c commit ec6ccbc

File tree

3 files changed

+19 -8 lines changed

3 files changed

+19 -8 lines changed

.github/workflows/model-unittest-gpu.yml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ jobs:
5555
run: |
5656
container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
5757
echo "container_name=${container_name}" >> "$GITHUB_ENV"
58-
docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
58+
docker run -d -t --gpus all --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
5959
-v $work_dir/../../..:$work_dir/../../.. \
6060
-v $work_dir:/workspace \
6161
-v /home/.cache/pip:/home/.cache/pip \
@@ -73,6 +73,7 @@ jobs:
7373
-e HF_PROXY_PATH=$work_dir/../../../proxy_huggingface \
7474
-e AISTUDIO_PROXY_PATH=$work_dir/../../../proxy_aistudio \
7575
-w /workspace --runtime=nvidia --privileged $IMAGE_NAME
76+
docker exec -it $container_name nvidia-smi
7677
7778
- name: Download Code
7879
run: |
@@ -82,6 +83,7 @@ jobs:
8283
wget -q --no-proxy https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
8384
echo "Extracting PaddleFormers.tar"
8485
tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
86+
echo "work_dir = ${work_dir}"
8587
source $work_dir/../../../proxy
8688
cd PaddleFormers
8789
git config --global user.name "PaddleCI"
@@ -109,9 +111,17 @@ jobs:
109111
set -e
110112
rm -rf /root/.cache/aistudio/
111113
cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
114+
echo "work_dir = ${work_dir}"
112115
source $work_dir/../../../proxy
113116
source $work_dir/../../../AISTUDIO_ACCESS_TOKEN
114117
cp -r $work_dir/../../../models ./models
118+
echo "Check models:"
119+
ls -l ./models
120+
echo "Check Cuda Version"
121+
python -c "import paddle; print(paddle.version.cuda()); print(paddle.version.cudnn()); print(paddle.is_compiled_with_cuda())"
122+
nvcc -V
123+
cat /usr/local/cuda/version.txt
124+
echo "Test Start"
115125
timeout 30m bash scripts/regression/ci_model_unittest.sh ${paddle_whl}
116126
'
117127

scripts/regression/test_dpo.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def test_dpo_full(self):
119119
train_path,
120120
updated_config_path,
121121
]
122-
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
122+
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
123123

124124
# test training result
125125
self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -153,7 +153,7 @@ def test_dpo_lora(self):
153153
train_path,
154154
updated_config_path,
155155
]
156-
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
156+
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
157157

158158
# test training result
159159
self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -178,7 +178,7 @@ def test_dpo_lora(self):
178178
lora_merge_output_dir,
179179
]
180180
lora_merge_p = subprocess.run(
181-
lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150
181+
lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
182182
)
183183
self.dpotrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)
184184

scripts/regression/test_sft.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,8 @@ def test_sft_full(self):
123123
train_path,
124124
updated_config_path,
125125
]
126-
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
126+
print(f"cmd {cmd}")
127+
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
127128

128129
# test training result
129130
self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -133,7 +134,7 @@ def test_sft_full(self):
133134
self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)
134135

135136
# test model resume
136-
# reusme_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
137+
# reusme_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
137138
# self.sfttrain_tester.assert_result(reusme_p.returncode, reusme_p.stdout)
138139

139140
# EXCEPTED_LOSS = 9.550503
@@ -172,7 +173,7 @@ def test_sft_lora(self):
172173
train_path,
173174
updated_config_path,
174175
]
175-
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
176+
training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
176177

177178
# test training result
178179
self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -201,7 +202,7 @@ def test_sft_lora(self):
201202
lora_merge_output_dir,
202203
]
203204
lora_merge_p = subprocess.run(
204-
lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150
205+
lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
205206
)
206207
self.sfttrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)
207208

0 commit comments

Comments
 (0)