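Fix model unittest CI: expose GPUs to the test container, add environment diagnostics, and remove the 150 s per-subprocess timeouts in the SFT/DPO regression tests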

huanghengheng · huanghengheng · commit ec6ccbcb9be6 · 2025-09-26T14:47:57.000+08:00
diff --git a/.github/workflows/model-unittest-gpu.yml b/.github/workflows/model-unittest-gpu.yml
@@ -55,7 +55,7 @@ jobs:
         run: |
           container_name=${TASK}-$(date +%Y%m%d-%H%M%S)
           echo "container_name=${container_name}" >> "$GITHUB_ENV"
-          docker run -d -t --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
+          docker run -d -t --gpus all --name ${container_name} --net=host -v /dev/shm:/dev/shm --shm-size=32G \
             -v $work_dir/../../..:$work_dir/../../.. \
             -v $work_dir:/workspace \
             -v /home/.cache/pip:/home/.cache/pip \
@@ -73,6 +73,7 @@ jobs:
             -e HF_PROXY_PATH=$work_dir/../../../proxy_huggingface \
             -e AISTUDIO_PROXY_PATH=$work_dir/../../../proxy_aistudio \
             -w /workspace --runtime=nvidia --privileged $IMAGE_NAME
+          docker exec -it $container_name nvidia-smi
             
       - name: Download Code
         run: |
@@ -82,6 +83,7 @@ jobs:
             wget -q --no-proxy  https://paddle-qa.bj.bcebos.com/CodeSync/develop/PaddleFormers.tar --no-check-certificate
             echo "Extracting PaddleFormers.tar"
             tar xf PaddleFormers.tar && rm -rf PaddleFormers.tar
+            echo "work_dir = ${work_dir}"
             source $work_dir/../../../proxy
             cd PaddleFormers
             git config --global user.name "PaddleCI"
@@ -109,9 +111,17 @@ jobs:
           set -e
           rm -rf /root/.cache/aistudio/
           cd /workspace/PaddleFormers && git config --global --add safe.directory $PWD
+          echo "work_dir = ${work_dir}"
           source $work_dir/../../../proxy
           source $work_dir/../../../AISTUDIO_ACCESS_TOKEN
           cp -r $work_dir/../../../models ./models
+          echo "Check models:"
+          ls -l ./models
+          echo "Check Cuda Version"
+          python -c "import paddle; print(paddle.version.cuda()); print(paddle.version.cudnn()); print(paddle.is_compiled_with_cuda())"
+          nvcc -V  
+          cat /usr/local/cuda/version.txt
+          echo "Test Start"
           timeout 30m bash scripts/regression/ci_model_unittest.sh ${paddle_whl}
           '
           
diff --git a/scripts/regression/test_dpo.py b/scripts/regression/test_dpo.py
@@ -119,7 +119,7 @@ def test_dpo_full(self):
             train_path,
             updated_config_path,
         ]
-        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
+        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
 
         # test training result
         self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -153,7 +153,7 @@ def test_dpo_lora(self):
             train_path,
             updated_config_path,
         ]
-        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
+        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
 
         # test training result
         self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -178,7 +178,7 @@ def test_dpo_lora(self):
             lora_merge_output_dir,
         ]
         lora_merge_p = subprocess.run(
-            lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150
+            lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
         )
         self.dpotrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)
 
diff --git a/scripts/regression/test_sft.py b/scripts/regression/test_sft.py
@@ -123,7 +123,8 @@ def test_sft_full(self):
             train_path,
             updated_config_path,
         ]
-        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
+        print(f"cmd {cmd}")
+        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
 
         # test training result
         self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -133,7 +134,7 @@ def test_sft_full(self):
         self.sfttrain_tester.assert_loss(training_p.stdout, EXCEPTED_LOSS)
 
         # test model resume
-        # reusme_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
+        # reusme_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
         # self.sfttrain_tester.assert_result(reusme_p.returncode, reusme_p.stdout)
 
         # EXCEPTED_LOSS = 9.550503
@@ -172,7 +173,7 @@ def test_sft_lora(self):
             train_path,
             updated_config_path,
         ]
-        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)
+        training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
 
         # test training result
         self.sfttrain_tester.assert_result(training_p.returncode, training_p.stdout)
@@ -201,7 +202,7 @@ def test_sft_lora(self):
             lora_merge_output_dir,
         ]
         lora_merge_p = subprocess.run(
-            lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150
+            lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
         )
         self.sfttrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)
 

Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@ def test_dpo_full(self):`
`119`	`119`	`train_path,`
`120`	`120`	`updated_config_path,`
`121`	`121`	`]`
`122`		`- training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)`
	`122`	`+ training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)`
`123`	`123`
`124`	`124`	`# test training result`
`125`	`125`	`self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)`
`@@ -153,7 +153,7 @@ def test_dpo_lora(self):`
`153`	`153`	`train_path,`
`154`	`154`	`updated_config_path,`
`155`	`155`	`]`
`156`		`- training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150)`
	`156`	`+ training_p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)`
`157`	`157`
`158`	`158`	`# test training result`
`159`	`159`	`self.dpotrain_tester.assert_result(training_p.returncode, training_p.stdout)`
`@@ -178,7 +178,7 @@ def test_dpo_lora(self):`
`178`	`178`	`lora_merge_output_dir,`
`179`	`179`	`]`
`180`	`180`	`lora_merge_p = subprocess.run(`
`181`		`- lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, timeout=150`
	`181`	`+ lora_merge_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True`
`182`	`182`	`)`
`183`	`183`	`self.dpotrain_tester.assert_result(lora_merge_p.returncode, lora_merge_p.stdout)`
`184`	`184`