Merge pull request #294 from Tencent/jiaruifang/accurate_timer

feifeibear · web-flow · commit b1265ee37cba · 2021-12-23T19:37:29.000+08:00
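Make the profiling timer accurate: synchronize the current CUDA stream in finish_profile before accumulating the elapsed time.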
diff --git a/CHANGE_LOG.md b/CHANGE_LOG.md
@@ -1,5 +1,8 @@
-## v0.4.5 Dec. 2021
+## v0.4.6 Dec. 2021
-refactory the files in example and adding chunk size searching.
+Evaluate on 8 nodes of SuperPod. Fix bugs in multi-GPU mem tracer.
+
+## v0.4.5 Dec. 2021
+Refactor the files in example and add chunk size searching.
 
 
 ### v0.4.4 Dec. 2021
diff --git a/patrickstar/utils/global_timer.py b/patrickstar/utils/global_timer.py
@@ -28,6 +28,7 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import time
+import torch
 
 # from .logging import logger
 from .singleton_meta import SingletonMeta
@@ -57,6 +58,7 @@ def start_profile(self, key):
     def finish_profile(self, key):
         if not self.start_flag:
             return
+        torch.cuda.current_stream().synchronize()
         if key in self.elapse_stat:
             self.elapse_stat[key] += time.time() - self.start_time[key]
         else:
diff --git a/setup.py b/setup.py
@@ -41,7 +41,7 @@ def fetch_requirements(path):
 
 setup(
     name="patrickstar",
-    version="0.4.5",
+    version="0.4.6",
     description="PatrickStart library",
     long_description="PatrickStar: Parallel Training of Large Language Models via a Chunk-based Parameter Server",
     long_description_content_type="text/markdown",