
Commit f575b50

Commit message: tc
1 parent 0353137 commit f575b50

File tree

4 files changed: +378 −53 lines changed


.jenkins/build.sh

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 sudo apt-get update
 sudo apt-get install -y pandoc
 
+# export CUBLAS_WORKSPACE_CONFIG=:4096:8
+
 # NS: Path to python runtime should already be part of docker container
 # export PATH=/opt/conda/bin:$PATH
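For context only (this note is not part of the commit): :4096:8 is one of the two values PyTorch documents for CUBLAS_WORKSPACE_CONFIG when deterministic cuBLAS behavior is wanted on CUDA 10.2 or later. A minimal, illustrative Python sketch of how the variable is usually paired with torch.use_deterministic_algorithms:

# Illustrative sketch, not code from this repo: shows why a CI script might
# export CUBLAS_WORKSPACE_CONFIG. PyTorch requires ":4096:8" or ":16:8" here
# before deterministic cuBLAS kernels can be used on CUDA >= 10.2.
import os

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # set before any cuBLAS work

import torch

torch.use_deterministic_algorithms(True)
# Without the variable, CUDA matmul-style ops raise a RuntimeError at call
# time once deterministic algorithms are enabled.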

.jenkins/get_files_to_run.py

Lines changed: 12 additions & 9 deletions
@@ -41,24 +41,27 @@ def add_to_shard(i, filename):
 
 all_other_files = all_files.copy()
 needs_multigpu = list(
-    filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
-)
-needs_a10g = list(
-    filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
+    filter(lambda x: get_needs_machine(x) == "multigpu", all_files,)
 )
+# Magic code for torchvision: for some reason, it needs to run after
+# beginner_source/basics/data_tutorial.py. Very specifically:
+# https://github.com/pytorch/tutorials/blob/edff1330ca6c198e8e29a3d574bfb4afbe191bfd/beginner_source/basics/data_tutorial.py#L49-L60
+# So manually add them to the last shard. I think some other files also
+# work but I'm too lazy to figure out which ones.
+# add_to_shard(num_shards - 1, "beginner_source/basics/data_tutorial.py")
+# add_to_shard(num_shards - 1, "intermediate_source/torchvision_tutorial.py")
+# all_other_files.remove("beginner_source/basics/data_tutorial.py")
+# all_other_files.remove("intermediate_source/torchvision_tutorial.py")
+
 for filename in needs_multigpu:
     # currently, the only job that has multigpu is the 0th worker,
     # so we'll add all the jobs that need this machine to the 0th worker
     add_to_shard(0, filename)
     all_other_files.remove(filename)
-for filename in needs_a10g:
-    # currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
-    # so we'll add all the jobs that need this machine to the 1st worker
-    add_to_shard(1, filename)
-    all_other_files.remove(filename)
 sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
 
 for filename in sorted_files:
+    # If you don't specify a machine, you get the default
     min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
         0
     ]
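The loop above is a greedy longest-duration-first assignment: files that do not need a special machine are sorted by duration, and each one goes to whichever of shards 1..num_shards-1 currently has the smallest running total, with shard 0 reserved for multigpu jobs. A self-contained sketch of the same idea; the function name, file names, and durations below are made up for illustration and are not the script's actual code:

# Standalone sketch of greedy "longest job first" sharding, mirroring the
# logic in get_files_to_run.py; inputs here are hypothetical.
def calculate_shards_sketch(durations, num_shards=3):
    # each shard tracks [total_duration, [files]]; shard 0 stays reserved
    # for multigpu jobs, so regular files only go to shards 1..num_shards-1
    shards = [[0.0, []] for _ in range(num_shards)]

    for name, duration in sorted(durations.items(), key=lambda kv: kv[1], reverse=True):
        # pick the non-reserved shard with the smallest running total
        idx = min(range(1, num_shards), key=lambda i: shards[i][0])
        shards[idx][0] += duration
        shards[idx][1].append(name)
    return shards


print(calculate_shards_sketch({"a.py": 300, "b.py": 120, "c.py": 100, "d.py": 80}))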
