@@ -41,24 +41,27 @@ def add_to_shard(i, filename):
    all_other_files = all_files.copy()
    needs_multigpu = list(
-        filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
-    )
-    needs_a10g = list(
-        filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
+        filter(lambda x: get_needs_machine(x) == "multigpu", all_files,)
    )
+    # Magic code for torchvision: for some reason, it needs to run after
+    # beginner_source/basics/data_tutorial.py. Very specifically:
+    # https://github.com/pytorch/tutorials/blob/edff1330ca6c198e8e29a3d574bfb4afbe191bfd/beginner_source/basics/data_tutorial.py#L49-L60
+    # So manually add them to the last shard. I think some other files also
+    # work but I'm too lazy to figure out which ones.
+    # add_to_shard(num_shards - 1, "beginner_source/basics/data_tutorial.py")
+    # add_to_shard(num_shards - 1, "intermediate_source/torchvision_tutorial.py")
+    # all_other_files.remove("beginner_source/basics/data_tutorial.py")
+    # all_other_files.remove("intermediate_source/torchvision_tutorial.py")
+
    for filename in needs_multigpu:
        # currently, the only job that has multigpu is the 0th worker,
        # so we'll add all the jobs that need this machine to the 0th worker
        add_to_shard(0, filename)
        all_other_files.remove(filename)
-    for filename in needs_a10g:
-        # currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
-        # so we'll add all the jobs that need this machine to the 1st worker
-        add_to_shard(1, filename)
-        all_other_files.remove(filename)
    sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)

    for filename in sorted_files:
+        # If you don't specify a machine, you get the default
        min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
            0
        ]
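For context, the unchanged tail of the hunk is the core balancing step: files with no special machine requirement are sorted by duration (longest first) and each is appended to whichever shard currently has the smallest accumulated runtime, while shard 0 is reserved for the multigpu jobs pinned above. Below is a minimal, self-contained sketch of that greedy pattern; the `greedy_shard` function, its `durations` dict, and the example numbers are illustrative assumptions, not the repo's actual script or metadata format.

```python
from typing import Dict, List, Tuple

def greedy_shard(durations: Dict[str, float], num_shards: int) -> List[List[str]]:
    """Sketch of duration-based sharding: longest files first,
    each assigned to the currently least-loaded shard."""
    # (total_duration, filenames) per shard
    shards: List[Tuple[float, List[str]]] = [(0.0, []) for _ in range(num_shards)]

    # Place the longest jobs first, while all shards are still roughly balanced.
    for filename in sorted(durations, key=durations.get, reverse=True):
        # Pick the shard with the smallest accumulated duration so far.
        i = min(range(num_shards), key=lambda idx: shards[idx][0])
        total, files = shards[i]
        shards[i] = (total + durations[filename], files + [filename])

    return [files for _, files in shards]

# Example: five files spread across two shards.
print(greedy_shard({"a.py": 300, "b.py": 120, "c.py": 90, "d.py": 60, "e.py": 45}, 2))
```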