5 | 5 | import numpy as np |
6 | 6 | import pyarrow as pa |
7 | 7 | import pyarrow.parquet as pq |
8 | | -import torch |
9 | 8 | import tqdm |
10 | 9 | from torch.utils.data import IterableDataset, get_worker_info |
11 | 10 | from torchdata.stateful_dataloader import StatefulDataLoader |
12 | 11 |
13 | 12 | from fastvideo.v1.dataset.utils import collate_latents_embs_masks |
14 | | -from fastvideo.v1.distributed import (get_sp_world_size, get_world_rank, |
15 | | - get_world_size) |
| 13 | +from fastvideo.v1.distributed import (get_sp_world_size, get_world_group, |
| 14 | + get_world_rank, get_world_size) |
16 | 15 | from fastvideo.v1.logger import init_logger |
17 | 16 |
18 | 17 | logger = init_logger(__name__) |
@@ -157,95 +156,123 @@ def shard_parquet_files_across_sp_groups_and_workers( |
157 | 156 | # Check if sharding plan already exists |
158 | 157 | sharding_info_dir = os.path.join( |
159 | 158 | path, f"sharding_info_{num_sp_groups}_sp_groups_{num_workers}_workers") |
160 | | - if os.path.exists(sharding_info_dir): |
161 | | - logger.info("Sharding plan already exists") |
162 | | - logger.info("Loading sharding plan from %s", sharding_info_dir) |
163 | | - try: |
| 159 | + |
| 160 | + # Only rank 0 handles cache checking and file scanning |
| 161 | + if get_world_rank() == 0: |
| 162 | + cache_loaded = False |
| 163 | + shard_parquet_files = None |
| 164 | + shard_total_samples = None |
| 165 | + shard_parquet_lengths = None |
| 166 | + |
| 167 | + # First try to load existing sharding plan |
| 168 | + if os.path.exists(sharding_info_dir): |
| 169 | + logger.info("Loading sharding plan from %s", sharding_info_dir) |
| 170 | + try: |
| 171 | + with open( |
| 172 | + os.path.join(sharding_info_dir, |
| 173 | + "shard_parquet_files.pkl"), "rb") as f: |
| 174 | + shard_parquet_files = pickle.load(f) |
| 175 | + with open( |
| 176 | + os.path.join(sharding_info_dir, |
| 177 | + "shard_total_samples.pkl"), "rb") as f: |
| 178 | + shard_total_samples = pickle.load(f) |
| 179 | + with open( |
| 180 | + os.path.join(sharding_info_dir, |
| 181 | + "shard_parquet_lengths.pkl"), "rb") as f: |
| 182 | + shard_parquet_lengths = pickle.load(f) |
| 183 | + cache_loaded = True |
| 184 | + logger.info("Successfully loaded sharding plan") |
| 185 | + except Exception as e: |
| 186 | + logger.error("Error loading sharding plan: %s", str(e)) |
| 187 | + logger.info("Falling back to creating new sharding plan") |
| 188 | + cache_loaded = False |
| 189 | + |
| 190 | + # If cache not loaded (either doesn't exist or failed to load), create sharding plan |
| 191 | + if not cache_loaded: |
| 192 | + logger.info("Creating new sharding plan") |
| 193 | + logger.info("Scanning for parquet files in %s", path) |
| 194 | + |
| 195 | + # Find all parquet files |
| 196 | + parquet_files = [] |
| 197 | + |
| 198 | + for root, _, files in os.walk(path): |
| 199 | + for file in files: |
| 200 | + if file.endswith('.parquet'): |
| 201 | + parquet_files.append(os.path.join(root, file)) |
| 202 | + |
| 203 | + if not parquet_files: |
| 204 | +                raise ValueError(f"No parquet files found in {path}")
| 205 | + |
| 206 | + # Calculate file lengths efficiently using a single pass |
| 207 | + logger.info("Calculating file lengths...") |
| 208 | + lengths = [] |
| 209 | + for file in tqdm.tqdm(parquet_files, desc="Reading parquet files"): |
| 210 | + lengths.append(pq.ParquetFile(file).metadata.num_rows) |
| 211 | + |
| 212 | + total_samples = sum(lengths) |
| 213 | + logger.info("Found %d files with %d total samples", |
| 214 | + len(parquet_files), total_samples) |
| 215 | + |
| 216 | + # Sort files by length for better balancing |
| 217 | + sorted_indices = np.argsort(lengths) |
| 218 | + sorted_files = [parquet_files[i] for i in sorted_indices] |
| 219 | + sorted_lengths = [lengths[i] for i in sorted_indices] |
| 220 | + |
| 221 | + # Create shards |
| 222 | + num_shards = num_sp_groups * num_workers |
| 223 | + shard_parquet_files = [[] for _ in range(num_shards)] |
| 224 | + shard_total_samples = [0] * num_shards |
| 225 | + shard_parquet_lengths = [{} for _ in range(num_shards)] |
| 226 | + |
| 227 | + # Distribute files to shards using a greedy approach |
| 228 | + logger.info("Distributing files to shards...") |
| 229 | + for file, length in zip(reversed(sorted_files), |
| 230 | + reversed(sorted_lengths), |
| 231 | + strict=True): |
| 232 | + # Find shard with minimum current length |
| 233 | + target_shard = np.argmin(shard_total_samples) |
| 234 | + shard_parquet_files[target_shard].append(file) |
| 235 | + shard_total_samples[target_shard] += length |
| 236 | + shard_parquet_lengths[target_shard][file] = length |
| 237 | +            # Randomize the file order within each shard (seeded for reproducibility)
| 238 | + for shard in shard_parquet_files: |
| 239 | + random.seed(seed) |
| 240 | + random.shuffle(shard) |
| 241 | + |
| 242 | + # Save the sharding plan |
| 243 | + os.makedirs(sharding_info_dir, exist_ok=True) |
164 | 244 | with open( |
165 | 245 | os.path.join(sharding_info_dir, "shard_parquet_files.pkl"), |
166 | | - "rb") as f: |
167 | | - shard_parquet_files = pickle.load(f) |
| 246 | + "wb") as f: |
| 247 | + pickle.dump(shard_parquet_files, f) |
168 | 248 | with open( |
169 | 249 | os.path.join(sharding_info_dir, "shard_total_samples.pkl"), |
170 | | - "rb") as f: |
171 | | - shard_total_samples = pickle.load(f) |
| 250 | + "wb") as f: |
| 251 | + pickle.dump(shard_total_samples, f) |
172 | 252 | with open( |
173 | 253 | os.path.join(sharding_info_dir, |
174 | | - "shard_parquet_lengths.pkl"), "rb") as f: |
175 | | - shard_parquet_lengths = pickle.load(f) |
176 | | - return shard_parquet_files, shard_total_samples, shard_parquet_lengths |
177 | | - except Exception as e: |
178 | | - logger.error("Error loading sharding plan: %s", str(e)) |
179 | | - logger.info("Falling back to creating new sharding plan") |
180 | | - |
181 | | - if get_world_rank() == 0: |
182 | | - logger.info("Scanning for parquet files in %s", path) |
183 | | - |
184 | | - # Find all parquet files |
185 | | - parquet_files = [] |
186 | | - |
187 | | - for root, _, files in os.walk(path): |
188 | | - for file in files: |
189 | | - if file.endswith('.parquet'): |
190 | | - parquet_files.append(os.path.join(root, file)) |
191 | | - |
192 | | - if not parquet_files: |
193 | | - raise ValueError("No parquet files found in %s", path) |
194 | | - |
195 | | - # Calculate file lengths efficiently using a single pass |
196 | | - logger.info("Calculating file lengths...") |
197 | | - lengths = [] |
198 | | - for file in tqdm.tqdm(parquet_files, desc="Reading parquet files"): |
199 | | - lengths.append(pq.ParquetFile(file).metadata.num_rows) |
200 | | - |
201 | | - total_samples = sum(lengths) |
202 | | - logger.info("Found %d files with %d total samples", len(parquet_files), |
203 | | - total_samples) |
204 | | - |
205 | | - # Sort files by length for better balancing |
206 | | - sorted_indices = np.argsort(lengths) |
207 | | - sorted_files = [parquet_files[i] for i in sorted_indices] |
208 | | - sorted_lengths = [lengths[i] for i in sorted_indices] |
209 | | - |
210 | | - # Create shards |
211 | | - num_shards = num_sp_groups * num_workers |
212 | | - shard_parquet_files = [[] for _ in range(num_shards)] |
213 | | - shard_total_samples = [0] * num_shards |
214 | | - shard_parquet_lengths = [{} for _ in range(num_shards)] |
215 | | - |
216 | | - # Distribute files to shards using a greedy approach |
217 | | - logger.info("Distributing files to shards...") |
218 | | - for file, length in zip(reversed(sorted_files), |
219 | | - reversed(sorted_lengths), |
220 | | - strict=True): |
221 | | - # Find shard with minimum current length |
222 | | - target_shard = np.argmin(shard_total_samples) |
223 | | - shard_parquet_files[target_shard].append(file) |
224 | | - shard_total_samples[target_shard] += length |
225 | | - shard_parquet_lengths[target_shard][file] = length |
226 | | - #randomize each shard |
227 | | - for shard in shard_parquet_files: |
228 | | - random.seed(seed) |
229 | | - random.shuffle(shard) |
230 | | - |
231 | | - save_dir = os.path.join( |
232 | | - path, |
233 | | - f"sharding_info_{num_sp_groups}_sp_groups_{num_workers}_workers") |
234 | | - os.makedirs(save_dir, exist_ok=True) |
235 | | - with open(os.path.join(save_dir, "shard_parquet_files.pkl"), "wb") as f: |
236 | | - pickle.dump(shard_parquet_files, f) |
237 | | - with open(os.path.join(save_dir, "shard_total_samples.pkl"), "wb") as f: |
238 | | - pickle.dump(shard_total_samples, f) |
239 | | - with open(os.path.join(save_dir, "shard_parquet_lengths.pkl"), |
240 | | - "wb") as f: |
241 | | - pickle.dump(shard_parquet_lengths, f) |
242 | | - logger.info("Saved sharding info to %s", save_dir) |
243 | | - |
244 | | - # wait for all ranks to finish |
245 | | - torch.distributed.barrier() |
246 | | - # recursive call |
247 | | - return shard_parquet_files_across_sp_groups_and_workers( |
248 | | - path, num_sp_groups, num_workers, seed) |
| 254 | + "shard_parquet_lengths.pkl"), "wb") as f: |
| 255 | + pickle.dump(shard_parquet_lengths, f) |
| 256 | + logger.info("Saved sharding info to %s", sharding_info_dir) |
| 257 | + |
| 258 | + # Wait for rank 0 to finish creating/loading sharding plan |
| 259 | + world_group = get_world_group() |
| 260 | + world_group.barrier() |
| 261 | + |
| 262 | + # Now all ranks load the sharding plan (it should exist and be valid now) |
| 263 | + logger.info("Loading sharding plan from %s after barrier", |
| 264 | + sharding_info_dir) |
| 265 | + with open(os.path.join(sharding_info_dir, "shard_parquet_files.pkl"), |
| 266 | + "rb") as f: |
| 267 | + shard_parquet_files = pickle.load(f) |
| 268 | + with open(os.path.join(sharding_info_dir, "shard_total_samples.pkl"), |
| 269 | + "rb") as f: |
| 270 | + shard_total_samples = pickle.load(f) |
| 271 | + with open(os.path.join(sharding_info_dir, "shard_parquet_lengths.pkl"), |
| 272 | + "rb") as f: |
| 273 | + shard_parquet_lengths = pickle.load(f) |
| 274 | + |
| 275 | + return shard_parquet_files, shard_total_samples, shard_parquet_lengths |
249 | 276 |
250 | 277 |
251 | 278 | def build_parquet_iterable_style_dataloader( |
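Reviewer note: the distribution logic above is a greedy largest-first balancing pass — files are sorted by row count and each file is assigned to the shard with the fewest samples so far. The standalone sketch below shows the same idea in isolation; the file names and row counts are made up for illustration and are not part of the patch.

```python
# Minimal sketch of the greedy balancing used in the patch: assign each file,
# largest first, to the shard that currently holds the fewest samples.
# File names and row counts are illustrative only.

def greedy_shard(files_with_rows, num_shards):
    shards = [[] for _ in range(num_shards)]
    shard_sizes = [0] * num_shards
    # Place big files first so they get spread across shards early.
    for path, rows in sorted(files_with_rows, key=lambda fr: fr[1], reverse=True):
        target = shard_sizes.index(min(shard_sizes))  # lightest shard so far
        shards[target].append(path)
        shard_sizes[target] += rows
    return shards, shard_sizes


files = [("a.parquet", 1000), ("b.parquet", 800), ("c.parquet", 300),
         ("d.parquet", 250), ("e.parquet", 100)]
shards, sizes = greedy_shard(files, num_shards=2)
print(shards)  # [['a.parquet', 'd.parquet'], ['b.parquet', 'c.parquet', 'e.parquet']]
print(sizes)   # [1250, 1200]
```

This is the classic longest-first (LPT) heuristic: it does not guarantee an optimal split, but it keeps shard sizes close to the average when file lengths vary widely. On ties, `np.argmin` in the patch and `index(min(...))` here both pick the lowest-indexed shard, so the result is deterministic.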
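The other behavioral change is the coordination around the cached plan: only rank 0 scans the dataset and writes the pickles, every rank waits on a barrier, and then all ranks (including rank 0) read the plan back from disk, replacing the earlier recursive call after `torch.distributed.barrier()`. Below is a minimal sketch of that pattern using plain `torch.distributed` with a hypothetical cache file; the patch itself synchronizes through FastVideo's `get_world_group().barrier()` wrapper instead.

```python
import os
import pickle

import torch.distributed as dist


def build_or_load_plan(cache_file: str):
    """Rank 0 materializes an artifact once; all ranks read it after a barrier.

    Assumes the default process group is already initialized and that
    cache_file lives on a filesystem shared by all ranks.
    """
    if dist.get_rank() == 0 and not os.path.exists(cache_file):
        plan = {"shards": [["a.parquet"], ["b.parquet"]]}  # placeholder plan
        os.makedirs(os.path.dirname(cache_file) or ".", exist_ok=True)
        with open(cache_file, "wb") as f:
            pickle.dump(plan, f)
    dist.barrier()  # no rank opens the file before rank 0 has written it
    with open(cache_file, "rb") as f:
        return pickle.load(f)
```

Reading the plan back from disk on every rank, rather than returning rank 0's in-memory copy, keeps the code path identical across ranks and avoids broadcasting Python objects.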