Skip to content

Commit b3cb2f9

Browse files
committed
Removed .in-progress (and key_folder)!
1 parent 3957226 commit b3cb2f9

File tree

4 files changed

+35
-26
lines changed

4 files changed

+35
-26
lines changed

wsds/utils.py

Lines changed: 7 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ def find_first_shard(path):
3838
return None
3939

4040

41-
def list_all_columns(ds_path, shard_name=None, include_in_progress=True, key_folder=None):
41+
def list_all_columns(ds_path, shard_name=None):
4242
"""Given a dataset path, return a list of all columns.
4343
4444
If you also give a shard name it greatly speeds it up
@@ -53,13 +53,12 @@ def list_all_columns(ds_path, shard_name=None, include_in_progress=True, key_fol
5353
continue
5454
if not p.is_dir():
5555
continue
56-
is_in_progress = p.suffix == ".in-progress"
57-
if is_in_progress and not include_in_progress:
58-
continue
59-
if shard_name is None or is_in_progress:
56+
if shard_name is None:
6057
fname = find_first_shard(p)
6158
else:
6259
fname = (p / shard_name).with_suffix(".wsds")
60+
if not fname.exists():
61+
fname = find_first_shard(p)
6362
if fname and fname.exists():
6463
try:
6564
columns = get_columns(fname)
@@ -68,10 +67,8 @@ def list_all_columns(ds_path, shard_name=None, include_in_progress=True, key_fol
6867
continue
6968
for col in columns:
7069
if col == "__key__":
71-
if not is_in_progress or key_folder == fname.parent.name:
72-
# We need a subdir that has all shards but we don't wanna list all of them (that's expensive)
73-
# so instead we rely on a subdir naming convention (the .in-progress suffix) and never use these
74-
key_col.append((fname.stat().st_size, p.name, col))
70+
# List all potential __key__ columns (they should be in each shard)
71+
key_col.append((fname.stat().st_size, p.name, col))
7572
continue
7673
# seems like we should fix this during the original conversion
7774
if col in cols or col in dupes:
@@ -84,10 +81,7 @@ def list_all_columns(ds_path, shard_name=None, include_in_progress=True, key_fol
8481
else:
8582
cols[col] = (p.name, col)
8683
# use the smallest shards for __key__ (should be the fastest)
87-
if key_folder is not None:
88-
cols["__key__"] = next(col for col in key_col if key_folder == col[1])[1:]
89-
elif len(key_col) > 0:
90-
cols["__key__"] = sorted(key_col)[0][1:]
84+
cols["__key__"] = [x[1:] for x in sorted(key_col)]
9185
return dict(sorted(cols.items()))
9286

9387

wsds/ws_dataset.py

Lines changed: 26 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,11 @@ class WSDataset:
5656
def __init__(self, dataset_dir: str | Path, include_in_progress: bool = True, key_folder: str | None = None, disable_memory_map: bool = False):
5757
self.dataset_dir = self._resolve_path(dataset_dir)
5858

59+
if include_in_progress is not True:
60+
print("NOTE: include_in_progress is deprecated and all subdirs are included by default")
61+
if key_folder is not None:
62+
print("NOTE: key_folder is deprecated and key folder is selected automatically")
63+
5964
self.index = None
6065
self.segmented = False
6166
self.disable_memory_map = disable_memory_map
@@ -71,12 +76,8 @@ def __init__(self, dataset_dir: str | Path, include_in_progress: bool = True, ke
7176
self.fields = meta['fields']
7277
else:
7378
dataset_path, shard_name = next(self.index.shards()) if self.index else ("", None)
74-
self.fields = list_all_columns(
75-
self.dataset_dir / dataset_path, shard_name, include_in_progress=include_in_progress
76-
)
77-
self.fields.update(list_all_columns(
78-
self.dataset_dir, include_in_progress=include_in_progress, key_folder=key_folder
79-
))
79+
self.fields = list_all_columns(self.dataset_dir / dataset_path, shard_name)
80+
self.fields.update(list_all_columns(self.dataset_dir))
8081
if 'computed_columns' in meta:
8182
self.computed_columns = meta['computed_columns']
8283
else:
@@ -254,14 +255,19 @@ def _parse_sql_queries_polars(self, *queries, shard_subsample=1, rng=None, shard
254255
# __key__ exists in all shards
255256
needed_special_columns.append(col)
256257
continue
257-
subdir, field = self.fields[col]
258+
value = self.fields[col]
259+
# FIXME: figure out a way to handle all candidates for __key__
260+
if isinstance(value[0], str):
261+
subdir, field = value
262+
else:
263+
subdir, field = value[0]
258264
assert col == field, "renamed fields are not supported in SQL queries yet"
259265
subdirs[subdir].append(field)
260266
exprs.append(expr)
261267

262268
# If only __key__ is in the query, we need to load shards from at least one subdir
263269
key_value = self.fields["__key__"]
264-
key_subdir = key_value[0]
270+
key_subdir = key_value[0] if isinstance(key_value[0], str) else key_value[0][0]
265271
if needed_special_columns:
266272
if subdirs:
267273
key_subdir = list(subdirs.keys())[0]
@@ -439,7 +445,8 @@ def get_shard_path(self, subdir, shard_name):
439445
return (Path(dir) / shard_name).with_suffix(".wsds")
440446

441447
def _register_wsds_links(self):
442-
for subdir, _ in self.fields.values():
448+
for value in self.fields.values():
449+
subdir = value[0] if isinstance(value[0], str) else value[0][0]
443450
if subdir.endswith(".wsds-link"):
444451
spec = json.loads((self.dataset_dir / subdir).read_text())
445452
self.computed_columns[subdir] = spec
@@ -481,8 +488,16 @@ def get_shard(self, subdir, shard_name):
481488
return shard
482489

483490
def get_sample(self, shard_name, field, offset):
484-
subdir, column = self.fields[field]
485-
return self.get_shard(subdir, shard_name).get_sample(column, offset)
491+
value = self.fields[field]
492+
alternatives = [value] if isinstance(value[0], str) else value
493+
last_err = None
494+
for subdir, column in alternatives:
495+
try:
496+
return self.get_shard(subdir, shard_name).get_sample(column, offset)
497+
except WSShardMissingError as e:
498+
last_err = e
499+
continue
500+
raise last_err
486501

487502
def parse_key(self, key):
488503
if self.segmented:

wsds/ws_sample.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,8 @@ def __repr__(self, repr=repr):
8989
if k in self.overrides:
9090
subdir = "__overrides__"
9191
elif k in self.dataset.fields:
92-
subdir, _ = self.dataset.fields[k]
92+
value = self.dataset.fields[k]
93+
subdir = value[0] if isinstance(value[0], str) else value[0][0]
9394
else:
9495
subdir = "__unknown__"
9596
if subdir not in subdir_columns:

wsds/ws_tools.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -448,7 +448,6 @@ def init_split(
448448
source_dataset: Path | None = None,
449449
vad_column: str | None = None,
450450
num_workers: int = 64,
451-
include_in_progress: bool = False,
452451
index_path: str = ".",
453452
):
454453
"""Initialize a new dataset, from scratch or from a segmentation of an existing one."""

0 commit comments

Comments (0)