
Commit c89139a

Author: FelixAbrahamsson
Commit message: Add Dataset.group_split
1 parent 959258e

File tree: 4 files changed (+147, −10 lines)

datastream/dataset.py

Lines changed: 102 additions & 4 deletions
@@ -256,6 +256,59 @@ def split(
             ).items()
         }

+    def group_split(
+        self,
+        split_column: str,
+        proportions: Dict[str, float],
+        filepath: Optional[Union[str, Path]] = None,
+        frozen: Optional[bool] = False,
+        seed: Optional[int] = None,
+    ) -> Dict[str, Dataset[T]]:
+        '''
+        Similar to :func:`Dataset.split`, but uses a non-unique split column
+        instead of a unique key column. This is useful for example when you
+        have a dataset with examples that come from separate sources and you
+        don't want to have examples from the same source in different splits.
+        Does not support stratification.
+
+        >>> split_file = Path('doctest_split_dataset.json')
+        >>> split_datasets = (
+        ...     Dataset.from_dataframe(pd.DataFrame(dict(
+        ...         source=np.arange(100) // 4,
+        ...         number=np.random.randn(100),
+        ...     )))
+        ...     .group_split(
+        ...         split_column='source',
+        ...         proportions=dict(train=0.8, test=0.2),
+        ...         filepath=split_file,
+        ...     )
+        ... )
+        >>> len(split_datasets['train'])
+        80
+        >>> split_file.unlink() # clean up after doctest
+        '''
+        if filepath is not None:
+            filepath = Path(filepath)
+
+        split_dataframes = tools.group_split_dataframes
+        if seed is not None:
+            split_dataframes = tools.numpy_seed(seed)(split_dataframes)
+
+        return {
+            split_name: Dataset(
+                dataframe=dataframe,
+                length=len(dataframe),
+                functions=self.functions,
+            )
+            for split_name, dataframe in split_dataframes(
+                self.dataframe,
+                split_column,
+                proportions,
+                filepath,
+                frozen,
+            ).items()
+        }
+
     def zip_index(self: Dataset[T]) -> Dataset[Tuple[T, int]]:
         '''
         Zip the output with its index. The output of the pipeline will be
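
The guarantee group_split adds over split is that all rows sharing a value in split_column land in the same part. A minimal sketch checking that guarantee (not part of the commit; it assumes the datastream package with this change is importable and that Dataset exposes its dataframe attribute, as the constructor calls above suggest):

import numpy as np
import pandas as pd
from datastream import Dataset

# 25 groups of 4 rows each; rows of a group must stay together.
dataset = Dataset.from_dataframe(pd.DataFrame(dict(
    source=np.arange(100) // 4,
    number=np.random.randn(100),
)))
splits = dataset.group_split(
    split_column='source',
    proportions=dict(train=0.8, test=0.2),
)
# No group id may appear in both parts.
train_groups = set(splits['train'].dataframe['source'])
test_groups = set(splits['test'].dataframe['source'])
assert train_groups.isdisjoint(test_groups)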
@@ -343,10 +396,10 @@ def from_combine(index):
     def create_to_combine_mapping(datasets):
         cumprod_lengths = np.cumprod(list(map(len, datasets)))
         def to_concat(inner_indices):
-            return inner_indices[0] + sum(
-                [inner_index * cumprod_lengths[i]
-                for i, inner_index in enumerate(inner_indices[1:])]
-            )
+            return inner_indices[0] + sum([
+                inner_index * cumprod_lengths[i]
+                for i, inner_index in enumerate(inner_indices[1:])
+            ])
         return to_concat

     @staticmethod
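
The to_concat change above is a cosmetic reflow, but the arithmetic is worth spelling out: cumprod_lengths makes the per-dataset indices behave like mixed-radix digits of a single flat index over all combinations. A self-contained illustration with made-up lengths:

import numpy as np

lengths = [2, 3, 4]
cumprod_lengths = np.cumprod(lengths)  # [2, 6, 24]

def to_concat(inner_indices):
    # Same expression as in the diff: the first index plus each later
    # index scaled by the product of the preceding dataset lengths.
    return inner_indices[0] + sum([
        inner_index * cumprod_lengths[i]
        for i, inner_index in enumerate(inner_indices[1:])
    ])

# (1, 2, 3) -> 1 + 2 * 2 + 3 * 6 = 23, the last of 2 * 3 * 4 = 24 combinations.
assert to_concat((1, 2, 3)) == 23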
@@ -548,3 +601,48 @@ def test_split_dataset():
     assert split_datasets1 != split_datasets3
     assert split_datasets3 == split_datasets4
     assert split_datasets3 != split_datasets5
+
+
+def test_group_split_dataset():
+    dataset = Dataset.from_dataframe(pd.DataFrame(dict(
+        group=np.arange(100) // 4,
+        number=np.random.randn(100),
+    ))).map(tuple)
+
+    split_file = Path('test_split_dataset.json')
+    proportions = dict(
+        gradient=0.7,
+        early_stopping=0.15,
+        compare=0.15,
+    )
+
+    kwargs = dict(
+        split_column='group',
+        proportions=proportions,
+        filepath=split_file,
+    )
+
+    split_datasets1 = dataset.group_split(**kwargs)
+    split_datasets2 = dataset.group_split(**kwargs)
+    split_datasets3 = dataset.group_split(
+        split_column='group',
+        proportions=proportions,
+        seed=100,
+    )
+    split_datasets4 = dataset.group_split(
+        split_column='group',
+        proportions=proportions,
+        seed=100,
+    )
+    split_datasets5 = dataset.group_split(
+        split_column='group',
+        proportions=proportions,
+        seed=800,
+    )
+
+    split_file.unlink()
+
+    assert split_datasets1 == split_datasets2
+    assert split_datasets1 != split_datasets3
+    assert split_datasets3 == split_datasets4
+    assert split_datasets3 != split_datasets5

datastream/datastream.py

Lines changed: 1 addition & 1 deletion (old and new lines render identically, so this appears to be a whitespace-only fix to the docstring delimiter)

@@ -118,7 +118,7 @@ def zip(datastreams: List[Datastream]) -> Datastream[Tuple]:
     def map(
         self: Datastream[T], function: Callable[[T], R]
     ) -> Datastream[R]:
-        '''
+        '''
         Creates a new Datastream with a new mapped dataset. See
         :func:`Dataset.map` for details.
         '''

datastream/tools/__init__.py

Lines changed: 3 additions & 1 deletion

@@ -2,4 +2,6 @@
 from datastream.tools.starcompose import starcompose
 from datastream.tools.repeat_map_chain import repeat_map_chain
 from datastream.tools.numpy_seed import numpy_seed
-from datastream.tools.split_dataframes import split_dataframes
+from datastream.tools.split_dataframes import (
+    split_dataframes, group_split_dataframes
+)
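
numpy_seed, re-exported here next to the new group_split_dataframes, is what makes the seed argument of Dataset.group_split reproducible: tools.numpy_seed(seed)(split_dataframes) wraps the split function so numpy's RNG is seeded for that call. A hedged sketch of such a decorator (the actual datastream.tools.numpy_seed may differ, for example in whether it restores the previous RNG state):

import functools
import numpy as np

def numpy_seed(seed):
    def decorator(function):
        @functools.wraps(function)
        def wrapper(*args, **kwargs):
            state = np.random.get_state()  # save global RNG state
            np.random.seed(seed)
            try:
                return function(*args, **kwargs)
            finally:
                np.random.set_state(state)  # restore afterwards
        return wrapper
    return decorator

@numpy_seed(100)
def draw():
    return np.random.randn(3)

assert (draw() == draw()).all()  # same seed on every call, same numbers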

datastream/tools/split_dataframes.py

Lines changed: 41 additions & 4 deletions (two of the hunks below are whitespace-only cleanups where old and new lines render identically)
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Tuple, Union, Dict, Optional
+from typing import Tuple, Dict, Optional
 from pathlib import Path
 import json
 import numpy as np
@@ -27,7 +27,7 @@ def split_dataframes(
             'Expected sum of proportions to be 1.',
             f'Proportions were {tuple(proportions.values())}',
         ]))
-
+
     if filepath is not None and filepath.exists():
         split = json.loads(filepath.read_text())

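The filepath parameter persists the split so later runs reuse it. Judging from the read path above and the split[split_name] membership lookup later in this file, the format is a JSON object mapping each split name to its list of assigned keys; a sketch under that (inferred) assumption:

import json
from pathlib import Path

# Hypothetical file as an earlier run might have written it
# (schema inferred from this diff: split name -> list of keys).
filepath = Path('split_example.json')
filepath.write_text(json.dumps(dict(train=[0, 1, 2], test=[3])))

split = json.loads(filepath.read_text())
assert set(split) == {'train', 'test'}
assert 3 in split['test']
filepath.unlink()  # clean up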
@@ -77,12 +77,31 @@ def split_dataframes(

     return {
         split_name: (
-            dataframe[lambda df: df[key_column].isin(split[split_name])]
+            dataframe[dataframe[key_column].isin(split[split_name])]
         )
         for split_name in proportions.keys()
     }


+def group_split_dataframes(
+    dataframe: pd.DataFrame,
+    split_column: str,
+    proportions: Dict[str, float],
+    filepath: Optional[Path] = None,
+    frozen: Optional[bool] = False,
+):
+    key_dataframe = pd.DataFrame(dict(key=dataframe[split_column].unique()))
+    splits = split_dataframes(
+        key_dataframe, 'key', proportions, filepath=filepath, frozen=frozen
+    )
+    return {
+        split_name: (
+            dataframe[dataframe[split_column].isin(split['key'])]
+        )
+        for split_name, split in splits.items()
+    }
+
+
 def stratas(dataframe, stratify_column):
     return [
         dataframe[lambda df: df[stratify_column] == strata_value]
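
group_split_dataframes reduces the group-wise problem to the already-tested key-wise one: collect the unique values of split_column into a one-column key frame, split that with split_dataframes, then select each part's rows by membership. The same two-step idea in plain pandas, stripped of the filepath/frozen bookkeeping (column name and proportions are illustrative):

import numpy as np
import pandas as pd

dataframe = pd.DataFrame(dict(
    group=np.arange(20) // 4,  # 5 groups of 4 rows
    value=np.random.randn(20),
))

# Step 1: split the unique group keys.
keys = dataframe['group'].unique()
np.random.shuffle(keys)
n_train = int(round(0.8 * len(keys)))
split = dict(train=keys[:n_train], test=keys[n_train:])

# Step 2: select each part's rows by key membership.
parts = {
    name: dataframe[dataframe['group'].isin(members)]
    for name, members in split.items()
}
assert set(parts['train']['group']).isdisjoint(parts['test']['group'])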
@@ -136,7 +155,7 @@ def n_target_split(keys, proportion):
 def selected(k, unassigned):
     return np.random.choice(
         unassigned, size=k, replace=False
-    ).tolist()
+    ).tolist()


 def mock_dataframe():
@@ -166,6 +185,24 @@ def test_standard():
     assert tuple(map(len, split_dataframes_.values())) == (80, 10, 10)


+def test_group_split_dataframe():
+    dataframe = mock_dataframe().assign(group=lambda df: df['index'] // 4)
+    split_dataframes_ = group_split_dataframes(
+        dataframe,
+        split_column='group',
+        proportions=dict(
+            train=0.8,
+            compare=0.2,
+        ),
+    )
+    group_overlap = (
+        set(split_dataframes_['train'].group)
+        .intersection(split_dataframes_['compare'].group)
+    )
+    assert len(group_overlap) == 0
+    assert tuple(map(len, split_dataframes_.values())) == (80, 20)
+
+
 def test_validate_proportions():
     from pytest import raises

171208

0 commit comments