feature: new_columns method in dataset

FelixAbrahamsson · FelixAbrahamsson · commit 0a0fdb0c5239 · 2020-10-06T13:26:52.000+02:00
diff --git a/datastream/dataset.py b/datastream/dataset.py
@@ -329,6 +329,32 @@ def group_split(
             ).items()
         }
 
+
+    def new_columns(
+        self: Dataset[T], **kwargs: Callable[[pd.DataFrame], pd.Series]
+    ) -> Dataset[T]:
+        '''
+        Append new column(s) to the :attr:`.Dataset.dataframe`. Each keyword
+        is a new column name and its function maps the dataframe to a
+        :class:`pandas.Series`. Raises ``ValueError`` for existing columns.
+
+        >>> (
+        ...     Dataset.from_dataframe(pd.DataFrame(dict(number=[1, 2, 3])))
+        ...     .new_columns(twice=lambda df: df['number'] * 2)
+        ...     .map(lambda row: row['twice'])
+        ... )[-1]
+        6
+        '''
+        # Guard first: ``DataFrame.assign`` would silently overwrite.
+        if set(kwargs) & set(self.dataframe.columns):
+            raise ValueError('Should not replace existing columns')
+        dataframe = self.dataframe.assign(**kwargs)
+        return Dataset(
+            dataframe=dataframe,
+            length=len(dataframe),
+            get_item=self.get_item,
+        )
+
     def zip_index(self: Dataset[T]) -> Dataset[Tuple[T, int]]:
         '''
         Zip the output with its index. The output of the pipeline will be
@@ -550,6 +576,18 @@ def test_subset():
     assert dataset[0]['number'] == numbers[2]
 
 
+def test_new_columns():
+    '''New columns are appended; overwriting an existing one is rejected.'''
+    from pytest import raises
+
+    dataset = Dataset.from_dataframe(pd.DataFrame(dict(key=np.arange(100))))
+    doubled = dataset.new_columns(double=lambda df: df['key'] * 2)
+    assert list(doubled.dataframe.columns) == ['key', 'double']
+    assert (doubled.dataframe['double'] == 2 * np.arange(100)).all()
+    with raises(ValueError):
+        dataset.new_columns(key=lambda df: df['key'] * 2)
+
+
 def test_concat_dataset():
     dataset = Dataset.concat([
         Dataset.from_subscriptable(list(range(5))),