This repository was archived by the owner on Aug 25, 2024. It is now read-only.

Commit 9594c82

operation: nlp: example: Add sklearn ops example
1 parent 773d8b6 commit 9594c82

File tree

17 files changed (+435, -22 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
+- Example usage of sklearn operations
 - Example Flower17 species image classification
 - Configloading ablity from CLI using "@" before filename
 - Docstrings and doctestable example for DataFlowSource

dffml/df/memory.py

Lines changed: 7 additions & 1 deletion
@@ -1288,6 +1288,10 @@ async def seed_inputs(
         ctx: Optional[BaseInputSetContext] = None,
         input_set: Optional[Union[List[Input], BaseInputSet]] = None,
     ) -> BaseInputSetContext:
+        if ctx is not None and not isinstance(ctx, BaseInputSetContext):
+            raise TypeError(
+                f"ctx {ctx} is of type {type(ctx)}, should be BaseInputSetContext"
+            )
         self.logger.debug("Seeding dataflow with input_set: %s", input_set)
         if input_set is None:
             # Create a list if extra inputs were not given
@@ -1378,7 +1382,9 @@ async def run(
             await self.forward_inputs_to_subflow(input_set)
             ctxs.append(
                 await self.seed_inputs(
-                    ctx=StringInputSetContext(ctx_string),
+                    ctx=StringInputSetContext(ctx_string)
+                    if isinstance(ctx_string, str)
+                    else ctx_string,
                     input_set=input_set,
                 )
             )
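Taken together, these two hunks mean that seed_inputs now rejects anything that is not a BaseInputSetContext, while run only wraps plain strings in StringInputSetContext and passes pre-built contexts (such as the RecordInputSetContext added below in dffml/source/df.py) through unchanged. A minimal standalone sketch of that normalize-then-validate pattern, using stand-in classes rather than the dffml implementations:

class BaseInputSetContext:
    """Stand-in for dffml's BaseInputSetContext."""


class StringInputSetContext(BaseInputSetContext):
    """Stand-in: wraps a plain string key as a context."""

    def __init__(self, key: str):
        self.key = key


def normalize_ctx(ctx):
    # Wrap bare strings, pass already-built contexts through untouched (run)
    ctx = StringInputSetContext(ctx) if isinstance(ctx, str) else ctx
    # Reject anything else, mirroring the new TypeError in seed_inputs
    if ctx is not None and not isinstance(ctx, BaseInputSetContext):
        raise TypeError(
            f"ctx {ctx} is of type {type(ctx)}, should be BaseInputSetContext"
        )
    return ctx


print(normalize_ctx("my_context_key").key)  # bare string gets wrapped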

dffml/source/df.py

Lines changed: 93 additions & 11 deletions
@@ -1,10 +1,15 @@
 import pathlib
-from typing import Type, AsyncIterator
+from typing import Type, AsyncIterator, Dict, Any
 
-from dffml.base import config, BaseConfig
+from dffml.base import config, BaseConfig, field
 from dffml.configloader.configloader import BaseConfigLoader
-from dffml.df.types import Definition, DataFlow, Input
-from dffml.df.base import BaseOrchestrator
+from dffml.df.types import Definition, DataFlow, Input, Operation
+from dffml.df.base import BaseInputSetContext, BaseContextHandle
+from dffml.df.base import (
+    BaseOrchestrator,
+    OperationImplementationContext,
+    OperationImplementation,
+)
 from dffml.feature import Features
 from dffml.record import Record
 from dffml.source.source import BaseSource, BaseSourceContext
@@ -17,17 +22,80 @@ class DataFlowSourceConfig:
     source: BaseSource
     dataflow: DataFlow
     features: Features
+    length: str = field(
+        "Definition name to add as source length", default=None
+    )
+    all_for_single: bool = False
     orchestrator: BaseOrchestrator = MemoryOrchestrator.withconfig({})
 
 
+class RecordContextHandle(BaseContextHandle):
+    def as_string(self) -> str:
+        return self.ctx.record.key
+
+
+class RecordInputSetContext(BaseInputSetContext):
+    def __init__(self, record: Record):
+        self.record = record
+
+    async def handle(self) -> BaseContextHandle:
+        return RecordContextHandle(self)
+
+    def __repr__(self):
+        return self.as_string
+
+    def __str__(self):
+        return repr(self)
+
+
 class DataFlowSourceContext(BaseSourceContext):
     async def update(self, record: Record):
         await self.sctx.update(record)
 
-    async def records(self) -> AsyncIterator[Record]:
-        async for record in self.sctx.records():
+    # TODO Implement this method. We forgot to implement it when we initially
+    # added the DataFlowSourceContext
+    async def record(self, key: str) -> AsyncIterator[Record]:
+        if self.parent.config.all_for_single:
+            async for ctx, result in self.records():
+                if (await ctx.handle()).as_string() == key:
+                    yield record
+        else:
             async for ctx, result in self.octx.run(
-                [
+                {
+                    RecordInputSetContext(record): [
+                        Input(
+                            value=record.feature(feature.name),
+                            definition=Definition(
+                                name=feature.name,
+                                primitive=str(feature.dtype()),
+                            ),
+                        )
+                        for feature in self.parent.config.features
+                    ]
+                    + (
+                        []
+                        if not self.parent.config.length
+                        else [
+                            Input(
+                                value=await self.sctx.length(),
+                                definition=Definition(
+                                    name=self.parent.config.length,
+                                    primitive="int",
+                                ),
+                            )
+                        ]
+                    )
+                    async for record in [self.sctx.record(key)]
+                }
+            ):
+                if result:
+                    ctx.record.evaluated(result)
+                yield ctx.record
+
+    async def records(self) -> AsyncIterator[Record]:
+        async for ctx, result in self.octx.run(
+            {
+                RecordInputSetContext(record): [
                     Input(
                         value=record.feature(feature.name),
                         definition=Definition(
@@ -36,10 +104,24 @@ async def records(self) -> AsyncIterator[Record]:
                     )
                     for feature in self.parent.config.features
                 ]
-            ):
-                if result:
-                    record.evaluated(result)
-                yield record
+                + (
+                    []
+                    if not self.parent.config.length
+                    else [
+                        Input(
+                            value=await self.sctx.length(),
+                            definition=Definition(
+                                name=self.parent.config.length, primitive="int"
+                            ),
+                        )
+                    ]
+                )
+                async for record in self.sctx.records()
+            }
+        ):
+            if result:
+                ctx.record.evaluated(result)
+            yield ctx.record
 
     async def __aenter__(self) -> "DataFlowSourceContext":
         self.sctx = await self.parent.source().__aenter__()
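The practical effect of the new length config field is easiest to see in isolation: for every record the source builds one Input per feature and, when length is set, appends one extra integer Input carrying the source length under that definition name. Below is a minimal sketch of that input-assembly logic using plain dictionaries instead of dffml's Input/Definition objects; the build_inputs helper is purely illustrative and not part of this commit.

from typing import Any, Dict, List, Optional


def build_inputs(
    record_features: Dict[str, Any],
    source_length: Optional[int] = None,
    length_definition: Optional[str] = None,
) -> List[Dict[str, Any]]:
    # One input per feature, mirroring the per-record list comprehension above
    inputs = [
        {"value": value, "definition": name}
        for name, value in record_features.items()
    ]
    # When the config's length definition name is set, append the number of
    # records in the source as an extra integer input
    if length_definition is not None and source_length is not None:
        inputs.append({"value": source_length, "definition": length_definition})
    return inputs


# Example: one feature plus the source length under "source_length"
print(build_inputs({"sentence": "Dogs are evil"}, 4, "source_length"))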

docs/tutorials/dataflows/nlp.rst

Lines changed: 122 additions & 2 deletions
@@ -1,7 +1,11 @@
 Using NLP Operations
 ====================
 
-This example will show you how to use DFFML operations to clean text data and train a model using DFFML cli.
+These examples will show you how to use DFFML operations to clean text data and train a Tensorflow DNNClassifier model and a Scikit Learn
+Naive Bayes Classifier model using the DFFML CLI.
+
+Preprocessing data and training DNNClassifier model
+---------------------------------------------------
 
 DFFML offers several :ref:`plugin_models`. For this example
 we will be using the tensorflow DNNClassifier model
@@ -92,4 +96,120 @@ The output is:
     | sentiment |
     +------------------------------------------------------------------------------------------------------------------------------+
     | Value: 1 | Confidence: 0.5122595429420471 |
-    +------------------------------------------------------------------------------------------------------------------------------+
+    +------------------------------------------------------------------------------------------------------------------------------+
+
+
+Preprocessing data and training Naive Bayes Classifier model
+------------------------------------------------------------
+
+Now we will see how to use a traditional ML algorithm like the Naive Bayes Classifier available in ``dffml-model-scikit`` (:ref:`plugin_model_dffml_model_scikit`) for
+classification.
+
+Create training data:
+
+.. literalinclude:: /../examples/nlp/train_data.sh
+
+But before we feed the data to the model we need to convert it to vectors of numeric values.
+Here we will use the ``tfidf_vectorizer`` operation (:ref:`plugin_operation_dffml_operations_nlp_tfidf_vectorizer`) which is a wrapper around
+sklearn `TfidfVectorizer. <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_
+
+The dataflow will be similar to the one used above but with a slight modification. We will add an extra operation,
+``collect_output`` (:ref:`plugin_operation_dffml_operations_nlp_collect_output`), which will collect all the records before
+forwarding them to the next operation. This ensures that `tfidf_vectorizer` receives a list of sentences rather than a single
+sentence at a time.
+The matrix returned by `tfidf_vectorizer` will be passed to ``extract_array_from_matrix`` (:ref:`plugin_operation_dffml_operations_nlp_extract_array_from_matrix`)
+which will return the array corresponding to each sentence.
+
+So, let's modify the dataflow to use our new operations.
+
+.. literalinclude:: /../examples/nlp/sklearn/create_dataflow.sh
+
+To visualize the dataflow run:
+
+.. literalinclude:: /../examples/nlp/sklearn/dataflow_diagram.sh
+
+We can now use this dataflow to preprocess the data and make it ready to be fed into the model:
+
+.. literalinclude:: /../examples/nlp/sklearn/train.sh
+
+Assess accuracy:
+
+.. literalinclude:: /../examples/nlp/sklearn/accuracy.sh
+
+The output is:
+
+.. code-block:: console
+
+    1.0
+
+Create test data:
+
+.. literalinclude:: /../examples/nlp/sklearn/test_data.sh
+
+Make prediction on test data:
+
+.. literalinclude:: /../examples/nlp/sklearn/predict.sh
+
+The output is:
+
+.. code-block:: console
+
+    Key: 1
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | Those were good days |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.0, 0.0, 0.7071067811865476, 0 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 1 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+
+
+    Key: 2
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | My cat plays all day |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.5773502691896257, 0.577350269 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 0 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+
+
+    Key: 0
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | Such a pleasant morning |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 1 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+
+
+    Key: 3
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | Dogs are evil |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.0, 0.0, 0.0, 0.70710678118654 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 0 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+

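For intuition about what the tfidf_vectorizer and extract_array_from_matrix operations do under the hood, here is a small standalone scikit-learn sketch (plain sklearn calls, not the DFFML operations themselves). It vectorizes the four sentences from the test data as one batch, which yields one row of TF-IDF weights per sentence, and per-sentence rows like these are what the Gaussian Naive Bayes model (scikitgnb) is trained and evaluated on; the sentiment labels below are assumed for illustration only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# Toy sentences matching test_data.csv; labels are assumed for illustration
sentences = [
    "Such a pleasant morning",
    "Those were good days",
    "My cat plays all day",
    "Dogs are evil",
]
sentiment = [1, 1, 0, 0]

# Like tfidf_vectorizer: the whole list of sentences becomes a TF-IDF matrix
matrix = TfidfVectorizer().fit_transform(sentences)

# Like extract_array_from_matrix: each sentence gets its own row of floats
rows = matrix.toarray()
print(rows.shape)  # (4, vocabulary_size)

# Like the scikitgnb model: Gaussian Naive Bayes on the per-sentence vectors
model = GaussianNB()
model.fit(rows, sentiment)
print(model.predict(rows[:1]))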
examples/nlp/create_dataflow.sh

Lines changed: 2 additions & 0 deletions
@@ -5,5 +5,7 @@ dffml dataflow create get_single remove_stopwords get_embedding \
   '[{"seed": ["spacy_model_name_def"]}]'=get_embedding.inputs.spacy_model \
   '[{"seed": ["pad_token_def"]}]'=get_embedding.inputs.pad_token \
   '[{"seed": ["max_len_def"]}]'=get_embedding.inputs.max_len \
+  '[{"remove_stopwords": "result"}]'=get_embedding.inputs.text \
   '[{"remove_stopwords": "result"}]'=get_embedding.inputs.text |
+
   tee nlp_ops_dataflow.json

examples/nlp/sklearn/accuracy.sh

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+dffml accuracy \
+  -model scikitgnb \
+  -model-features extract_array_from_matrix.outputs.result:float:1 \
+  -model-predict sentiment:int:1 \
+  -model-directory tempdir \
+  -sources text=df \
+  -source-text-dataflow nlp_ops_dataflow.json \
+  -source-text-features sentence:str:1 \
+  -source-text-source csv \
+  -source-text-source-filename train_data.csv \
+  -log debug

examples/nlp/sklearn/create_dataflow.sh

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+dffml dataflow create get_single remove_stopwords collect_output extract_array_from_matrix tfidf_vectorizer \
+  -inputs '["extract_array_from_matrix.outputs.result"]'=get_single_spec 4=source_length \
+  -flow \
+  '[{"seed": ["sentence"]}]'=remove_stopwords.inputs.text \
+  '[{"seed": ["source_length"]}]'=collect_output.inputs.length \
+  '[{"remove_stopwords": "result"}]'=collect_output.inputs.sentence \
+  '[{"collect_output": "all"}]'=tfidf_vectorizer.inputs.text \
+  '[{"remove_stopwords": "result"}]'=extract_array_from_matrix.inputs.single_text_example \
+  '[{"collect_output": "all"}]'=extract_array_from_matrix.inputs.collected_text \
+  '[{"tfidf_vectorizer": "result"}]'=extract_array_from_matrix.inputs.input_matrix |
+  tee nlp_ops_dataflow.json
+
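In this dataflow the 4=source_length input tells collect_output how many sentences to gather before it hands the whole batch to tfidf_vectorizer (here 4, presumably matching the number of rows in the data being processed). A rough standalone sketch of that collect-then-forward behaviour follows; it illustrates the idea only and is not the actual operation's implementation.

from typing import List, Optional


class CollectOutput:
    """Toy stand-in: buffer sentences until `length` of them have arrived."""

    def __init__(self, length: int):
        self.length = length
        self.buffer: List[str] = []

    def add(self, sentence: str) -> Optional[List[str]]:
        # One sentence arrives per record; only emit once the batch is full
        self.buffer.append(sentence)
        if len(self.buffer) == self.length:
            return self.buffer  # the whole batch goes on to tfidf_vectorizer
        return None


collector = CollectOutput(length=4)
for s in [
    "Such a pleasant morning",
    "Those were good days",
    "My cat plays all day",
    "Dogs are evil",
]:
    batch = collector.add(s)
    if batch is not None:
        print(len(batch), "sentences ready for vectorization")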

examples/nlp/sklearn/dataflow_diagram.sh

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+dffml dataflow diagram -stage processing -- nlp_ops_dataflow.json

examples/nlp/sklearn/predict.sh

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+dffml predict all \
+  -model scikitgnb \
+  -model-features extract_array_from_matrix.outputs.result:float:1 \
+  -model-predict sentiment:int:1 \
+  -model-directory tempdir \
+  -sources text=df \
+  -source-text-dataflow nlp_ops_dataflow.json \
+  -source-text-features sentence:str:1 \
+  -source-text-source csv \
+  -source-text-source-filename test_data.csv \
+  -pretty

examples/nlp/sklearn/test_data.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+cat > test_data.csv << EOF
+sentence
+Such a pleasant morning
+Those were good days
+My cat plays all day
+Dogs are evil
+EOF
