This repository was archived by the owner on Aug 25, 2024. It is now read-only.

Commit 9594c82

operation: nlp: example: Add sklearn ops example
1 parent 773d8b6 commit 9594c82

File tree

17 files changed (+435, -22 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 ### Added
+- Example usage of sklearn operations
 - Example Flower17 species image classification
 - Configloading ablity from CLI using "@" before filename
 - Docstrings and doctestable example for DataFlowSource

dffml/df/memory.py

Lines changed: 7 additions & 1 deletion
@@ -1288,6 +1288,10 @@ async def seed_inputs(
         ctx: Optional[BaseInputSetContext] = None,
         input_set: Optional[Union[List[Input], BaseInputSet]] = None,
     ) -> BaseInputSetContext:
+        if ctx is not None and not isinstance(ctx, BaseInputSetContext):
+            raise TypeError(
+                f"ctx {ctx} is of type {type(ctx)}, should be BaseInputSetContext"
+            )
         self.logger.debug("Seeding dataflow with input_set: %s", input_set)
         if input_set is None:
             # Create a list if extra inputs were not given
@@ -1378,7 +1382,9 @@ async def run(
             await self.forward_inputs_to_subflow(input_set)
             ctxs.append(
                 await self.seed_inputs(
-                    ctx=StringInputSetContext(ctx_string),
+                    ctx=StringInputSetContext(ctx_string)
+                    if isinstance(ctx_string, str)
+                    else ctx_string,
                     input_set=input_set,
                 )
             )
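Taken together, these two hunks mean that seed_inputs now rejects anything that is not a BaseInputSetContext, while run only wraps plain strings in StringInputSetContext and passes pre-built contexts (such as the RecordInputSetContext added below in dffml/source/df.py) through unchanged. A minimal standalone sketch of that normalize-then-validate pattern, using stand-in classes rather than the dffml implementations:

class BaseInputSetContext:
    """Stand-in for dffml's BaseInputSetContext."""


class StringInputSetContext(BaseInputSetContext):
    """Stand-in: wraps a plain string key as a context."""

    def __init__(self, key: str):
        self.key = key


def normalize_ctx(ctx):
    # Wrap bare strings, pass already-built contexts through untouched (run)
    ctx = StringInputSetContext(ctx) if isinstance(ctx, str) else ctx
    # Reject anything else, mirroring the new TypeError in seed_inputs
    if ctx is not None and not isinstance(ctx, BaseInputSetContext):
        raise TypeError(
            f"ctx {ctx} is of type {type(ctx)}, should be BaseInputSetContext"
        )
    return ctx


print(normalize_ctx("my_context_key").key)  # bare string gets wrapped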

dffml/source/df.py

Lines changed: 93 additions & 11 deletions
@@ -1,10 +1,15 @@
 import pathlib
-from typing import Type, AsyncIterator
+from typing import Type, AsyncIterator, Dict, Any
 
-from dffml.base import config, BaseConfig
+from dffml.base import config, BaseConfig, field
 from dffml.configloader.configloader import BaseConfigLoader
-from dffml.df.types import Definition, DataFlow, Input
-from dffml.df.base import BaseOrchestrator
+from dffml.df.types import Definition, DataFlow, Input, Operation
+from dffml.df.base import BaseInputSetContext, BaseContextHandle
+from dffml.df.base import (
+    BaseOrchestrator,
+    OperationImplementationContext,
+    OperationImplementation,
+)
 from dffml.feature import Features
 from dffml.record import Record
 from dffml.source.source import BaseSource, BaseSourceContext
@@ -17,17 +22,80 @@ class DataFlowSourceConfig:
     source: BaseSource
     dataflow: DataFlow
     features: Features
+    length: str = field(
+        "Definition name to add as source length", default=None
+    )
+    all_for_single: bool = False
     orchestrator: BaseOrchestrator = MemoryOrchestrator.withconfig({})
 
 
+class RecordContextHandle(BaseContextHandle):
+    def as_string(self) -> str:
+        return self.ctx.record.key
+
+
+class RecordInputSetContext(BaseInputSetContext):
+    def __init__(self, record: Record):
+        self.record = record
+
+    async def handle(self) -> BaseContextHandle:
+        return RecordContextHandle(self)
+
+    def __repr__(self):
+        return self.as_string
+
+    def __str__(self):
+        return repr(self)
+
+
 class DataFlowSourceContext(BaseSourceContext):
     async def update(self, record: Record):
         await self.sctx.update(record)
 
-    async def records(self) -> AsyncIterator[Record]:
-        async for record in self.sctx.records():
+    # TODO Implement this method. We forgot to implement it when we initially
+    # added the DataFlowSourceContext
+    async def record(self, key: str) -> AsyncIterator[Record]:
+        if self.parent.config.all_for_single:
+            async for ctx, result in self.records():
+                if (await ctx.handle()).as_string() == key:
+                    yield record
+        else:
             async for ctx, result in self.octx.run(
-                [
+                {
+                    RecordInputSetContext(record): [
+                        Input(
+                            value=record.feature(feature.name),
+                            definition=Definition(
+                                name=feature.name,
+                                primitive=str(feature.dtype()),
+                            ),
+                        )
+                        for feature in self.parent.config.features
+                    ]
+                    + (
+                        []
+                        if not self.parent.config.length
+                        else [
+                            Input(
+                                value=await self.sctx.length(),
+                                definition=Definition(
+                                    name=self.parent.config.length,
+                                    primitive="int",
+                                ),
+                            )
+                        ]
+                    )
+                    async for record in [self.sctx.record(key)]
+                }
+            ):
+                if result:
+                    ctx.record.evaluated(result)
+                yield ctx.record
+
+    async def records(self) -> AsyncIterator[Record]:
+        async for ctx, result in self.octx.run(
+            {
+                RecordInputSetContext(record): [
                     Input(
                         value=record.feature(feature.name),
                         definition=Definition(
@@ -36,10 +104,24 @@ async def records(self) -> AsyncIterator[Record]:
                     )
                     for feature in self.parent.config.features
                 ]
-            ):
-                if result:
-                    record.evaluated(result)
-                yield record
+                + (
+                    []
+                    if not self.parent.config.length
+                    else [
+                        Input(
+                            value=await self.sctx.length(),
+                            definition=Definition(
+                                name=self.parent.config.length, primitive="int"
+                            ),
+                        )
+                    ]
+                )
+                async for record in self.sctx.records()
+            }
+        ):
+            if result:
+                ctx.record.evaluated(result)
+            yield ctx.record
 
     async def __aenter__(self) -> "DataFlowSourceContext":
         self.sctx = await self.parent.source().__aenter__()
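The practical effect of the new length config field is easiest to see in isolation: for every record the source builds one Input per feature and, when length is set, appends one extra integer Input carrying the source length under that definition name. Below is a minimal sketch of that input-assembly logic using plain dictionaries instead of dffml's Input/Definition objects; the build_inputs helper is purely illustrative and not part of this commit.

from typing import Any, Dict, List, Optional


def build_inputs(
    record_features: Dict[str, Any],
    source_length: Optional[int] = None,
    length_definition: Optional[str] = None,
) -> List[Dict[str, Any]]:
    # One input per feature, mirroring the per-record list comprehension above
    inputs = [
        {"value": value, "definition": name}
        for name, value in record_features.items()
    ]
    # When the config's length definition name is set, append the number of
    # records in the source as an extra integer input
    if length_definition is not None and source_length is not None:
        inputs.append({"value": source_length, "definition": length_definition})
    return inputs


# Example: one feature plus the source length under "source_length"
print(build_inputs({"sentence": "Dogs are evil"}, 4, "source_length"))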

docs/tutorials/dataflows/nlp.rst

Lines changed: 122 additions & 2 deletions
@@ -1,7 +1,11 @@
 Using NLP Operations
 ====================
 
-This example will show you how to use DFFML operations to clean text data and train a model using DFFML cli.
+These examples will show you how to use DFFML operations to clean text data and train a Tensorflow DNNClassifier model and a Scikit Learn
+Naive Bayes Classifier model using the DFFML CLI.
+
+Preprocessing data and training DNNClassifier model
+---------------------------------------------------
 
 DFFML offers several :ref:`plugin_models`. For this example
 we will be using the tensorflow DNNClassifier model
@@ -92,4 +96,120 @@ The output is:
     | sentiment |
     +------------------------------------------------------------------------------------------------------------------------------+
     | Value: 1 | Confidence: 0.5122595429420471 |
-    +------------------------------------------------------------------------------------------------------------------------------+
+    +------------------------------------------------------------------------------------------------------------------------------+
+
+
+Preprocessing data and training Naive Bayes Classifier model
+------------------------------------------------------------
+
+Now we will see how to use a traditional ML algorithm like the Naive Bayes Classifier available in ``dffml-model-scikit`` (:ref:`plugin_model_dffml_model_scikit`) for
+classification.
+
+Create training data:
+
+.. literalinclude:: /../examples/nlp/train_data.sh
+
+But before we feed the data to the model we need to convert it to vectors of numeric values.
+Here we will use the ``tfidf_vectorizer`` operation (:ref:`plugin_operation_dffml_operations_nlp_tfidf_vectorizer`) which is a wrapper around
+sklearn `TfidfVectorizer. <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_
+
+The dataflow will be similar to the one used above but with a slight modification. We will add an extra operation,
+``collect_output`` (:ref:`plugin_operation_dffml_operations_nlp_collect_output`), which will collect all the records before
+forwarding them to the next operation. This ensures that `tfidf_vectorizer` receives a list of sentences rather than a single
+sentence at a time.
+The matrix returned by `tfidf_vectorizer` will be passed to ``extract_array_from_matrix`` (:ref:`plugin_operation_dffml_operations_nlp_extract_array_from_matrix`)
+which will return the array corresponding to each sentence.
+
+So, let's modify the dataflow to use our new operations.
+
+.. literalinclude:: /../examples/nlp/sklearn/create_dataflow.sh
+
+To visualize the dataflow run:
+
+.. literalinclude:: /../examples/nlp/sklearn/dataflow_diagram.sh
+
+We can now use this dataflow to preprocess the data and make it ready to be fed into the model:
+
+.. literalinclude:: /../examples/nlp/sklearn/train.sh
+
+Assess accuracy:
+
+.. literalinclude:: /../examples/nlp/sklearn/accuracy.sh
+
+The output is:
+
+.. code-block:: console
+
+    1.0
+
+Create test data:
+
+.. literalinclude:: /../examples/nlp/sklearn/test_data.sh
+
+Make prediction on test data:
+
+.. literalinclude:: /../examples/nlp/sklearn/predict.sh
+
+The output is:
+
+.. code-block:: console
+
+    Key: 1
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | Those were good days |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.0, 0.0, 0.7071067811865476, 0 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 1 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+
+
+    Key: 2
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | My cat plays all day |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.5773502691896257, 0.577350269 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 0 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+
+
+    Key: 0
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | Such a pleasant morning |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 1 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+
+
+    Key: 3
+    Record Features
+    +------------------------------------------------------------------------------------------------+
+    | sentence | Dogs are evil |
+    +------------------------------------------------------------------------------------------------+
+    |extract_array_from_matri| 0.0, 0.0, 0.0, 0.70710678118654 ... (length:9) |
+    +------------------------------------------------------------------------------------------------+
+
+    Prediction
+    +------------------------------------------------------------------------------------------------+
+    | sentiment |
+    +------------------------------------------------------------------------------------------------+
+    | Value: 0 | Confidence: 1.0 |
+    +------------------------------------------------------------------------------------------------+

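For intuition about what the tfidf_vectorizer and extract_array_from_matrix operations do under the hood, here is a small standalone scikit-learn sketch (plain sklearn calls, not the DFFML operations themselves). It vectorizes the four sentences from the test data as one batch, which yields one row of TF-IDF weights per sentence, and per-sentence rows like these are what the Gaussian Naive Bayes model (scikitgnb) is trained and evaluated on; the sentiment labels below are assumed for illustration only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

# Toy sentences matching test_data.csv; labels are assumed for illustration
sentences = [
    "Such a pleasant morning",
    "Those were good days",
    "My cat plays all day",
    "Dogs are evil",
]
sentiment = [1, 1, 0, 0]

# Like tfidf_vectorizer: the whole list of sentences becomes a TF-IDF matrix
matrix = TfidfVectorizer().fit_transform(sentences)

# Like extract_array_from_matrix: each sentence gets its own row of floats
rows = matrix.toarray()
print(rows.shape)  # (4, vocabulary_size)

# Like the scikitgnb model: Gaussian Naive Bayes on the per-sentence vectors
model = GaussianNB()
model.fit(rows, sentiment)
print(model.predict(rows[:1]))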
examples/nlp/create_dataflow.sh

Lines changed: 2 additions & 0 deletions
@@ -5,5 +5,7 @@ dffml dataflow create get_single remove_stopwords get_embedding \
   '[{"seed": ["spacy_model_name_def"]}]'=get_embedding.inputs.spacy_model \
   '[{"seed": ["pad_token_def"]}]'=get_embedding.inputs.pad_token \
   '[{"seed": ["max_len_def"]}]'=get_embedding.inputs.max_len \
+  '[{"remove_stopwords": "result"}]'=get_embedding.inputs.text \
   '[{"remove_stopwords": "result"}]'=get_embedding.inputs.text |
+
   tee nlp_ops_dataflow.json

examples/nlp/sklearn/accuracy.sh

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+dffml accuracy \
+  -model scikitgnb \
+  -model-features extract_array_from_matrix.outputs.result:float:1 \
+  -model-predict sentiment:int:1 \
+  -model-directory tempdir \
+  -sources text=df \
+  -source-text-dataflow nlp_ops_dataflow.json \
+  -source-text-features sentence:str:1 \
+  -source-text-source csv \
+  -source-text-source-filename train_data.csv \
+  -log debug

examples/nlp/sklearn/create_dataflow.sh

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+dffml dataflow create get_single remove_stopwords collect_output extract_array_from_matrix tfidf_vectorizer \
+  -inputs '["extract_array_from_matrix.outputs.result"]'=get_single_spec 4=source_length \
+  -flow \
+  '[{"seed": ["sentence"]}]'=remove_stopwords.inputs.text \
+  '[{"seed": ["source_length"]}]'=collect_output.inputs.length \
+  '[{"remove_stopwords": "result"}]'=collect_output.inputs.sentence \
+  '[{"collect_output": "all"}]'=tfidf_vectorizer.inputs.text \
+  '[{"remove_stopwords": "result"}]'=extract_array_from_matrix.inputs.single_text_example \
+  '[{"collect_output": "all"}]'=extract_array_from_matrix.inputs.collected_text \
+  '[{"tfidf_vectorizer": "result"}]'=extract_array_from_matrix.inputs.input_matrix |
+  tee nlp_ops_dataflow.json
+
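In this dataflow the 4=source_length input tells collect_output how many sentences to gather before it hands the whole batch to tfidf_vectorizer (here 4, presumably matching the number of rows in the data being processed). A rough standalone sketch of that collect-then-forward behaviour follows; it illustrates the idea only and is not the actual operation's implementation.

from typing import List, Optional


class CollectOutput:
    """Toy stand-in: buffer sentences until `length` of them have arrived."""

    def __init__(self, length: int):
        self.length = length
        self.buffer: List[str] = []

    def add(self, sentence: str) -> Optional[List[str]]:
        # One sentence arrives per record; only emit once the batch is full
        self.buffer.append(sentence)
        if len(self.buffer) == self.length:
            return self.buffer  # the whole batch goes on to tfidf_vectorizer
        return None


collector = CollectOutput(length=4)
for s in [
    "Such a pleasant morning",
    "Those were good days",
    "My cat plays all day",
    "Dogs are evil",
]:
    batch = collector.add(s)
    if batch is not None:
        print(len(batch), "sentences ready for vectorization")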

examples/nlp/sklearn/dataflow_diagram.sh

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+dffml dataflow diagram -stage processing -- nlp_ops_dataflow.json

examples/nlp/sklearn/predict.sh

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+dffml predict all \
+  -model scikitgnb \
+  -model-features extract_array_from_matrix.outputs.result:float:1 \
+  -model-predict sentiment:int:1 \
+  -model-directory tempdir \
+  -sources text=df \
+  -source-text-dataflow nlp_ops_dataflow.json \
+  -source-text-features sentence:str:1 \
+  -source-text-source csv \
+  -source-text-source-filename test_data.csv \
+  -pretty

examples/nlp/sklearn/test_data.sh

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+cat > test_data.csv << EOF
+sentence
+Such a pleasant morning
+Those were good days
+My cat plays all day
+Dogs are evil
+EOF
