Add a Quick Example (#12)

sarahmish · web-flow · commit 08644a27e9a8 · 2024-09-13T09:23:25.000-07:00
* fix primitive setup
* fix lint
* add helper functions
* reorganize folder
* fix lint
* correct import order
* Revert "correct import order". This reverts commit 6389c55.
* fix import order
* pause ubuntu/windows lint test
* add error catch + format json
* fix minor issues with dimension
* add simple example
* update notebook
* update core
diff --git a/sigllm/pipelines/detector/mistral_detector.json b/sigllm/pipelines/detector/mistral_detector.json
@@ -4,7 +4,7 @@
         "sklearn.impute.SimpleImputer",
         "sigllm.primitives.transformation.Float2Scalar",
         "sigllm.primitives.forecasting.custom.rolling_window_sequences",
-        "sigllm.primitives.transformation.format_as_string",
+	"sigllm.primitives.transformation.format_as_string",
         "sigllm.primitives.forecasting.huggingface.HF",
         "sigllm.primitives.transformation.format_as_integer",
         "sigllm.primitives.transformation.Scalar2Float",
@@ -36,7 +36,8 @@
             "steps": 5
         },
         "sigllm.primitives.transformation.format_as_integer#1": {
-            "trunc": 1
+            "trunc": 1,
+            "errors": "coerce"
         },
         "sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
             "agg": "median"
@@ -60,9 +61,9 @@
         "sigllm.primitives.postprocessing.aggregate_rolling_window#1": {
             "y": "y_hat"
         },
-	"numpy.reshape#1": {
-	    "X": "y_hat"
-	},
+        "numpy.reshape#1": {
+            "X": "y_hat"
+        },
         "orion.primitives.timeseries_anomalies.find_anomalies#1": {
             "index": "target_index"
         }
diff --git a/sigllm/primitives/forecasting/huggingface.py b/sigllm/primitives/forecasting/huggingface.py
@@ -53,8 +53,6 @@ def __init__(self, name=DEFAULT_MODEL, sep=',', steps=1, temp=1, top_p=1,
         self.raw = raw
         self.samples = samples
         self.padding = padding
-        self.max_tokens = None
-        self.input_length = None
 
         self.tokenizer = AutoTokenizer.from_pretrained(self.name, use_fast=False)
 
@@ -105,21 +103,19 @@ def forecast(self, X, **kwargs):
         """
         all_responses, all_probs = [], []
         for text in tqdm(X):
-            x = text.flatten().tolist()
             tokenized_input = self.tokenizer(
-                x,
+                [text],
                 return_tensors="pt"
             ).to("cuda")
 
-            if self.max_tokens is None or self.input_length is None:
-                self.input_length = tokenized_input['input_ids'].shape[1]
-                average_length = self.input_length / len(x[0].split(','))
-                self.max_tokens = (average_length + self.padding) * self.steps
+            input_length = tokenized_input['input_ids'].shape[1]
+            average_length = input_length / len(text.split(','))
+            max_tokens = (average_length + self.padding) * self.steps
 
             generate_ids = self.model.generate(
                 **tokenized_input,
                 do_sample=True,
-                max_new_tokens=self.max_tokens,
+                max_new_tokens=max_tokens,
                 temperature=self.temp,
                 top_p=self.top_p,
                 bad_words_ids=self.invalid_tokens,
@@ -128,7 +124,7 @@ def forecast(self, X, **kwargs):
             )
 
             responses = self.tokenizer.batch_decode(
-                generate_ids[:, self.input_length:],
+                generate_ids[:, input_length:],
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=False
             )
diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Float2Scalar.json
@@ -36,6 +36,10 @@
             {
                 "name": "minimum",
                 "type": "float"
+            },
+            {
+                "name": "decimal",
+                "type": "int"
             }
         ]
     },
diff --git a/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json b/sigllm/primitives/jsons/sigllm.primitives.transformation.Scalar2Float.json
@@ -22,6 +22,11 @@
                 "name": "minimum",
                 "type": "float",
                 "default": 0
+            },
+            {
+                "name": "decimal",
+                "type": "int",
+                "default": 2
             }
         ],
         "output": [
@@ -30,13 +35,5 @@
                 "type": "ndarray"
             }
         ]
-    },
-    "hyperparameters": {
-        "fixed": {
-            "decimal": {
-                "type": "int",
-                "default": 2
-            }
-        }
     }
 }
diff --git a/sigllm/primitives/transformation.py b/sigllm/primitives/transformation.py
@@ -27,14 +27,16 @@ def format_as_string(X, sep=',', space=False):
             A list of string representation of each row.
     """
     def _as_string(x):
-        text = sep.join(list(map(str, x)))
+        text = sep.join(list(map(str, x.flatten())))
 
         if space:
             text = ' '.join(text)
 
         return text
 
-    return np.apply_along_axis(_as_string, axis=1, arr=X)
+    results = list(map(_as_string, X))
+
+    return np.array(results)
 
 
 def _from_string_to_integer(text, sep=',', trunc=None, errors='ignore'):
@@ -147,7 +149,7 @@ def transform(self, X):
 
         values = sign * (values * 10**self.decimal).astype(int)
 
-        return values, self.minimum
+        return values, self.minimum, self.decimal
 
 
 class Scalar2Float:
@@ -160,14 +162,13 @@ class Scalar2Float:
         105, 200, 310, 483, 500, 0 -> 1.05, 2., 3.1, 4.8342, 5, 0
 
     Args:
+        minimum (float):
+            Bias to shift the data. Captured from Float2Scalar.
         decimal (int):
             Number of decimal points to keep from the float representation. Default to `2`.
     """
 
-    def __init__(self, decimal=2):
-        self.decimal = decimal
-
-    def transform(self, X, minimum=0):
-        values = X * 10**(-self.decimal)
+    def transform(self, X, minimum=0, decimal=2):
+        values = X * 10**(-decimal)
 
         return values + minimum
diff --git a/tests/primitives/test_transformation.py b/tests/primitives/test_transformation.py
@@ -230,7 +230,7 @@ def test_transform_default(self):
         print(converter)
 
         converter.fit(data)
-        output, minimum = converter.transform(data)
+        output, minimum, decimal = converter.transform(data)
 
         assert converter.decimal == 2
         assert converter.rescale is True
@@ -249,7 +249,7 @@ def test_transform_decimal_zero(self):
         ])
 
         converter.fit(data)
-        output, minimum = converter.transform(data)
+        output, minimum, decimal = converter.transform(data)
 
         assert converter.decimal == 0
         assert converter.rescale is True
@@ -268,7 +268,7 @@ def test_transform_minimum_not_zero(self):
         ])
 
         converter.fit(data)
-        output, minimum = converter.transform(data)
+        output, minimum, decimal = converter.transform(data)
 
         assert converter.decimal == 2
         assert converter.rescale is True
@@ -287,7 +287,7 @@ def test_transform_rescale_false(self):
         ])
 
         converter.fit(data)
-        output, minimum = converter.transform(data)
+        output, minimum, decimal = converter.transform(data)
 
         assert converter.decimal == 2
         assert converter.rescale is False
@@ -306,7 +306,7 @@ def test_transform_negative(self):
         ])
 
         converter.fit(data)
-        output, minimum = converter.transform(data)
+        output, minimum, decimal = converter.transform(data)
 
         assert converter.decimal == 2
         assert converter.rescale is True
@@ -325,7 +325,7 @@ def test_transform_fit_different(self):
         ])
 
         converter.fit([7, 3, 0.5])
-        output, minimum = converter.transform(data)
+        output, minimum, decimal = converter.transform(data)
 
         assert converter.decimal == 2
         assert converter.rescale is True
@@ -348,12 +348,10 @@ def test_transform_default(self):
 
         output = converter.transform(data)
 
-        assert converter.decimal == 2
-
         np.testing.assert_array_equal(output, expected)
 
     def test_transform_decimal_zero(self):
-        converter = Scalar2Float(decimal=0)
+        converter = Scalar2Float()
 
         data = np.array([
             1, 2, 3, 4, 5, 0
@@ -362,9 +360,7 @@ def test_transform_decimal_zero(self):
             1., 2., 3., 4., 5., 0.
         ])
 
-        output = converter.transform(data)
-
-        assert converter.decimal == 0
+        output = converter.transform(data, decimal=0)
 
         np.testing.assert_array_equal(output, expected)
 
@@ -380,8 +376,6 @@ def test_transform_minimum_not_zero(self):
 
         output = converter.transform(data, minimum=-1)
 
-        assert converter.decimal == 2
-
         np.testing.assert_allclose(output, expected)
 
 
@@ -400,10 +394,10 @@ def test_float2scalar_scalar2float_integration():
     ])
 
     float2scalar.fit(data)
-    transformed, minimum = float2scalar.transform(data)
+    transformed, minimum, decimal = float2scalar.transform(data)
 
-    scalar2float = Scalar2Float(decimal)
+    scalar2float = Scalar2Float()
 
-    output = scalar2float.transform(transformed, minimum)
+    output = scalar2float.transform(transformed, minimum, decimal)
 
     np.testing.assert_allclose(output, expected, rtol=1e-2)
diff --git a/tutorials/Simple Time Series Example.ipynb b/tutorials/Simple Time Series Example.ipynb
diff --git a/tutorials/pipelines/detector-pipeline.ipynb b/tutorials/pipelines/detector-pipeline.ipynb

Original file line number	Diff line number	Diff line change
`@@ -36,6 +36,10 @@`
`36`	`36`	`{`
`37`	`37`	`"name": "minimum",`
`38`	`38`	`"type": "float"`
	`39`	`+ },`
	`40`	`+ {`
	`41`	`+ "name": "decimal",`
	`42`	`+ "type": "int"`
`39`	`43`	`}`
`40`	`44`	`]`
`41`	`45`	`},`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,11 @@`
`22`	`22`	`"name": "minimum",`
`23`	`23`	`"type": "float",`
`24`	`24`	`"default": 0`
	`25`	`+ },`
	`26`	`+ {`
	`27`	`+ "name": "decimal",`
	`28`	`+ "type": "int",`
	`29`	`+ "default": 2`
`25`	`30`	`}`
`26`	`31`	`],`
`27`	`32`	`"output": [`
`@@ -30,13 +35,5 @@`
`30`	`35`	`"type": "ndarray"`
`31`	`36`	`}`
`32`	`37`	`]`
`33`		`- },`
`34`		`- "hyperparameters": {`
`35`		`- "fixed": {`
`36`		`- "decimal": {`
`37`		`- "type": "int",`
`38`		`- "default": 2`
`39`		`- }`
`40`		`- }`
`41`	`38`	`}`
`42`	`39`	`}`