Merged
27 commits
- 2ba851e: Core Changed to get normal behavior in pipeline (Mar 31, 2025)
- bc285a1: Transformation changed (Mar 31, 2025)
- 502603c: anomalies.py changed (Mar 31, 2025)
- 0769e90: Hugginface.py changed : no restrictions token, and also has normal as… (Mar 31, 2025)
- 2929f8b: Timeseries preprocessing.py (Mar 31, 2025)
- 785f96d: jsons files added for primitives (Mar 31, 2025)
- 16bb0a5: jsons files added for primitives (Mar 31, 2025)
- 4ffaffa: pipelines 0shot and 1shot added (Mar 31, 2025)
- 8f8fc07: add boolean for restrict_tokens in HF (Mar 31, 2025)
- bd334c9: good messages.json for prompt (Apr 1, 2025)
- dbf8ed1: Added load_normal in sigllm.data (Apr 1, 2025)
- 6f08214: Fixed load_normal in sigllm.data (Apr 1, 2025)
- fbedec1: Fixed lint format (Apr 1, 2025)
- fa98d60: Fixed lint format Ruff (Apr 1, 2025)
- 8ea8f97: Fixed from review Sarah (Apr 1, 2025)
- 293f1ca: Fixed lint format after working on Sarah's reviews (Apr 1, 2025)
- 8b6dd6e: Dataset prompter parameters (Apr 1, 2025)
- 3689912: .jons removed from input names in 1_shot pipeline.json (Apr 2, 2025)
- 42efea0: .jons removed from input names in 1_shot pipeline.json (Apr 2, 2025)
- 5d99162: fix PR issues & add unittests (sarahmish, Apr 16, 2025)
- 49e67d8: add unittests for parse_anomaly_response (sarahmish, Apr 17, 2025)
- 11ff33a: remove unused functions (sarahmish, Apr 17, 2025)
- a2e28f3: add new functionality tests (sarahmish, Apr 17, 2025)
- f293d84: update ubuntu image (sarahmish, Apr 17, 2025)
- f3f7b4c: change normal->single (sarahmish, Apr 17, 2025)
- 540ea92: fix lint (sarahmish, Apr 17, 2025)
- 5876feb: swap normal -> single (sarahmish, Apr 17, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -11,7 +11,7 @@ on:

 jobs:
   lint:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python 3.9
17 changes: 15 additions & 2 deletions sigllm/core.py
@@ -100,8 +100,14 @@ def __repr__(self):

         return ('SigLLM:\n{}\nhyperparameters:\n{}\n').format(pipeline, hyperparameters)

-    def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> pd.DataFrame:
-        """Detect anomalies in the given data..
+    def detect(
+        self,
+        data: pd.DataFrame,
+        normal: pd.DataFrame = None,
+        visualization: bool = False,
+        **kwargs,
+    ) -> pd.DataFrame:
+        """Detect anomalies in the given data.

         If ``visualization=True``, also return the visualization
         outputs from the MLPipeline object.
@@ -110,6 +116,10 @@ def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> p
             data (DataFrame):
                 Input data, passed as a ``pandas.DataFrame`` containing
                 exactly two columns: timestamp and value.
+            normal (DataFrame, optional):
+                Normal reference data for one-shot prompting, passed as a
+                ``pandas.DataFrame`` containing exactly two columns: timestamp
+                and value. If ``None``, zero-shot prompting is used. Defaults
+                to ``None``.
             visualization (bool):
                 If ``True``, also capture the ``visualization`` named
                 output from the ``MLPipeline`` and return it as a second
@@ -125,6 +135,9 @@ def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> p
         if not self._fitted:
             self._mlpipeline = self._get_mlpipeline()

+        if normal is not None:
+            kwargs['normal'] = normal
+
         result = self._detect(self._mlpipeline.fit, data, visualization, **kwargs)
         self._fitted = True

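The new `normal` argument is forwarded to the pipeline through `kwargs['normal']`, so callers opt into one-shot prompting simply by passing a reference signal. A minimal usage sketch (the CSV paths are hypothetical; the `mistral_prompter` pipeline name comes from this PR):

```python
import pandas as pd

from sigllm import SigLLM

# Hypothetical paths; any CSVs with `timestamp` and `value` columns work.
data = pd.read_csv('signal.csv')
normal = pd.read_csv('signal_normal.csv')

sigllm = SigLLM(pipeline='mistral_prompter')

# Zero-shot prompting: no reference signal is passed.
anomalies = sigllm.detect(data)

# One-shot prompting: the normal reference signal is forwarded to the
# pipeline as kwargs['normal'].
anomalies = sigllm.detect(data, normal=normal)
```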
125 changes: 125 additions & 0 deletions sigllm/data.py
@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-

"""Data Management module.

This module contains functions that allow downloading demo data from Amazon S3,
as well as loading and working with other data stored locally.
"""

import logging
import os

import pandas as pd
from orion.data import format_csv, load_csv

LOGGER = logging.getLogger(__name__)

DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
BUCKET = 'sintel-sigllm'
S3_URL = 'https://{}.s3.amazonaws.com/{}'


def download_normal(name, data_path=DATA_PATH):
    """Load the CSV with the given name from S3.

    If the CSV has never been loaded before, it is downloaded from the
    [sintel-sigllm bucket](https://sintel-sigllm.s3.amazonaws.com), or from
    the S3 bucket specified following the `s3://{bucket}/path/to/the.csv`
    format, cached inside the `data` folder within the `sigllm` package
    directory, and then returned.

    If it has been downloaded and cached before, it is loaded directly from
    the `sigllm/data` folder without contacting S3.

    Args:
        name (str):
            Name of the CSV to load.
        data_path (str):
            Path to store data.

    Returns:
        pandas.DataFrame:
            A pandas.DataFrame is returned containing all the data.

    Raises:
        FileNotFoundError: If the normal file doesn't exist locally and can't
            be downloaded from S3.
    """
    try:
        url = None
        if name.startswith('s3://'):
            parts = name[5:].split('/', 1)
            bucket = parts[0]
            path = parts[1]
            url = S3_URL.format(bucket, path)
            filename = os.path.join(data_path, path.split('/')[-1])
        else:
            filename = os.path.join(data_path, name + '_normal.csv')
            data_path = os.path.join(data_path, os.path.dirname(name))

        if os.path.exists(filename):
            data = pd.read_csv(filename)
            return data

        url = url or S3_URL.format(BUCKET, '{}_normal.csv'.format(name))
        LOGGER.info('Downloading CSV %s from %s', name, url)

        try:
            data = pd.read_csv(url)
            os.makedirs(data_path, exist_ok=True)
            data.to_csv(filename, index=False)
            return data
        except Exception:
            error_msg = (
                f'Could not download or find normal file for {name}. '
                f'Please ensure the file exists at {filename} or can be '
                f'downloaded from {url}'
            )
            LOGGER.error(error_msg)
            raise FileNotFoundError(error_msg)

    except Exception as e:
        error_msg = f'Error processing normal file for {name}: {str(e)}'
        LOGGER.error(error_msg)
        raise FileNotFoundError(error_msg)


def load_normal(name, timestamp_column=None, value_column=None, start=None, end=None):
    """Load normal data from a file, downloading it first if needed.

    Args:
        name (str):
            Name or path of the normal data.
        timestamp_column (str or int):
            Column index or name for timestamp.
        value_column (str or int):
            Column index or name for values.
        start (int or timestamp):
            Optional. If specified, this will be the start of the sub-sequence.
        end (int or timestamp):
            Optional. If specified, this will be the end of the sub-sequence.

    Returns:
        pandas.DataFrame:
            Loaded sub-sequence with `timestamp` and `value` columns.
    """
    if os.path.isfile(name):
        data = load_csv(name, timestamp_column, value_column)
    else:
        data = download_normal(name)

    data = format_csv(data)

    # slice the sub-sequence when start or end is specified, either
    # positionally (if the values are index positions) or by masking
    # the timestamp column
    if start or end:
        if any(data.index.isin([start, end])):
            data = data.iloc[start:end]
        else:
            mask = True
            if start is not None:
                mask &= data[timestamp_column] >= start
            if end is not None:
                mask &= data[timestamp_column] <= end
            data = data[mask]

    return data
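A short sketch of how the two helpers might be called. The signal name `S-1`, the bucket `my-bucket`, and the epoch timestamps are hypothetical; `start`/`end` fall back to masking the timestamp column when they are not positions in the index:

```python
from sigllm.data import download_normal, load_normal

# By name: resolves to https://sintel-sigllm.s3.amazonaws.com/S-1_normal.csv
# and caches the file inside the sigllm/data folder.
normal = download_normal('S-1')

# By explicit S3 URI, following the s3://{bucket}/path/to/the.csv format.
normal = download_normal('s3://my-bucket/signals/S-1_normal.csv')

# Load a sub-sequence, masking on the timestamp column.
window = load_normal(
    'S-1',
    timestamp_column='timestamp',
    start=1222819200,
    end=1222905600,
)
```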
3 changes: 2 additions & 1 deletion sigllm/pipelines/prompter/mistral_prompter.json
@@ -31,7 +31,8 @@
     },
     "sigllm.primitives.prompting.huggingface.HF#1": {
         "name": "mistralai/Mistral-7B-Instruct-v0.2",
-        "samples": 10
+        "samples": 10,
+        "restrict_tokens": true
     },
     "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
         "alpha": 0.4
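Because `restrict_tokens` is exposed as an ordinary hyperparameter of the HF primitive, it can be overridden without editing the pipeline JSON. A sketch, assuming `SigLLM` accepts an Orion-style `hyperparameters` argument:

```python
from sigllm import SigLLM

# Hypothetical override: disable token restriction for the HF primitive.
sigllm = SigLLM(
    pipeline='mistral_prompter',
    hyperparameters={
        'sigllm.primitives.prompting.huggingface.HF#1': {
            'restrict_tokens': False,
        },
    },
)
```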
74 changes: 74 additions & 0 deletions sigllm/pipelines/prompter/mistral_prompter_0shot.json
@@ -0,0 +1,74 @@
{
    "primitives": [
        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
        "sklearn.impute.SimpleImputer",
        "sigllm.primitives.transformation.Float2Scalar",
        "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences",
        "sigllm.primitives.transformation.format_as_string",

        "sigllm.primitives.prompting.huggingface.HF",
        "sigllm.primitives.transformation.parse_anomaly_response",
        "sigllm.primitives.transformation.format_as_integer",
        "sigllm.primitives.prompting.anomalies.val2idx",
        "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",
        "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences",
        "sigllm.primitives.prompting.anomalies.format_anomalies"
    ],
    "init_params": {
        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
            "time_column": "timestamp",
            "interval": 21600,
            "method": "mean"
        },
        "sigllm.primitives.transformation.Float2Scalar#1": {
            "decimal": 2,
            "rescale": true
        },
        "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": {
            "window_size": 100,
            "step_size": 40
        },
        "sigllm.primitives.transformation.format_as_string#1": {
            "space": false
        },
        "sigllm.primitives.prompting.huggingface.HF#1": {
            "name": "mistralai/Mistral-7B-Instruct-v0.2",
            "samples": 1,
            "temp": 0.01
        },
        "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
            "alpha": 0.4
        },
        "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": {
            "beta": 0.5
        }
    },
    "input_names": {
        "sigllm.primitives.prompting.huggingface.HF#1": {
            "X": "X_str"
        },
        "sigllm.primitives.transformation.parse_anomaly_response#1": {
            "X": "y_hat"
        },
        "sigllm.primitives.transformation.format_as_integer#1": {
            "X": "y_parsed"
        }
    },
    "output_names": {
        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
            "index": "timestamp"
        },
        "sigllm.primitives.transformation.format_as_string#1": {
            "X": "X_str"
        },
        "sigllm.primitives.prompting.huggingface.HF#1": {
            "y": "y_hat"
        },
        "sigllm.primitives.transformation.parse_anomaly_response#1": {
            "X": "y_parsed"
        },
        "sigllm.primitives.transformation.format_as_integer#1": {
            "X": "y"
        }
    }
}
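The 0-shot pipeline draws a single sample at near-zero temperature and takes no `normal` input, so running it only requires selecting it by name. A sketch, assuming pipelines are addressable by the name of their JSON file:

```python
import pandas as pd

from sigllm import SigLLM

data = pd.read_csv('signal.csv')  # hypothetical path

sigllm = SigLLM(pipeline='mistral_prompter_0shot')
anomalies = sigllm.detect(data)
```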