Merged
27 commits
- 2ba851e: Core Changed to get normal behavior in pipeline (Mar 31, 2025)
- bc285a1: Transformation changed (Mar 31, 2025)
- 502603c: anomalies.py changed (Mar 31, 2025)
- 0769e90: Hugginface.py changed : no restrictions token, and also has normal as… (Mar 31, 2025)
- 2929f8b: Timeseries preprocessing.py (Mar 31, 2025)
- 785f96d: jsons files added for primitives (Mar 31, 2025)
- 16bb0a5: jsons files added for primitives (Mar 31, 2025)
- 4ffaffa: pipelines 0shot and 1shot added (Mar 31, 2025)
- 8f8fc07: add boolean for restrict_tokens in HF (Mar 31, 2025)
- bd334c9: good messages.json for prompt (Apr 1, 2025)
- dbf8ed1: Added load_normal in sigllm.data (Apr 1, 2025)
- 6f08214: Fixed load_normal in sigllm.data (Apr 1, 2025)
- fbedec1: Fixed lint format (Apr 1, 2025)
- fa98d60: Fixed lint format Ruff (Apr 1, 2025)
- 8ea8f97: Fixed from review Sarah (Apr 1, 2025)
- 293f1ca: Fixed lint format after working on Sarah's reviews (Apr 1, 2025)
- 8b6dd6e: Dataset prompter parameters (Apr 1, 2025)
- 3689912: .jons removed from input names in 1_shot pipeline.json (Apr 2, 2025)
- 42efea0: .jons removed from input names in 1_shot pipeline.json (Apr 2, 2025)
- 5d99162: fix PR issues & add unittests (sarahmish, Apr 16, 2025)
- 49e67d8: add unittests for parse_anomaly_response (sarahmish, Apr 17, 2025)
- 11ff33a: remove unused functions (sarahmish, Apr 17, 2025)
- a2e28f3: add new functionality tests (sarahmish, Apr 17, 2025)
- f293d84: update ubuntu image (sarahmish, Apr 17, 2025)
- f3f7b4c: change normal->single (sarahmish, Apr 17, 2025)
- 540ea92: fix lint (sarahmish, Apr 17, 2025)
- 5876feb: swap normal -> single (sarahmish, Apr 17, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -11,7 +11,7 @@ on:

 jobs:
   lint:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - name: Set up Python 3.9
17 changes: 15 additions & 2 deletions sigllm/core.py
@@ -100,8 +100,14 @@ def __repr__(self):

         return ('SigLLM:\n{}\nhyperparameters:\n{}\n').format(pipeline, hyperparameters)

-    def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> pd.DataFrame:
-        """Detect anomalies in the given data..
+    def detect(
+        self,
+        data: pd.DataFrame,
+        normal: pd.DataFrame = None,
+        visualization: bool = False,
+        **kwargs,
+    ) -> pd.DataFrame:
+        """Detect anomalies in the given data.

         If ``visualization=True``, also return the visualization
         outputs from the MLPipeline object.
@@ -110,6 +116,10 @@ def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> p
             data (DataFrame):
                 Input data, passed as a ``pandas.DataFrame`` containing
                 exactly two columns: timestamp and value.
+            normal (DataFrame, optional):
+                Normal reference data for one-shot prompting, passed as a
+                ``pandas.DataFrame`` containing exactly two columns: timestamp
+                and value. If ``None``, zero-shot prompting is used. Defaults
+                to ``None``.
             visualization (bool):
                 If ``True``, also capture the ``visualization`` named
                 output from the ``MLPipeline`` and return it as a second
@@ -125,6 +135,9 @@ def detect(self, data: pd.DataFrame, visualization: bool = False, **kwargs) -> p
         if not self._fitted:
             self._mlpipeline = self._get_mlpipeline()

+        if normal is not None:
+            kwargs['normal'] = normal
+
         result = self._detect(self._mlpipeline.fit, data, visualization, **kwargs)
         self._fitted = True

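The new `normal` argument is forwarded to the pipeline through `kwargs['normal']`, so callers opt into one-shot prompting simply by passing a reference signal. A minimal usage sketch (the CSV paths are hypothetical; the `mistral_prompter` pipeline name comes from this PR):

```python
import pandas as pd

from sigllm import SigLLM

# Hypothetical paths; any CSVs with `timestamp` and `value` columns work.
data = pd.read_csv('signal.csv')
normal = pd.read_csv('signal_normal.csv')

sigllm = SigLLM(pipeline='mistral_prompter')

# Zero-shot prompting: no reference signal is passed.
anomalies = sigllm.detect(data)

# One-shot prompting: the normal reference signal is forwarded to the
# pipeline as kwargs['normal'].
anomalies = sigllm.detect(data, normal=normal)
```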
125 changes: 125 additions & 0 deletions sigllm/data.py
@@ -0,0 +1,125 @@
# -*- coding: utf-8 -*-

"""Data Management module.

This module contains functions that allow downloading demo data from Amazon S3,
as well as loading and working with other data stored locally.
"""

import logging
import os

import pandas as pd
from orion.data import format_csv, load_csv

LOGGER = logging.getLogger(__name__)

DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
BUCKET = 'sintel-sigllm'
S3_URL = 'https://{}.s3.amazonaws.com/{}'


def download_normal(name, data_path=DATA_PATH):
    """Load the CSV with the given name from S3.

    If the CSV has never been loaded before, it is downloaded from the
    [sintel-sigllm bucket](https://sintel-sigllm.s3.amazonaws.com), or from
    the S3 bucket specified following the `s3://{bucket}/path/to/the.csv`
    format, cached inside the `data` folder within the `sigllm` package
    directory, and then returned.

    If it has been downloaded and cached before, it is loaded directly from
    the `sigllm/data` folder without contacting S3.

    Args:
        name (str):
            Name of the CSV to load.
        data_path (str):
            Path to store data.

    Returns:
        pandas.DataFrame:
            A pandas.DataFrame is returned containing all the data.

    Raises:
        FileNotFoundError: If the normal file doesn't exist locally and can't
            be downloaded from S3.
    """
    try:
        url = None
        if name.startswith('s3://'):
            parts = name[5:].split('/', 1)
            bucket = parts[0]
            path = parts[1]
            url = S3_URL.format(bucket, path)
            filename = os.path.join(data_path, path.split('/')[-1])
        else:
            filename = os.path.join(data_path, name + '_normal.csv')
            data_path = os.path.join(data_path, os.path.dirname(name))

        if os.path.exists(filename):
            data = pd.read_csv(filename)
            return data

        url = url or S3_URL.format(BUCKET, '{}_normal.csv'.format(name))
        LOGGER.info('Downloading CSV %s from %s', name, url)

        try:
            data = pd.read_csv(url)
            os.makedirs(data_path, exist_ok=True)
            data.to_csv(filename, index=False)
            return data
        except Exception:
            error_msg = (
                f'Could not download or find normal file for {name}. '
                f'Please ensure the file exists at {filename} or can be '
                f'downloaded from {url}'
            )
            LOGGER.error(error_msg)
            raise FileNotFoundError(error_msg)

    except Exception as e:
        error_msg = f'Error processing normal file for {name}: {str(e)}'
        LOGGER.error(error_msg)
        raise FileNotFoundError(error_msg)


def load_normal(name, timestamp_column=None, value_column=None, start=None, end=None):
    """Load normal data from a file, downloading it first if needed.

    Args:
        name (str):
            Name or path of the normal data.
        timestamp_column (str or int):
            Column index or name for timestamp.
        value_column (str or int):
            Column index or name for values.
        start (int or timestamp):
            Optional. If specified, this will be the start of the sub-sequence.
        end (int or timestamp):
            Optional. If specified, this will be the end of the sub-sequence.

    Returns:
        pandas.DataFrame:
            Loaded sub-sequence with `timestamp` and `value` columns.
    """
    if os.path.isfile(name):
        data = load_csv(name, timestamp_column, value_column)
    else:
        data = download_normal(name)

    data = format_csv(data)

    # slice the sub-sequence when start or end is specified, either
    # positionally (if the values are index positions) or by masking
    # the timestamp column
    if start or end:
        if any(data.index.isin([start, end])):
            data = data.iloc[start:end]
        else:
            mask = True
            if start is not None:
                mask &= data[timestamp_column] >= start
            if end is not None:
                mask &= data[timestamp_column] <= end
            data = data[mask]

    return data
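A short sketch of how the two helpers might be called. The signal name `S-1`, the bucket `my-bucket`, and the epoch timestamps are hypothetical; `start`/`end` fall back to masking the timestamp column when they are not positions in the index:

```python
from sigllm.data import download_normal, load_normal

# By name: resolves to https://sintel-sigllm.s3.amazonaws.com/S-1_normal.csv
# and caches the file inside the sigllm/data folder.
normal = download_normal('S-1')

# By explicit S3 URI, following the s3://{bucket}/path/to/the.csv format.
normal = download_normal('s3://my-bucket/signals/S-1_normal.csv')

# Load a sub-sequence, masking on the timestamp column.
window = load_normal(
    'S-1',
    timestamp_column='timestamp',
    start=1222819200,
    end=1222905600,
)
```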
3 changes: 2 additions & 1 deletion sigllm/pipelines/prompter/mistral_prompter.json
@@ -31,7 +31,8 @@
     },
     "sigllm.primitives.prompting.huggingface.HF#1": {
         "name": "mistralai/Mistral-7B-Instruct-v0.2",
-        "samples": 10
+        "samples": 10,
+        "restrict_tokens": true
     },
     "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
         "alpha": 0.4
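Because `restrict_tokens` is exposed as an ordinary hyperparameter of the HF primitive, it can be overridden without editing the pipeline JSON. A sketch, assuming `SigLLM` accepts an Orion-style `hyperparameters` argument:

```python
from sigllm import SigLLM

# Hypothetical override: disable token restriction for the HF primitive.
sigllm = SigLLM(
    pipeline='mistral_prompter',
    hyperparameters={
        'sigllm.primitives.prompting.huggingface.HF#1': {
            'restrict_tokens': False,
        },
    },
)
```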
74 changes: 74 additions & 0 deletions sigllm/pipelines/prompter/mistral_prompter_0shot.json
@@ -0,0 +1,74 @@
{
    "primitives": [
        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate",
        "sklearn.impute.SimpleImputer",
        "sigllm.primitives.transformation.Float2Scalar",
        "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences",
        "sigllm.primitives.transformation.format_as_string",

        "sigllm.primitives.prompting.huggingface.HF",
        "sigllm.primitives.transformation.parse_anomaly_response",
        "sigllm.primitives.transformation.format_as_integer",
        "sigllm.primitives.prompting.anomalies.val2idx",
        "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows",
        "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences",
        "sigllm.primitives.prompting.anomalies.format_anomalies"
    ],
    "init_params": {
        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
            "time_column": "timestamp",
            "interval": 21600,
            "method": "mean"
        },
        "sigllm.primitives.transformation.Float2Scalar#1": {
            "decimal": 2,
            "rescale": true
        },
        "sigllm.primitives.prompting.timeseries_preprocessing.rolling_window_sequences#1": {
            "window_size": 100,
            "step_size": 40
        },
        "sigllm.primitives.transformation.format_as_string#1": {
            "space": false
        },
        "sigllm.primitives.prompting.huggingface.HF#1": {
            "name": "mistralai/Mistral-7B-Instruct-v0.2",
            "samples": 1,
            "temp": 0.01
        },
        "sigllm.primitives.prompting.anomalies.find_anomalies_in_windows#1": {
            "alpha": 0.4
        },
        "sigllm.primitives.prompting.anomalies.merge_anomalous_sequences#1": {
            "beta": 0.5
        }
    },
    "input_names": {
        "sigllm.primitives.prompting.huggingface.HF#1": {
            "X": "X_str"
        },
        "sigllm.primitives.transformation.parse_anomaly_response#1": {
            "X": "y_hat"
        },
        "sigllm.primitives.transformation.format_as_integer#1": {
            "X": "y_parsed"
        }
    },
    "output_names": {
        "mlstars.custom.timeseries_preprocessing.time_segments_aggregate#1": {
            "index": "timestamp"
        },
        "sigllm.primitives.transformation.format_as_string#1": {
            "X": "X_str"
        },
        "sigllm.primitives.prompting.huggingface.HF#1": {
            "y": "y_hat"
        },
        "sigllm.primitives.transformation.parse_anomaly_response#1": {
            "X": "y_parsed"
        },
        "sigllm.primitives.transformation.format_as_integer#1": {
            "X": "y"
        }
    }
}
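The 0-shot pipeline draws a single sample at near-zero temperature and takes no `normal` input, so running it only requires selecting it by name. A sketch, assuming pipelines are addressable by the name of their JSON file:

```python
import pandas as pd

from sigllm import SigLLM

data = pd.read_csv('signal.csv')  # hypothetical path

sigllm = SigLLM(pipeline='mistral_prompter_0shot')
anomalies = sigllm.detect(data)
```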