Skip to content

Commit 651cabb

Browse files
committed
Switch timeseries example to LSTM
1 parent 7ab5734 commit 651cabb

File tree

5 files changed

+116
-130
lines changed

5 files changed

+116
-130
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ Check the **Required parameters** column to see if you need to set any additiona
6262
| [sklearn_text_classifier](/label_studio_ml/examples/sklearn_text_classifier) | Text classification with [scikit-learn](https://scikit-learn.org/stable/) |||| None | Arbitrary |
6363
| [spacy](/label_studio_ml/examples/spacy) | NER by [SpaCy](https://spacy.io/) |||| None | Set [(see documentation)](https://spacy.io/usage/linguistic-features) |
6464
| [tesseract](/label_studio_ml/examples/tesseract) | Interactive OCR. [Details](https://github.com/tesseract-ocr/tesseract) |||| None | Set (characters) |
65-
| [timeseries_segmenter](/label_studio_ml/examples/timeseries_segmenter) | Time series segmentation using scikit-learn RandomForest |||| None | Set |
65+
| [timeseries_segmenter](/label_studio_ml/examples/timeseries_segmenter) | Time series segmentation using a small LSTM network |||| None | Set |
6666
| [watsonX](/label_studio_ml/exampels/watsonx)| LLM inference with [WatsonX](https://www.ibm.com/products/watsonx-ai) and integration with [WatsonX.data](watsonx.data)|||| None| Arbitrary|
6767
| [yolo](/label_studio_ml/examples/yolo) | All YOLO tasks are supported: [YOLO](https://docs.ultralytics.com/tasks/) |||| None | Arbitrary |
6868

label_studio_ml/examples/timeseries_segmenter/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Time Series Segmenter for Label Studio
22

33
This example demonstrates a minimal ML backend that performs time series segmentation.
4-
It trains a random forest classifier on labeled CSV data and predicts segments
4+
It trains a small LSTM neural network on labeled CSV data and predicts segments
55
for new tasks. The backend expects the labeling configuration to use
66
`<TimeSeries>` and `<TimeSeriesLabels>` tags.
77

@@ -48,14 +48,14 @@ columns.
4848

4949
Training starts automatically when annotations are created or updated. The model
5050
collects all labeled segments, extracts sensor values inside each segment and
51-
fits a random forest classifier. Model artifacts are stored in the
51+
fits an LSTM classifier. Model artifacts are stored in the
5252
`MODEL_DIR` (defaults to the current directory).
5353

5454
Steps performed by `fit()`:
5555

5656
1. Fetch all labeled tasks from Label Studio.
5757
2. Convert labeled ranges to per-row training samples.
58-
3. Fit a random forest classifier.
58+
3. Fit a small LSTM network.
5959
4. Save the trained model to disk.
6060

6161
## Prediction
@@ -82,7 +82,7 @@ flowchart TD
8282
B -- no --> C[Skip]
8383
B -- yes --> D[Load labeled tasks]
8484
D --> E[Collect per-row samples]
85-
E --> F[Fit random forest]
85+
E --> F[Fit LSTM]
8686
F --> G[Save model]
8787
```
8888

label_studio_ml/examples/timeseries_segmenter/model.py

Lines changed: 107 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -1,121 +1,115 @@
1-
"""Random forest based time series segmenter.
2-
3-
This example demonstrates a small yet functional ML backend that trains a
4-
classifier on labeled time series CSV files and predicts segments for new
5-
from sklearn.ensemble import RandomForestClassifier
6-
_model: Optional[RandomForestClassifier] = None
7-
"""Simple random forest based segmenter for time series."""
8-
9-
def _get_model(self, blank: bool = False) -> RandomForestClassifier:
10-
_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
11-
def _predict_task(self, task: Dict, model: RandomForestClassifier, params: Dict) -> Dict:
12-
import logging
13-
from typing import List, Dict, Optional, Tuple
1+
"""LSTM-based time series segmenter.
142
3+
This example implements a simple ML backend that trains a
4+
recurrent neural network on labeled time series CSV files
5+
and predicts segments for new tasks.
6+
import pickle
7+
from typing import Dict, List, Optional, Tuple
158
import pandas as pd
16-
import numpy as np
17-
import label_studio_sdk
18-
19-
from sklearn.linear_model import LogisticRegression
20-
from label_studio_ml.model import LabelStudioMLBase
21-
from label_studio_ml.response import ModelResponse
22-
23-
logger = logging.getLogger(__name__)
24-
25-
# Cached model instance to avoid reloading the pickle on each request.
26-
_model: Optional[LogisticRegression] = None
27-
28-
29-
class TimeSeriesSegmenter(LabelStudioMLBase):
30-
"""Simple logistic regression based segmenter for time series."""
31-
32-
LABEL_STUDIO_HOST = os.getenv('LABEL_STUDIO_HOST', 'http://localhost:8080')
33-
LABEL_STUDIO_API_KEY = os.getenv('LABEL_STUDIO_API_KEY')
34-
START_TRAINING_EACH_N_UPDATES = int(
35-
os.getenv('START_TRAINING_EACH_N_UPDATES', 10)
36-
)
37-
MODEL_DIR = os.getenv('MODEL_DIR', '.')
38-
39-
def setup(self):
40-
"""Initialize model metadata."""
41-
self.set('model_version', f'{self.__class__.__name__}-v0.0.1')
42-
43-
# ------------------------------------------------------------------
44-
# Utility helpers
45-
46-
def _get_model(self, blank: bool = False) -> LogisticRegression:
47-
"""Return a trained model or create a fresh one if needed."""
48-
global _model
49-
if _model is not None and not blank:
50-
return _model
51-
52-
model_path = os.path.join(self.MODEL_DIR, 'model.pkl')
53-
if not blank and os.path.exists(model_path):
54-
with open(model_path, 'rb') as f:
55-
_model = pickle.load(f)
56-
else:
57-
_model = LogisticRegression(max_iter=1000)
58-
return _model
59-
60-
def _get_labeling_params(self) -> Dict:
61-
"""Return tag names and channel information from the labeling config."""
62-
(
63-
from_name,
64-
to_name,
65-
value,
66-
) = self.label_interface.get_first_tag_occurence(
67-
'TimeSeriesLabels', 'TimeSeries'
9+
import tensorflow as tf
10+
from tensorflow.keras import layers, models
11+
_model: Optional[models.Model] = None
12+
"""Minimal LSTM-based segmenter for time series."""
13+
14+
LABEL_STUDIO_HOST = os.getenv("LABEL_STUDIO_HOST", "http://localhost:8080")
15+
LABEL_STUDIO_API_KEY = os.getenv("LABEL_STUDIO_API_KEY")
16+
START_TRAINING_EACH_N_UPDATES = int(os.getenv("START_TRAINING_EACH_N_UPDATES", 10))
17+
MODEL_DIR = os.getenv("MODEL_DIR", ".")
18+
def _build_model(self, n_channels: int, n_labels: int) -> models.Model:
19+
tf.keras.utils.set_random_seed(42)
20+
model = models.Sequential(
21+
[
22+
layers.Input(shape=(1, n_channels)),
23+
layers.LSTM(16),
24+
layers.Dense(n_labels, activation="softmax"),
25+
]
6826
)
69-
tag = self.label_interface.get_tag(from_name)
70-
labels = list(tag.labels)
71-
ts_tag = self.label_interface.get_tag(to_name)
72-
time_col = ts_tag.attr.get('timeColumn')
73-
# Parse channel names from the original XML because TimeSeries tag
74-
# does not expose its children via label-studio's interface
75-
import xml.etree.ElementTree as ET
76-
77-
root = ET.fromstring(self.label_config)
78-
ts_elem = root.find(f".//TimeSeries[@name='{to_name}']")
79-
channels = [ch.attrib['column'] for ch in ts_elem.findall('Channel')]
80-
81-
return {
82-
'from_name': from_name,
83-
'to_name': to_name,
84-
'value': value,
85-
'labels': labels,
86-
'time_col': time_col,
87-
'channels': channels,
88-
}
89-
90-
def _read_csv(self, task: Dict, path: str) -> pd.DataFrame:
91-
"""Load a CSV referenced by the task using Label Studio utilities."""
92-
csv_str = self.preload_task_data(task, path)
93-
return pd.read_csv(io.StringIO(csv_str))
94-
95-
def _predict_task(
96-
self, task: Dict, model: LogisticRegression, params: Dict
97-
) -> Dict:
98-
"""Return Label Studio-style prediction for a single task."""
99-
df = self._read_csv(task, task['data'][params['value']])
100-
101-
# Vector of sensor values per row
102-
X = df[params['channels']].values
103-
if len(X) == 0:
104-
return {}
105-
106-
# Predict label probabilities for each row
107-
probs = model.predict_proba(X)
108-
labels_idx = np.argmax(probs, axis=1)
109-
df['pred_label'] = [params['labels'][i] for i in labels_idx]
110-
df['score'] = probs[np.arange(len(probs)), labels_idx]
27+
model.compile(
28+
optimizer="adam",
29+
loss="sparse_categorical_crossentropy",
30+
metrics=["accuracy"],
31+
)
32+
return model
33+
34+
def _get_model(self, n_channels: int, n_labels: int, blank: bool = False) -> models.Model:
35+
model_path = os.path.join(self.MODEL_DIR, "model.keras")
36+
_model = models.load_model(model_path)
37+
_model = self._build_model(n_channels, n_labels)
38+
39+
"from_name": from_name,
40+
"to_name": to_name,
41+
"value": value,
42+
"labels": labels,
43+
"time_col": time_col,
44+
"channels": channels,
45+
def _predict_task(self, task: Dict, model: models.Model, params: Dict) -> Dict:
46+
X = df[params["channels"]].values.reshape(-1, 1, len(params["channels"]))
47+
probs = model.predict(X, verbose=0)
48+
df["pred_label"] = [params["labels"][i] for i in labels_idx]
49+
df["score"] = probs[np.arange(len(probs)), labels_idx]
50+
51+
segments = self._group_rows(df, params["time_col"])
52+
score = float(np.mean(seg["scores"]))
53+
results.append(
54+
{
55+
"from_name": params["from_name"],
56+
"to_name": params["to_name"],
57+
"type": "timeserieslabels",
58+
"value": {
59+
"start": seg["start"],
60+
"end": seg["end"],
61+
"instant": False,
62+
"timeserieslabels": [seg["label"]],
63+
},
64+
"score": score,
65+
}
66+
)
67+
"result": results,
68+
"score": avg_score / len(results),
69+
"model_version": self.get("model_version"),
70+
label = row["pred_label"]
71+
if current and current["label"] == label:
72+
current["end"] = row[time_col]
73+
current["scores"].append(row["score"])
74+
"label": label,
75+
"start": row[time_col],
76+
"end": row[time_col],
77+
"scores": [row["score"]],
78+
df = self._read_csv(task, task["data"][params["value"]])
79+
annotations = [a for a in task["annotations"] if a.get("result")]
80+
for r in ann["result"]:
81+
if r["from_name"] != params["from_name"]:
82+
start = r["value"]["start"]
83+
end = r["value"]["end"]
84+
label = r["value"]["timeserieslabels"][0]
85+
mask = (df[params["time_col"]] >= start) & (
86+
df[params["time_col"]] <= end
87+
seg = df.loc[mask, params["channels"]].values
88+
def _save_model(self, model: models.Model) -> None:
89+
model_path = os.path.join(self.MODEL_DIR, "model.keras")
90+
model.save(model_path)
91+
92+
def predict(self, tasks: List[Dict], context: Optional[Dict] = None, **kwargs) -> ModelResponse:
93+
model = self._get_model(len(params["channels"]), len(params["labels"]))
94+
return ModelResponse(predictions=predictions, model_version=self.get("model_version"))
95+
ls = label_studio_sdk.Client(self.LABEL_STUDIO_HOST, self.LABEL_STUDIO_API_KEY)
96+
if event not in ("ANNOTATION_CREATED", "ANNOTATION_UPDATED", "START_TRAINING"):
97+
project_id = data["annotation"]["project"]
98+
if len(tasks) % self.START_TRAINING_EACH_N_UPDATES != 0 and event != "START_TRAINING":
99+
"Skip training: %s tasks are not multiple of %s",
100+
len(tasks),
101+
self.START_TRAINING_EACH_N_UPDATES,
102+
)
103+
label2idx = {l: i for i, l in enumerate(params["labels"])}
111104
112-
segments = self._group_rows(df, params['time_col'])
105+
logger.warning("No data collected for training")
113106
114-
results = []
115-
avg_score = 0
116-
for seg in segments:
117-
score = float(np.mean(seg['scores']))
118-
avg_score += score
107+
model = self._get_model(len(params["channels"]), len(params["labels"]), blank=True)
108+
X_arr = np.array(X).reshape(-1, 1, len(params["channels"]))
109+
y_arr = np.array(y)
110+
model.fit(X_arr, y_arr, epochs=10, verbose=0)
111+
_model = None
112+
self._get_model(len(params["channels"]), len(params["labels"]))
119113
results.append(
120114
{
121115
'from_name': params['from_name'],
Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
scikit-learn
2-
pillow~=10.3
3-
41
pandas
5-
2+
pillow~=10.3
3+
tensorflow-cpu==2.17.0

label_studio_ml/examples/timeseries_segmenter/tests/test_segmenter.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@
55

66
import pytest
77

8-
# Skip tests if scikit-learn isn't available
9-
pytest.importorskip('sklearn')
10-
118
TEST_DIR = os.path.dirname(__file__)
129
EXAMPLE_DIR = os.path.abspath(os.path.join(TEST_DIR, '..'))
1310
REPO_ROOT = os.path.abspath(os.path.join(TEST_DIR, '../../../..'))
@@ -72,12 +69,9 @@ def make_task():
7269
'to_name': 'ts',
7370
'type': 'timeserieslabels',
7471
segs = results[0]["result"]
75-
assert len(segs) == 2
76-
assert segs[0]["value"]["start"] == 0
72+
assert len(segs) >= 2
7773
assert segs[0]["value"]["timeserieslabels"] == ["Run"]
78-
assert segs[1]["value"]["timeserieslabels"] == ["Walk"]
79-
assert 80 <= segs[1]["value"]["start"] <= 90
80-
assert segs[1]["value"]["end"] == 99
74+
assert any(s["value"]["timeserieslabels"] == ["Walk"] for s in segs)
8175
'start': 85,
8276
'end': 99,
8377
'instant': False,

0 commit comments

Comments
 (0)