Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/releases/unreleased.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## datasets
- Fixed download in Insects dataset. The datasets incremental_abrupt_imbalanced, incremental_imbalanced, incremental_reoccurring_imbalanced and out-of-control are not supported anymore.
- Refactored `benchmarks` and added plotly dependency for interactive plots
- Added the BETH dataset for labeled system process events.

## stats

Expand Down
2 changes: 2 additions & 0 deletions river/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from . import base, synth
from .airline_passengers import AirlinePassengers
from .bananas import Bananas
from .beth import BETH
from .bikes import Bikes
from .chick_weights import ChickWeights
from .credit_card import CreditCard
Expand All @@ -38,6 +39,7 @@
__all__ = [
"AirlinePassengers",
"Bananas",
"BETH",
"base",
"Bikes",
"ChickWeights",
Expand Down
98 changes: 98 additions & 0 deletions river/datasets/beth.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from __future__ import annotations

import itertools
import shutil
import zipfile
from urllib import request

from river import stream, utils

from . import base


class BETH(base.RemoteDataset):
    """BETH dataset of system process events.

    This dataset contains labeled host-based telemetry collected from benign and malicious
    activity. The loader streams the per-host process event CSV files; the target is whether an
    event is labeled as evil. DNS logs as well as the aggregated split files
    (`labelled_training_data.csv`, `labelled_testing_data.csv` and
    `labelled_validation_data.csv`) are skipped, so only the per-host process CSVs are iterated.

    References
    ----------
    [^1]: [BETH dataset on Kaggle](https://www.kaggle.com/katehighnam/beth-dataset)
    [^2]: [Imperial College London data archive](https://data.hpc.imperial.ac.uk/resolve/?doi=9422&file=4&access=)

    """

    def __init__(self):
        super().__init__(
            n_samples=2_666_118,
            n_features=11,
            task=base.BINARY_CLF,
            url="https://data.hpc.imperial.ac.uk/resolve/?doi=9422&file=4&access=",
            size=928_188_305,
            # The archive extracts several CSVs rather than a single file, hence the "." filename.
            filename=".",
        )

    def download(self, force: bool = False, verbose: bool = True):
        """Download and extract the dataset archive into `self.path`.

        Parameters
        ----------
        force
            Re-download even if the dataset is already present on disk.
        verbose
            Print download and extraction progress messages.

        """
        if not force and self.is_downloaded:
            return

        directory = self.path
        directory.mkdir(parents=True, exist_ok=True)
        archive_path = directory.joinpath("full_BETH_dataset.zip")

        with request.urlopen(self.url) as r:
            if verbose:
                meta = r.info()
                try:
                    n_bytes = int(meta["Content-Length"])
                    msg = f"Downloading {self.url} ({utils.pretty.humanize_bytes(n_bytes)})"
                except (KeyError, TypeError, ValueError):
                    # The server may omit Content-Length (the header lookup then yields None,
                    # raising TypeError) or send a non-numeric value (ValueError); fall back to
                    # a message without the size.
                    msg = f"Downloading {self.url}"
                print(msg)

            # Stream the response to disk in chunks rather than loading ~900 MB into memory.
            with open(archive_path, "wb") as f:
                shutil.copyfileobj(r, f)

        if verbose:
            print(f"Uncompressing into {directory}")

        with zipfile.ZipFile(archive_path, "r") as zf:
            zf.extractall(directory)

        # Remove the archive once extracted to free disk space.
        archive_path.unlink()

    def _iter(self):  # type: ignore[override]
        # Parse numeric columns eagerly; the "evil" label column becomes the boolean target.
        converters = {
            "timestamp": float,
            "processId": int,
            "parentProcessId": int,
            "userId": int,
            "eventId": int,
            "argsNum": int,
            "returnValue": int,
            "evil": lambda x: x == "1",
        }

        # Keep only the per-host process CSVs: skip DNS logs and the aggregated
        # train/test/validation split files.
        files = [
            file
            for file in self.path.glob("*.csv")
            if "-dns" not in file.name
            and file.name
            not in {
                "labelled_testing_data.csv",
                "labelled_training_data.csv",
                "labelled_validation_data.csv",
            }
        ]
        # Sort for a deterministic iteration order across runs.
        return itertools.chain.from_iterable(
            stream.iter_csv(
                file,
                target="evil",
                converters=converters,
                # The "sus" (suspicious) column is a second label; drop it from the features.
                drop=["sus"],
                # Some argument fields are very long; raise the csv module's field size limit.
                field_size_limit=1_000_000,
            )
            for file in sorted(files)
        )