Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions .github/workflows/code_checks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
name: code checks
permissions:
contents: read
pull-requests: write

on:
push:
branches:
- main
paths:
- .pre-commit-config.yaml
- .github/workflows/code_checks.yaml
- '**.py'
- uv.lock
- pyproject.toml
- '**.ipynb'
pull_request:
branches:
- main
paths:
- .pre-commit-config.yaml
- .github/workflows/code_checks.yaml
- '**.py'
- uv.lock
- pyproject.toml
- '**.ipynb'

jobs:
run-code-check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
version: "latest"
enable-cache: true

- name: "Set up Python"
uses: actions/setup-python@v5
with:
python-version-file: ".python-version"

- name: Install the project
run: uv sync --all-extras --dev

- name: Install dependencies and check code
run: |
source .venv/bin/activate
pre-commit run --all-files
26 changes: 0 additions & 26 deletions .github/workflows/static_code_checks.yaml

This file was deleted.

3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,6 @@ target/
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
Expand Down
34 changes: 12 additions & 22 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0 # Use the ref you want to point at
rev: v5.0.0 # Use the ref you want to point at
hooks:
- id: trailing-whitespace
- id: check-ast
Expand All @@ -11,29 +11,19 @@ repos:
- id: end-of-file-fixer
- id: mixed-line-ending
args: [--fix=lf]
- id: requirements-txt-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-byte-order-marker
- id: check-merge-conflict
- id: check-symlinks
- id: check-yaml
args: [--unsafe]
- id: check-toml

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.280
rev: v0.11.4
hooks:
- id: ruff

- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.4.1
hooks:
- id: mypy

- repo: https://github.com/nbQA-dev/nbQA
rev: 1.7.0
hooks:
- id: nbqa-black
- id: nbqa-ruff
- id: nbqa-check-ast
- id: nbqa-mypy
args: [--fix, --exit-non-zero-on-fix]
types_or: [ python, pyi, jupyter ]
- id: ruff-format
types_or: [ python, pyi, jupyter ]
1 change: 1 addition & 0 deletions .python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.9
57 changes: 43 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,56 @@
This repository contains reference implementations of three self-supervised learning
techniques explored during the Vector Institute's Self-Supervised Learning (SSL) Bootcamp.

# Installing dependencies
```
python3 -m venv /path/to/new/virtual/environment/ssl_env
source /path/to/new/virtual/environment/ssl_env/bin/activate
pip install --upgrade pip
pip install -r requirements.txt
```
# Summary of Reference Implementations

If you are on the Vector Institute's Vaughan cluster, the environment is already set up and can be activated with
| Name | Description | Reference Implementation |
|------|-------------|-------|
Internal Contrastive Learning (ICL) + Latent Outlier Exposure (LOE)| ICL learns to maximize the mutual information between two complementary subsets based on the assumption that the relation between a subset of features and the rest of the features is class-dependent. LOE extends ICL to work with contaminated datasets. | [Anomaly Detection in Tabular Data with ICL](src/contrastive_learning/ICL/ICL.ipynb), [Latent Outlier Exposure for Anomaly Detection with Contaminated Data](src/contrastive_learning/LatentOE/LatentOE_Notebook.ipynb)
SimMTM | Reconstructs a time series signal from multiple randomly masked versions. Uses series-wise representation similarity to do a weighted aggregation of point-wise representations before reconstruction. | [Beijing PM2.5 Air Quality Forecasting](src/masked_modelling/simmtm/simmtm-BeijingPM25Quality-forecasting.ipynb)
TabRet | TabRet is a pre-trainable Transformer-based model for tabular data and designed to work on a downstream task that contains columns not seen in pre-training. Unlike other methods, TabRet has an extra learning step before fine-tuning called retokenizing, which calibrates feature embeddings based on the masked autoencoding loss. | [Stroke Prediction with the BRFSS dataset](src/masked_modelling/tabret/TabRet.ipynb)
Data2Vec | Combines masked prediction with self-distillation to predict contextualized latent representations (produced by the teacher network) based on a partial/masked view of the input (given to the student network). | [Image Classification with STL-10 dataset](src/self_distillation/data2vec_vision.ipynb)


# Setting up the environment
Prior to installing the dependencies for this project, it is recommended to install
[uv](https://github.com/astral-sh/uv?tab=readme-ov-file#installation) and create
a virtual environment. You may use any virtual environment management tool
you like, including uv, conda, and virtualenv.

With uv, you can create a virtual environment with the following command:

```bash
uv venv -n --seed --python 3.9 /path/to/new/virtual/environment/ssl_env
```
source /ssd003/projects/aieng/public/ssl_bootcamp_resources/venv/bin/activate
```
This will create a new virtual environment in the specified path.

**Note**: If you are using the Vector Institute's Vaughan cluster, a virtual
environment has already been created for you at `/ssd003/projects/aieng/public/ssl_bootcamp_resources/venv`.

Once you have created a virtual environment, you can activate it with the command:

# Using pre-commit hooks
To check your code at commit time
```
pre-commit install
source /path/to/new/virtual/environment/ssl_env/bin/activate
```

You can also get pre-commit to fix your code
Then, you can install the dependencies for this project with the following command:

```bash
git clone https://github.com/VectorInstitute/SSL-Bootcamp.git
cd SSL-Bootcamp
uv sync --no-cache --active --dev
```
**Note**: The `--active` flag in the above command assumes that you have already
activated your virtual environment. If you prefer not to create a new virtual
environment yourself, you can omit the `--active` flag and uv will create a new virtual environment
for you in the `.venv` directory inside the project root.

## Using pre-commit hooks
To ensure that your code adheres to the project's style and formatting guidelines,
you can use pre-commit hooks to check for common issues, such as code formatting,
linting, and security vulnerabilities. Run the following command before pushing
your code to the repository:

```
pre-commit run --all-files
```
83 changes: 58 additions & 25 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,33 +1,66 @@
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

[tool.black]
line-length = 88

[tool.mypy]
ignore_missing_imports = true
install_types = true
pretty = true
non_interactive = true
disallow_untyped_defs = true
no_implicit_optional = true
check_untyped_defs = true
[project]
name = "ssl-bootcamp"
version = "0.1.0"
description = "Reference implementations for the Vector Institute's self-supervised learning (SSL) bootcamp (2023)"
readme = "README.md"
authors = [{name = "Vector AI Engineering", email = "[email protected]"}]
license = "MIT"
urls.Repository = "https://github.com/VectorInstitute/SSL-Bootcamp"
requires-python = ">=3.9"
dependencies = [
"copulas>=0.12.0",
"deepod>=0.4.1",
"ipykernel>=6.29.5",
"lightning==2.0.6",
"lightning-bolts>=0.7.0",
"matplotlib>=3.7.5",
"notebook>=7.3.3",
"numpy>=1.24.4",
"optuna>=4.2.1",
"pandas>=2.0.3",
"pyod>=2.0.4",
"pytorch-tabular==1.0.2",
"rtdl==0.0.13",
"scikit-learn>=1.3.2",
"scipy>=1.10.1",
"timm==0.9.2",
"toml>=0.10.2",
"torch==1.13.0",
"torchvision==0.14.0",
"transformers==4.31.0",
"transtab>=0.0.5",
"wandb>=0.19.9",
"xgboost>=2.1.4",
]

[dependency-groups]
dev = [
"pre-commit>=3.5.0",
"ruff>=0.11.4",
]

[tool.ruff]
select = ["B", "C", "D", "E", "F", "I", "W"]
line-length = 88
include = ["*.py", "pyproject.toml", "*.ipynb"]
line-length = 119

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
docstring-code-format = true

[tool.ruff.isort]
[tool.ruff.lint]
select = ["A", "B", "C", "E", "F", "I", "W"]
fixable = ["A", "B", "COM", "C", "C4", "RET", "SIM", "ICN", "Q", "RSE", "E", "F", "I", "W", "PL"]
ignore = ["E501", "C901"]

[tool.ruff.lint.isort]
lines-after-imports = 2

[tool.nbqa.addopts]
ruff = [
"--ignore=D100,D203,D211,D212,D213,D401",
"--fix",
"--line-length=119",
]
black = ["--line-length=119"]
[tool.ruff.lint.pycodestyle]
max-doc-length = 119

[tool.ruff.lint.pydocstyle]
convention = "numpy"

[tool.nbqa.md]
blacken-docs = true
32 changes: 0 additions & 32 deletions requirements.txt

This file was deleted.

Loading