Skip to content

Commit 919064d

Browse files
authored
Merge pull request #2 from ChEB-AI/main
From `main` branch to `dev`. Delete `main` after merge.
2 parents 7b228b6 + 4c2971c commit 919064d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+16932
-2
lines changed

.gitattributes

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Jupyter notebooks contain images and tables, and parsing their text
2+
# blows up the total language fraction unrealistically;
3+
# 'Jupyter notebooks' then suddenly become a major part of the repo's languages.
4+
5+
# As the Linguist maintainers do not plan to parse notebooks better
6+
# (wont-fix = https://github.com/github/linguist/issues/3496)
7+
# simply exclude these files from the language statistics for now:
8+
9+
notebooks/*.ipynb linguist-generated=true
10+
stream_viz/tutorial/*.ipynb linguist-generated=true

.github/workflows/black.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
name: Lint
2+
3+
on: [push, pull_request]
4+
5+
jobs:
6+
lint:
7+
runs-on: ubuntu-latest
8+
steps:
9+
- uses: actions/checkout@v2
10+
- uses: psf/black@stable

.github/workflows/test.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
name: Unittests
2+
3+
on: [pull_request]
4+
5+
jobs:
6+
build:
7+
8+
runs-on: ubuntu-latest
9+
strategy:
10+
fail-fast: false
11+
matrix:
12+
python-version: ["3.9", "3.10", "3.11", "3.12"]
13+
14+
steps:
15+
- uses: actions/checkout@v4
16+
17+
- name: Set up Python ${{ matrix.python-version }}
18+
uses: actions/setup-python@v5
19+
with:
20+
python-version: ${{ matrix.python-version }}
21+
22+
- name: Install dependencies
23+
run: |
24+
python -m pip install --upgrade pip
25+
python -m pip install --upgrade pip setuptools wheel
26+
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
27+
python -m pip install -e .
28+
29+
- name: Display Python & Installed Packages
30+
run: |
31+
python --version
32+
pip freeze
33+
34+
- name: Run Unit Tests
35+
run: python -m unittest discover -s tests/unit -v
36+
env:
37+
ACTIONS_STEP_DEBUG: true # Enable debug logs
38+
ACTIONS_RUNNER_DEBUG: true # Additional debug logs from Github Actions itself
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
name: Check consistency of tokens.txt file
2+
3+
# Define the file paths under `paths` to trigger this check only when specific files are modified.
4+
# This script will then execute checks only on files that have changed, rather than all files listed in `paths`.
5+
6+
# **Note** : To add a new token file for checks, include its path in:
7+
# - `on` -> `push` and `pull_request` sections
8+
# - `jobs` -> `check_tokens` -> `steps` -> Set global variable for multiple tokens.txt paths -> `TOKENS_FILES`
9+
10+
on:
11+
push:
12+
paths:
13+
- "chebai/preprocessing/bin/protein_token/tokens.txt"
14+
- "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
15+
pull_request:
16+
paths:
17+
- "chebai/preprocessing/bin/protein_token/tokens.txt"
18+
- "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
19+
20+
jobs:
21+
check_tokens:
22+
runs-on: ubuntu-latest
23+
24+
steps:
25+
- name: Checkout code
26+
uses: actions/checkout@v2
27+
28+
- name: Get list of changed files
29+
id: changed_files
30+
run: |
31+
git fetch origin dev
32+
33+
# Get the list of changed files compared to origin/dev and save them to a file
34+
git diff --name-only origin/dev > changed_files.txt
35+
36+
# Print the names of changed files on separate lines
37+
echo "Changed files:"
38+
while read -r line; do
39+
echo "Changed File name : $line"
40+
done < changed_files.txt
41+
42+
- name: Set global variable for multiple tokens.txt paths
43+
run: |
44+
# All token files that need to be checked must also be listed here, same as in `paths`.
45+
TOKENS_FILES=(
46+
"chebai/preprocessing/bin/protein_token/tokens.txt"
47+
"chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
48+
)
49+
echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> $GITHUB_ENV
50+
51+
- name: Process only changed tokens.txt files
52+
run: |
53+
# Convert the TOKENS_FILES environment variable into an array
54+
TOKENS_FILES=(${TOKENS_FILES})
55+
56+
# Iterate over each token file path
57+
for TOKENS_FILE_PATH in "${TOKENS_FILES[@]}"; do
58+
# Check if the current token file path is in the list of changed files
59+
if grep -q "$TOKENS_FILE_PATH" changed_files.txt; then
60+
echo "----------------------- Processing $TOKENS_FILE_PATH -----------------------"
61+
62+
# Diff the current tokens.txt against its version on origin/dev
63+
git fetch origin dev
64+
git diff origin/dev -- $TOKENS_FILE_PATH > tokens_diff.txt || echo "No previous tokens.txt found for $TOKENS_FILE_PATH"
65+
66+
# Check for deleted or added lines in tokens.txt
67+
if [ -f tokens_diff.txt ]; then
68+
69+
# Check for deleted lines (lines starting with '-')
70+
deleted_lines=$(grep '^-' tokens_diff.txt | grep -v '^---' | sed 's/^-//' || true)
71+
if [ -n "$deleted_lines" ]; then
72+
echo "Error: Lines have been deleted from $TOKENS_FILE_PATH."
73+
echo -e "Deleted Lines: \n$deleted_lines"
74+
exit 1
75+
fi
76+
77+
# Check for added lines (lines starting with '+')
78+
added_lines=$(grep '^+' tokens_diff.txt | grep -v '^+++' | sed 's/^+//' || true)
79+
if [ -n "$added_lines" ]; then
80+
81+
# Count how many lines have been added
82+
num_added_lines=$(echo "$added_lines" | wc -l)
83+
84+
# Get last `n` lines (equal to num_added_lines) of tokens.txt
85+
last_lines=$(tail -n "$num_added_lines" $TOKENS_FILE_PATH)
86+
87+
# Check if the added lines are at the end of the file
88+
if [ "$added_lines" != "$last_lines" ]; then
89+
90+
# Find lines that were added but not appended at the end of the file
91+
non_appended_lines=$(diff <(echo "$added_lines") <(echo "$last_lines") | grep '^<' | sed 's/^< //')
92+
93+
echo "Error: New lines have been added to $TOKENS_FILE_PATH, but they are not at the end of the file."
94+
echo -e "Added lines that are not at the end of the file: \n$non_appended_lines"
95+
exit 1
96+
fi
97+
fi
98+
99+
if [ "$added_lines" == "" ]; then
100+
echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and no new lines were added."
101+
else
102+
echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and new lines were correctly appended at the end."
103+
fi
104+
else
105+
echo "No previous version of $TOKENS_FILE_PATH found."
106+
fi
107+
else
108+
echo "$TOKENS_FILE_PATH was not changed, skipping."
109+
fi
110+
done

.gitignore

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
docs/build/
74+
75+
# PyBuilder
76+
.pybuilder/
77+
target/
78+
79+
# Jupyter Notebook
80+
.ipynb_checkpoints
81+
82+
# IPython
83+
profile_default/
84+
ipython_config.py
85+
86+
# pyenv
87+
# For a library or package, you might want to ignore these files since the code is
88+
# intended to run in multiple environments; otherwise, check them in:
89+
# .python-version
90+
91+
# pipenv
92+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
94+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
95+
# install all needed dependencies.
96+
#Pipfile.lock
97+
98+
# poetry
99+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100+
# This is especially recommended for binary packages to ensure reproducibility, and is more
101+
# commonly ignored for libraries.
102+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103+
#poetry.lock
104+
105+
# pdm
106+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107+
#pdm.lock
108+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109+
# in version control.
110+
# https://pdm.fming.dev/#use-with-ide
111+
.pdm.toml
112+
113+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114+
__pypackages__/
115+
116+
# Celery stuff
117+
celerybeat-schedule
118+
celerybeat.pid
119+
120+
# SageMath parsed files
121+
*.sage.py
122+
123+
# Environments
124+
.env
125+
.venv
126+
env/
127+
venv/
128+
ENV/
129+
env.bak/
130+
venv.bak/
131+
132+
# Spyder project settings
133+
.spyderproject
134+
.spyproject
135+
136+
# Rope project settings
137+
.ropeproject
138+
139+
# mkdocs documentation
140+
/site
141+
142+
# mypy
143+
.mypy_cache/
144+
.dmypy.json
145+
dmypy.json
146+
147+
# Pyre type checker
148+
.pyre/
149+
150+
# pytype static type analyzer
151+
.pytype/
152+
153+
# Cython debug symbols
154+
cython_debug/
155+
156+
# PyCharm
157+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159+
# and can be added to the global gitignore or merged into this file. For a more nuclear
160+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
161+
#.idea/
162+
163+
# configs/ # commented out, as new configs can be added as part of a feature
164+
165+
/.idea
166+
/data
167+
/logs
168+
/results_buffer
169+
electra_pretrained.ckpt
170+
.jupyter
171+
.virtual_documents
172+
.isort.cfg

.pre-commit-config.yaml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
repos:
2+
- repo: https://github.com/psf/black
3+
rev: "24.2.0"
4+
hooks:
5+
- id: black
6+
- id: black-jupyter # for formatting jupyter-notebook
7+
8+
- repo: https://github.com/pycqa/isort
9+
rev: 5.13.2
10+
hooks:
11+
- id: isort
12+
name: isort (python)
13+
args: ["--profile=black"]
14+
15+
- repo: https://github.com/asottile/seed-isort-config
16+
rev: v2.2.0
17+
hooks:
18+
- id: seed-isort-config
19+
20+
- repo: https://github.com/pre-commit/pre-commit-hooks
21+
rev: v4.6.0
22+
hooks:
23+
- id: check-yaml
24+
- id: end-of-file-fixer
25+
- id: trailing-whitespace

0 commit comments

Comments
 (0)