Skip to content

Commit 2519852

Browse files
committed
add protein-related code from https://github.com/ChEB-AI/python-chebai
1 parent 7b228b6 commit 2519852

File tree

112 files changed

+24147
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

112 files changed

+24147
-0
lines changed

.gitattributes

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Jupyter notebooks contain images, and tables, and parsing text
2+
# blowing up the total language fraction unrealistically;
3+
# then 'Jupyter notebooks' are suddenly a major part of the repo's languages.
4+
5+
# As they don't want to parse notebooks better
6+
# (wont-fix = https://github.com/github/linguist/issues/3496)
7+
# Simply exclude this file from counting now:
8+
9+
notebooks/*.ipynb linguist-generated=true
10+
stream_viz/tutorial/*.ipynb linguist-generated=true

.github/workflows/black.yml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# Lint workflow: runs the psf/black action on every push and pull request.
name: Lint

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # v4 matches the checkout version used by the other workflows of this
      # commit; v2 runs on a Node.js runtime that GitHub has deprecated.
      - uses: actions/checkout@v4
      - uses: psf/black@stable
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
"""Dump selected constants of ``chebai.preprocessing.reader`` to ``constants.json``.

When executed as a script, the ``constants`` mapping below is serialized as
JSON into ``constants.json`` in the current working directory.
"""

import json

from chebai.preprocessing.reader import (
    CLS_TOKEN,
    EMBEDDING_OFFSET,
    MASK_TOKEN_INDEX,
    PADDING_TOKEN_INDEX,
)

# Constants to export.
# The key names are also used by verify_constants.yml: renaming a key here
# requires the same rename there.
constants = dict(
    EMBEDDING_OFFSET=EMBEDDING_OFFSET,
    CLS_TOKEN=CLS_TOKEN,
    PADDING_TOKEN_INDEX=PADDING_TOKEN_INDEX,
    MASK_TOKEN_INDEX=MASK_TOKEN_INDEX,
)


def _write_json(payload, target="constants.json"):
    """Serialize ``payload`` as JSON into ``target``, overwriting the file."""
    with open(target, "w") as handle:
        handle.write(json.dumps(payload))


if __name__ == "__main__":
    # Write constants to a JSON file
    _write_json(constants)

.github/workflows/test.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
# Unit-test workflow: on every pull request, installs the package and runs the
# unittest suite under tests/unit once per Python version in the matrix.
name: Unittests

on: [pull_request]

jobs:
  build:

    runs-on: ubuntu-latest
    strategy:
      # Do not cancel the remaining matrix jobs when one Python version fails.
      fail-fast: false
      matrix:
        python-version: ["3.9", "3.10", "3.11", "3.12"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          # One invocation upgrades pip together with the build tooling
          # (a separate preceding "pip install --upgrade pip" was redundant).
          python -m pip install --upgrade pip setuptools wheel
          # Install torch from the CPU wheel index.
          python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
          python -m pip install -e .

      - name: Display Python & Installed Packages
        run: |
          python --version
          pip freeze

      - name: Run Unit Tests
        run: python -m unittest discover -s tests/unit -v
        env:
          # NOTE(review): GitHub documents ACTIONS_STEP_DEBUG / ACTIONS_RUNNER_DEBUG
          # as repository secrets or variables; set here they are only plain
          # environment variables of this step - confirm they have the intended effect.
          ACTIONS_STEP_DEBUG: true # Enable debug logs
          ACTIONS_RUNNER_DEBUG: true # Additional debug logs from Github Actions itself
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
name: Check consistency of tokens.txt file

# Define the file paths under `paths` to trigger this check only when specific files are modified.
# This script will then execute checks only on files that have changed, rather than all files listed in `paths`.

# **Note** : To add a new token file for checks, include its path in:
#  - `on` -> `push` and `pull_request` sections
#  - `jobs` -> `check_tokens` -> `steps` -> Set global variable for multiple tokens.txt paths -> `TOKENS_FILES`

# Rule enforced for every changed token file (compared against origin/dev):
#  - no existing line may be deleted, and
#  - new lines may only be appended at the end of the file.

on:
  push:
    paths:
      - "chebai/preprocessing/bin/protein_token/tokens.txt"
      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
  pull_request:
    paths:
      - "chebai/preprocessing/bin/protein_token/tokens.txt"
      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"

jobs:
  check_tokens:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Get list of changed files
        id: changed_files
        run: |
          git fetch origin dev

          # Get the list of changed files compared to origin/dev and save them to a file
          git diff --name-only origin/dev > changed_files.txt

          # Print the names of changed files on separate lines
          echo "Changed files:"
          while IFS= read -r line; do
            echo "Changed File name : $line"
          done < changed_files.txt

      - name: Set global variable for multiple tokens.txt paths
        run: |
          # All token files that need to be checked must be included here too, same as in `paths`.
          TOKENS_FILES=(
            "chebai/preprocessing/bin/protein_token/tokens.txt"
            "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
          )
          # Exported as one space-separated string, so the paths must not contain whitespace.
          echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> "$GITHUB_ENV"

      - name: Process only changed tokens.txt files
        run: |
          # Split the space-separated TOKENS_FILES environment variable into an array
          read -r -a token_paths <<< "${TOKENS_FILES}"

          # Iterate over each token file path
          for TOKENS_FILE_PATH in "${token_paths[@]}"; do
            # -F: literal match (the "." in the path is not a regex wildcard);
            # -x: whole-line match (a longer path containing this one does not count).
            if ! grep -Fxq -- "$TOKENS_FILE_PATH" changed_files.txt; then
              echo "$TOKENS_FILE_PATH was not changed, skipping."
              continue
            fi

            echo "----------------------- Processing $TOKENS_FILE_PATH -----------------------"

            # Diff against origin/dev, which the "Get list of changed files" step already fetched.
            # A failing diff must not fall through to the "validation successful" message.
            if ! git diff origin/dev -- "$TOKENS_FILE_PATH" > tokens_diff.txt; then
              echo "No previous tokens.txt found for $TOKENS_FILE_PATH"
              continue
            fi

            # Keep only the hunk section (from the first "@@" line onwards). This drops the
            # "--- a/..." / "+++ b/..." file headers by position instead of by pattern, so
            # token lines that themselves start with "--" or "++" are still inspected.
            hunk_lines=$(sed -n '/^@@/,$p' tokens_diff.txt)

            # Check for deleted lines (lines starting with '-')
            deleted_lines=$(printf '%s\n' "$hunk_lines" | grep '^-' | sed 's/^-//' || true)
            if [ -n "$deleted_lines" ]; then
              echo "Error: Lines have been deleted from $TOKENS_FILE_PATH."
              # printf with %s prints the token text verbatim (no backslash interpretation).
              printf 'Deleted Lines: \n%s\n' "$deleted_lines"
              exit 1
            fi

            # Check for added lines (lines starting with '+')
            added_lines=$(printf '%s\n' "$hunk_lines" | grep '^+' | sed 's/^+//' || true)
            if [ -z "$added_lines" ]; then
              echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and no new lines were added."
              continue
            fi

            # Count how many lines have been added
            num_added_lines=$(printf '%s\n' "$added_lines" | wc -l)

            # Get last `n` lines (equal to num_added_lines) of tokens.txt
            last_lines=$(tail -n "$num_added_lines" -- "$TOKENS_FILE_PATH")

            # Check if the added lines are at the end of the file
            if [ "$added_lines" != "$last_lines" ]; then
              # Find lines that were added but not appended at the end of the file.
              # diff exits non-zero when its inputs differ, hence the trailing "|| true".
              non_appended_lines=$(diff <(printf '%s\n' "$added_lines") <(printf '%s\n' "$last_lines") | grep '^<' | sed 's/^< //' || true)

              echo "Error: New lines have been added to $TOKENS_FILE_PATH, but they are not at the end of the file."
              printf 'Added lines that are not at the end of the file: \n%s\n' "$non_appended_lines"
              exit 1
            fi

            echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and new lines were correctly appended at the end."
          done
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
name: Verify Constants

# Define the file paths under `paths` to trigger this check only when specific files are modified.
# This script will then execute checks only on files that have changed, rather than all files listed in `paths`.

# **Note** : To add a new file for checks, include its path in:
#  - `on` -> `push` and `pull_request` sections
#  - `jobs` -> `verify-constants` -> `steps` -> Verify constants -> Add a new if else for your file, with check logic inside it.


on:
  push:
    paths:
      - "chebai/preprocessing/reader.py"
  pull_request:
    paths:
      - "chebai/preprocessing/reader.py"

jobs:
  verify-constants:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: [
          # Only use 3.10 as of now
          # "3.9",
          "3.10",
          # "3.11"
        ]

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set PYTHONPATH
        run: echo "PYTHONPATH=$PWD" >> "$GITHUB_ENV"

      - name: Get list of changed files
        id: changed_files
        run: |
          git fetch origin dev

          # Get the list of changed files compared to origin/dev and save them to a file
          git diff --name-only origin/dev > changed_files.txt

          # Print the names of changed files on separate lines
          echo "Changed files:"
          while IFS= read -r line; do
            echo "Changed File name : $line"
          done < changed_files.txt

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        # Setting a fix version for torch due to an error with latest version (2.5.1)
        # ImportError: cannot import name 'T_co' from 'torch.utils.data.dataset'
        run: |
          # One invocation upgrades pip together with the build tooling.
          python -m pip install --upgrade pip setuptools wheel
          python -m pip install torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu
          python -m pip install -e .

      - name: Export constants
        # Writes constants.json into the working directory.
        run: python .github/workflows/export_constants.py

      - name: Load constants into environment variables
        id: load_constants
        # "E_" is prepended as a prefix to every constant name, to avoid overwriting other environment variables with the same name
        run: |
          # Emit one "E_<KEY>=<value>" line per entry of constants.json.
          jq -r 'to_entries|map("E_\(.key)=\(.value|tostring)")|.[]' constants.json >> "$GITHUB_ENV"

      - name: Print all environment variables
        run: printenv

      - name: Verify constants
        run: |
          # Fail the step when a constant differs from its expected value.
          #   $1 - constant name, $2 - actual value, $3 - expected value
          verify_constant() {
            if [ "$2" != "$3" ]; then
              echo "$1 ($2) does not match expected value ($3)!"
              exit 1
            fi
          }

          file_name="chebai/preprocessing/reader.py"
          # -F: literal match, -x: whole line, so only this exact path counts as changed.
          if grep -Fxq -- "$file_name" changed_files.txt; then
            echo "----------------------- Checking file : $file_name ----------------------- "

            # Define expected values for constants
            exp_embedding_offset="10"
            exp_cls_token="2"
            exp_padding_token_index="0"
            exp_mask_token_index="1"

            # Debugging output to check environment variables
            echo "Current Environment Variables:"
            echo "E_EMBEDDING_OFFSET = $E_EMBEDDING_OFFSET"
            echo "Expected: $exp_embedding_offset"

            # Verify constants match expected values
            verify_constant "EMBEDDING_OFFSET" "$E_EMBEDDING_OFFSET" "$exp_embedding_offset"
            verify_constant "CLS_TOKEN" "$E_CLS_TOKEN" "$exp_cls_token"
            verify_constant "PADDING_TOKEN_INDEX" "$E_PADDING_TOKEN_INDEX" "$exp_padding_token_index"
            verify_constant "MASK_TOKEN_INDEX" "$E_MASK_TOKEN_INDEX" "$exp_mask_token_index"
          else
            echo "$file_name not found in changed_files.txt; skipping check."
          fi

0 commit comments

Comments
 (0)