
Commit 54fc604

Setup.py now includes the necessary binaries
+ Added Dockerfile

1 parent e0d758f commit 54fc604

File tree

9 files changed: +84 −18 lines changed


.vscode/launch.json

Lines changed: 20 additions & 0 deletions
@@ -231,6 +231,26 @@
         ]
     },
+    { // Test on test set: Hybrid DCA-LLM ProSST
+      "name": "Python: PyPEF hybrid/only-TS-zero-shot GREMLIN-DCA-ProSST avGFP",
+      "type": "debugpy",
+      "request": "launch",
+      "env": {"PYTHONPATH": "${workspaceFolder}"},
+      "program": "${workspaceFolder}/pypef/main.py",
+      "console": "integratedTerminal",
+      "justMyCode": true,
+      "cwd": "${workspaceFolder}/datasets/AVGFP/",
+      "args": [
+        "hybrid",
+        //"-m", "GREMLIN", // optional, not required
+        "--ts", "TS.fasl",
+        "--params", "GREMLIN",
+        "--llm", "prosst",
+        "--wt", "P42212_F64L.fasta",
+        "--pdb", "GFP_AEQVI.pdb"
+      ]
+    },
     {
       "name": "Python: PyPEF hybrid/only-PS-zero-shot GREMLIN-DCA avGFP",
       "type": "debugpy",
Dockerfile

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+RUN mkdir -p pypef
+
+COPY requirements.txt run.py /app/
+COPY pypef/ /app/pypef/
+
+RUN pip install --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+RUN ["python", "-c", "import torch; print(torch.__version__)"]
+
+EXPOSE 5000
+
+# No ENTRYPOINT defined here, to ease chaining of multiple commands
+# with /bin/bash -c "command1 && command2 ..."
+#ENTRYPOINT ["python", "/app/run.py"]

README.md

Lines changed: 19 additions & 0 deletions
@@ -1,6 +1,7 @@
 ## Table of Contents
 [PyPEF: Pythonic Protein Engineering Framework](#pypef-pythonic-protein-engineering-framework)
 - [Quick Installation](#quick-installation)
+- [Setup and Run Docker Image](#setup-and-run-docker-image)
 - [GUI Installation](#gui-installation)
 - [Requirements](#requirements)
 - [Running Examples](#running-examples)
@@ -67,6 +68,24 @@ pypef --help
 The detailed routine for setting up a new virtual environment with Anaconda, installing the necessary Python packages for that environment, and running the Jupyter notebook tutorial can be found below in the Tutorial section.
 A quick file setup and run test can be performed running files in [scripts/Setup](scripts/Setup) containing a Batch script for Windows and a Bash script for Linux (the latter requires conda, i.e. Miniconda3 or Anaconda3, already being installed).
 
+
+<a name="docker-installation"></a>
+### Setup and Run Docker Image
+
+Build the image using the stored [Dockerfile](./Dockerfile)
+```bash
+docker build -t pypef .  # --progress=plain --no-cache
+```
+
+A chained container command using the built Docker image can be run with e.g.:
+```bash
+docker run --gpus=all -v ./datasets/:/datasets --workdir /datasets/AVGFP pypef /bin/bash -c \
+    "python /app/run.py mklsts --wt P42212_F64L.fasta --input avGFP.csv --ls_proportion 0.01 && \
+    python /app/run.py hybrid --ls LS.fasl --ts TS.fasl --params GREMLIN --llm prosst --wt P42212_F64L.fasta --pdb GFP_AEQVI.pdb"
+```
+
 <a name="gui-installation"></a>
 ### GUI Installation

build_with_pyinstaller.bat

Lines changed: 10 additions & 7 deletions
@@ -1,21 +1,24 @@
 REM Up to now pastes DLLs from local Python environment bin's to _internal...
-REM alternative?: set PATH=%PATH%;%USERPROFILE%\miniconda3\envs\py312\Library\bin\;
+REM alternative?: set PATH=%PATH%;%USERPROFILE%\miniconda3\envs\pypef\Library\bin\;
 pip install -r requirements.txt
 pip install -U pyinstaller pyside6
 pip install -e .
+set PATH=%PATH%;%USERPROFILE%\miniconda3\Scripts
 pyinstaller^
  --console^
  --noconfirm^
+ --collect-data pypef^
+ --collect-all pypef^
  --collect-data torch^
  --collect-data biotite^
  --collect-all biotite^
  --collect-data torch_geometric^
  --collect-all torch_geometric^
  --hidden-import torch_geometric^
- --add-binary=%USERPROFILE%\miniconda3\envs\py312\Library\bin\onedal_thread.3.dll:.^
- --add-binary=%USERPROFILE%\miniconda3\envs\py312\Library\bin\tbbbind.dll:.^
- --add-binary=%USERPROFILE%\miniconda3\envs\py312\Library\bin\tbbbind_2_0.dll:.^
- --add-binary=%USERPROFILE%\miniconda3\envs\py312\Library\bin\tbbbind_2_5.dll:.^
- --add-binary=%USERPROFILE%\miniconda3\envs\py312\Library\bin\tbbmalloc.dll:.^
- --add-binary=%USERPROFILE%\miniconda3\envs\py312\Library\bin\tbbmalloc_proxy.dll:.^
+ --add-binary=%USERPROFILE%\miniconda3\envs\pypef\Library\bin\onedal_thread.3.dll:.^
+ --add-binary=%USERPROFILE%\miniconda3\envs\pypef\Library\bin\tbbbind.dll:.^
+ --add-binary=%USERPROFILE%\miniconda3\envs\pypef\Library\bin\tbbbind_2_0.dll:.^
+ --add-binary=%USERPROFILE%\miniconda3\envs\pypef\Library\bin\tbbbind_2_5.dll:.^
+ --add-binary=%USERPROFILE%\miniconda3\envs\pypef\Library\bin\tbbmalloc.dll:.^
+ --add-binary=%USERPROFILE%\miniconda3\envs\pypef\Library\bin\tbbmalloc_proxy.dll:.^
 gui\PyPEFGUIQtWindow.py

build_with_pyinstaller.sh

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@ pip install -e .
 pyinstaller \
  --console \
  --noconfirm \
+ --collect-data pypef \
+ --collect-all pypef \
  --collect-data torch \
  --collect-data biotite \
  --collect-all biotite \

pypef/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@
 # Journal of Chemical Information and Modeling, 2021, 61, 3463-3476
 # https://doi.org/10.1021/acs.jcim.1c00099
 
-__version__ = '0.4.1'
+__version__ = '0.4.2'

pypef/hybrid/hybrid_model.py

Lines changed: 2 additions & 6 deletions
@@ -396,11 +396,9 @@ def train_llm(self):
         # LoRA training on y_llm_ttrain --> Testing on y_llm_ttest
         x_llm_ttrain_b, scores_ttrain_b = (
             get_batches(self.x_llm_ttrain, batch_size=self.batch_size, dtype=int),
-            #get_batches(self.attn_llm_ttrain, batch_size=self.batch_size, dtype=int),
             get_batches(self.y_ttrain, batch_size=self.batch_size, dtype=float)
         )
 
-        #x_llm_ttest_b = get_batches(self.x_llm_ttest, batch_size=self.batch_size, dtype=int)
         if self.llm_key == 'prosst':
             y_llm_ttest = self.llm_inference_function(
                 xs=self.x_llm_ttest,
@@ -457,8 +455,7 @@ def train_llm(self):
             self.llm_attention_mask,
             self.structure_input_ids,
             n_epochs=50,
-            device=self.device,
-            #seed=self.seed
+            device=self.device
         )
         y_llm_lora_ttrain = self.llm_inference_function(
             xs=self.x_llm_ttrain,
@@ -486,8 +483,7 @@ def train_llm(self):
             self.llm_model,
             self.llm_optimizer,
             n_epochs=5,
-            device=self.device,
-            #seed=self.seed
+            device=self.device
        )
         y_llm_lora_ttrain = self.llm_inference_function(
             xs=x_llm_ttrain_b,
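The hunks above feed token and score arrays through a `get_batches` helper before LoRA training. As a rough illustration only, a batching helper of that shape could look like the following minimal sketch (hypothetical, not PyPEF's actual implementation, whose signature and return type may differ):

```python
import numpy as np

def get_batches(data, batch_size, dtype=float):
    # Hypothetical sketch: split an array-like into consecutive
    # batch_size-sized chunks; the last batch keeps the remainder.
    arr = np.asarray(list(data), dtype=dtype)
    return [arr[i:i + batch_size] for i in range(0, len(arr), batch_size)]

# Ten items in batches of four -> batch sizes [4, 4, 2]
sizes = [len(b) for b in get_batches(range(10), batch_size=4, dtype=int)]
print(sizes)
```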

scripts/ProteinGym_runs/README.md

Lines changed: 8 additions & 3 deletions
@@ -1,8 +1,13 @@
 ## Benchmark runs on publicly available ProteinGym protein variant sequence-fitness datasets
 
-Data is taken (script-based download) from "DMS Assays"-->"Substitutions" and "Multiple Sequence Alignments"-->"DMS Assays" data from https://proteingym.org/download.
-Run the following to download and extract the ProteinGym data and subsequently to get the predictions/the performance on those datasets.
-Based on available GPU/VRAM, variable `MAX_WT_SEQUENCE_LENGTH` in script [run_performance_tests_proteingym_hybrid_dca_llm.py](run_performance_tests_proteingym_hybrid_dca_llm.py) has to adjusted according to available (V)RAM. E.g., results ([results/dca_esm_and_hybrid_opt_results.csv](results/dca_esm_and_hybrid_opt_results.csv), graphically presented on the main page README) were computed with an NVIDIA GeForce RTX 5090 with 32 GB VRAM and setting `MAX_WT_SEQUENCE_LENGTH` to 1000 (GPU power limit set to 520 W):
+Data is taken (script-based download) from
+
+"DMS Assays"-->"Substitutions" and "Multiple Sequence Alignments"-->"DMS Assays" data
+
+from https://proteingym.org/download.
+
+Perform the following steps to download and extract the ProteinGym data and then obtain the predictions/performance for these datasets.
+Depending on the available GPU/VRAM, the variable `MAX_WT_SEQUENCE_LENGTH` in the script [run_performance_tests_proteingym_hybrid_dca_llm.py](run_performance_tests_proteingym_hybrid_dca_llm.py) must be adjusted according to the available (V)RAM. For example, the results ([results/dca_esm_and_hybrid_opt_results.csv](results/dca_esm_and_hybrid_opt_results.csv), shown graphically on the main README page) were calculated with an NVIDIA GeForce RTX 5090 with 32 GB VRAM and the setting `MAX_WT_SEQUENCE_LENGTH = 1000` (GPU power limit set to 520 W):
 
 ```sh
 #python -m pip install -r ../../requirements.txt
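The length cap described in the hunk above can be pictured as a simple pre-filter on wild-type sequence length. A hypothetical sketch (only `MAX_WT_SEQUENCE_LENGTH` comes from the script; the helper name is illustrative):

```python
# Illustrative only: MAX_WT_SEQUENCE_LENGTH is the script's knob for
# limiting GPU memory use; fits_in_memory is a hypothetical helper.
MAX_WT_SEQUENCE_LENGTH = 1000  # lower this on GPUs with less VRAM

def fits_in_memory(wt_sequence: str) -> bool:
    """Skip datasets whose wild-type sequence exceeds the length cap."""
    return len(wt_sequence) <= MAX_WT_SEQUENCE_LENGTH

print(fits_in_memory("M" * 500), fits_in_memory("M" * 1500))
```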

setup.py

Lines changed: 5 additions & 1 deletion
@@ -39,7 +39,11 @@
     url='https://github.com/niklases/PyPEF',
     py_modules=['pypef'],
     packages=find_packages(include=['pypef', 'pypef.*']),
-    package_data={'pypef': ['ml/AAindex/*', 'ml/AAindex/Refined_cluster_indices_r0.93_r0.97/*']},
+    package_data={'pypef': [
+        'ml/AAindex/*',
+        'ml/AAindex/Refined_cluster_indices_r0.93_r0.97/*',
+        'llm/prosst_structure/static/*'
+    ]},
     include_package_data=True,
     install_requires=[cleaned_requirements],
     python_requires='>= 3.10, < 3.13',
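Files shipped via `package_data`, such as those matched by the newly added `llm/prosst_structure/static/*` glob, are typically located at runtime with the standard-library `importlib.resources` API. A minimal, self-contained sketch (demonstrated on the stdlib `json` package, since PyPEF may not be installed; the commented PyPEF call is an assumption based on the glob above):

```python
from importlib import resources

def list_package_files(package: str) -> list[str]:
    # Return the file names bundled with an importable package.
    return sorted(p.name for p in resources.files(package).iterdir() if p.is_file())

# For PyPEF, one would query e.g. (assumed subpackage path):
#   list_package_files("pypef.llm.prosst_structure.static")
print(list_package_files("json"))  # stdlib demo target
```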
