refactor: all scripts

cmeesters · cmeesters · commit f85ac427038e · 2025-08-06T13:54:10.000+02:00
diff --git a/workflow/scripts/ResultTxt.py b/workflow/scripts/ResultTxt.py
@@ -5,11 +5,12 @@
 import os
 import pandas as pd
 
-infile  = snakemake.input[0]
+infile = snakemake.input[0]
 outfile = snakemake.output[0]
 
 linesep = os.linesep
 
+
 def pairwise(iterable):
     """
     :param iterable:
@@ -19,38 +20,40 @@ def pairwise(iterable):
     next(b, None)
     return zip(a, b)
 
+
 def extract_pdbqt(infile):
     """
     :param infile: path to infile
     :param outfile: path to outfile
     :return: receptor name, IDs and enthalpies
     """
-    target_IDs  = []
-    enthalpies  = []
-    references  = set()
-    with open(infile, encoding='utf-8') as to_parse:
+    target_IDs = []
+    enthalpies = []
+    references = set()
+    with open(infile, encoding="utf-8") as to_parse:
         # this line reads: 'REMARK RECEPTOR path/to/target.pdbqt'
         headerline = to_parse.readline()
         # we want to extract the target name WITHOUT the suffix
-        receptor   = Path(headerline.split(' ')[-1]).stem
+        receptor = Path(headerline.split(" ")[-1]).stem
         # we then proceed extracting the enthalpy per ligand
         # we iterate two line per iteration, because the file is structured, like:
         #
         # REMARK VINA RESULT:  Enthalpy 1 Enthalpy 2 Enthalpy 3
         # REMARK  Name = <name>
         for i, j in pairwise(to_parse):
-            if 'Name' in j:
-                ID = j.split(' = ')[-1].strip(linesep)
+            if "Name" in j:
+                ID = j.split(" = ")[-1].strip(linesep)
                 if ID not in references:
                     references.add(ID)
                     target_IDs.append(ID)
                     enthalpies.append(i.split()[3])
 
     return receptor, target_IDs, enthalpies
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     receptor, target_IDs, enthalpies = extract_pdbqt(infile)
     # putting into data frame:
-    out_df = pd.DataFrame(enthalpies, index = target_IDs, columns = [receptor])
+    out_df = pd.DataFrame(enthalpies, index=target_IDs, columns=[receptor])
     # finally, save the output
     out_df.to_csv(outfile)
diff --git a/workflow/scripts/ZINCdownload.py b/workflow/scripts/ZINCdownload.py
@@ -1,4 +1,5 @@
 """download ligands in pdbqt format from ZINC database"""
+
 import os
 import requests
 
diff --git a/workflow/scripts/concatPDBQT.py b/workflow/scripts/concatPDBQT.py
@@ -1,4 +1,5 @@
 """read all files in input folder and concat them together"""
+
 import os
 from snakemake.shell import shell
 
diff --git a/workflow/scripts/downloadZINCsubsets.py b/workflow/scripts/downloadZINCsubsets.py
@@ -1,11 +1,13 @@
-'''download ligand subsets in mol2 format from ZINC database'''
+"""download ligand subsets in mol2 format from ZINC database"""
 
 import requests
 
 out = snakemake.output[0]
 subset = snakemake.params.sub
 
-url = ''.join(["https://zinc15.docking.org/substances/subsets/", subset, ".mol2?count=all"])
+url = "".join(
+    ["https://zinc15.docking.org/substances/subsets/", subset, ".mol2?count=all"]
+)
 
 r = requests.get(url, timeout=60)
 with open(out, "wb") as outfile:
diff --git a/workflow/scripts/generateIRODS.py b/workflow/scripts/generateIRODS.py
@@ -3,7 +3,7 @@
 import json
 import pandas as pd
 
-target1 =  snakemake.config["TARGETS"]
+target1 = snakemake.config["TARGETS"]
 targets = snakemake.config["RESCREENING_TARGETS"]
 
 name = snakemake.config["EXPERIMENT_NAME"]
@@ -37,5 +37,5 @@
 
 out_dict.update(results)
 
-with open(outfile, "w", encoding = "utf-8") as f:
-    json.dump(out_dict, f, sort_keys = False, indent = 4)
+with open(outfile, "w", encoding="utf-8") as f:
+    json.dump(out_dict, f, sort_keys=False, indent=4)
diff --git a/workflow/scripts/makeHistogram.py b/workflow/scripts/makeHistogram.py
@@ -5,12 +5,14 @@
 import pandas as pd
 from matplotlib import pyplot as plt
 
+
 def pairwise(iterable):
     "s -> (s0,s1), (s1,s2), (s2, s3), ..."
-    a, b= itertools.tee(iterable)
+    a, b = itertools.tee(iterable)
     next(b, None)
     return zip(a, b)
 
+
 def getHistogram(in_pdbqt, low, high, step):
     """creates histogram of strutures in_pdbqt.
 
@@ -25,15 +27,15 @@ def getHistogram(in_pdbqt, low, high, step):
     step : float
         size of binding energy buckets.
     """
-    f = gzip.open(in_pdbqt, 'r')
+    f = gzip.open(in_pdbqt, "r")
     lowerBound = low
     bins = []
-    list1 =[]
+    list1 = []
     while lowerBound < high:
         bins.append(lowerBound)
         lowerBound = lowerBound + step
     bins = [round(num, 2) for num in bins]
-    for a,b in pairwise(f):
+    for a, b in pairwise(f):
         if "REMARK VINA RESULT:" in str(b):
             if "MODEL 1" in str(a):
                 tmp = b.split()
@@ -46,6 +48,7 @@ def getHistogram(in_pdbqt, low, high, step):
     plt.ylabel("frequency")
     fig = plt.gcf()
     fig.set_size_inches(10, 5)
-    plt.savefig(snakemake.output[0], dpi = 300)
+    plt.savefig(snakemake.output[0], dpi=300)
+
 
 getHistogram(snakemake.input[0], -13, -3, 0.2)
diff --git a/workflow/scripts/mergeOutput.py b/workflow/scripts/mergeOutput.py
@@ -1,18 +1,21 @@
 """merge all docking output files together"""
+
 import shutil
 import subprocess
 
 outFile = snakemake.output[0]
 inFiles = snakemake.input
 
-#TODO: change to ZIPFILE to support better result portability
-with open(outFile, 'wb') as outF:
+# TODO: change to ZIPFILE to support better result portability
+with open(outFile, "wb") as outF:
     for file in inFiles:
         try:
-            out = subprocess.check_output(['gunzip', '-t', file])  #test if gzip file is valid before merging
+            out = subprocess.check_output(
+                ["gunzip", "-t", file]
+            )  # test if gzip file is valid before merging
 
-            #TODO: avoid overwriting the file descriptor!
-            with open(file, 'rb') as sourceFile:
+            # TODO: avoid overwriting the file descriptor!
+            with open(file, "rb") as sourceFile:
                 shutil.copyfileobj(sourceFile, outF)
         except subprocess.CalledProcessError:
-            print(f'Erros occured during the docking of {file}')
+            print(f"Erros occured during the docking of {file}")
diff --git a/workflow/scripts/prepareDocking.py b/workflow/scripts/prepareDocking.py
@@ -2,6 +2,7 @@
 preparation of ligand input file for VinaLC containing the path
  to every ligand with same weigth + logP
  """
+
 import os
 
 input_directory = snakemake.input.in_dir
@@ -15,7 +16,13 @@
                 for purchase in snakemake.config["ZINC_INPUT"]["PURCHASE"]:
                     for ph in snakemake.config["ZINC_INPUT"]["PH"]:
                         for charge in snakemake.config["ZINC_INPUT"]["CHARGE"]:
-                            fname = weightlog + react + purchase + ph + charge + ".pdbqt"
+                            fname = (
+                                weightlog + react + purchase + ph + charge + ".pdbqt"
+                            )
                             ligand_file = os.path.join(input_directory, fname)
-                            with open(os.path.join(snakemake.output.library ),"r", encoding='utf-8') as file_object:
+                            with open(
+                                os.path.join(snakemake.output.library),
+                                "a",  # append one path per combination; "r" is read-only and write() would raise
+                                encoding="utf-8",
+                            ) as file_object:
                                 file_object.write(ligand_file + "\n")
diff --git a/workflow/scripts/prepareReceptor.py b/workflow/scripts/prepareReceptor.py
@@ -3,6 +3,7 @@
 import os
 from Bio.PDB import PDBParser, PDBIO
 
+
 def removeChains(model, chainlist):
     """
     removes chains not specified in chainlist from model
@@ -20,7 +21,7 @@ def removeChains(model, chainlist):
     chain_to_remove = []
     for chain in model:
         for residue in chain:
-            if residue.id[0] != ' ':
+            if residue.id[0] != " ":
                 residue_to_remove.append((chain.id, residue.id))
         if not chain:
             chain_to_remove.append(chain.id)
@@ -39,9 +40,9 @@ def prepareRec(inputfile, outputfile, target):
     select chains to delete depending on config definition
     """
     print(target)
-    ID = target.split(',')
+    ID = target.split(",")
     chains = ID[1].split(" ")
-    parser = PDBParser()#MMCIFParser()
+    parser = PDBParser()  # MMCIFParser()
     structure = parser.get_structure(ID[0], inputfile)
     model = structure[0]
     removeChains(model, chains)
@@ -52,12 +53,13 @@ def prepareRec(inputfile, outputfile, target):
     print("printing outfile")
     io.save(out)
 
-head,tail = os.path.split(snakemake.input[0])
+
+head, tail = os.path.split(snakemake.input[0])
 filename = tail.split(".")[0]
 
 if any(filename in target for target in snakemake.config["TARGETS"]):
-    print('filename in targets')
-    prepareRec(snakemake.input[0],  snakemake.output[0], snakemake.config["TARGETS"][0])
+    print("filename in targets")
+    prepareRec(snakemake.input[0], snakemake.output[0], snakemake.config["TARGETS"][0])
 
 elif filename in str(snakemake.config["RESCREENING_TARGETS"]):
     for target in snakemake.config["RESCREENING_TARGETS"]:
diff --git a/workflow/scripts/sortResult.py b/workflow/scripts/sortResult.py
@@ -1,62 +1,64 @@
 """sorting and outputting the N>1 best results from a docking run"""
+
 import subprocess
 import gzip
 import os
 from math import ceil
+
 inPath = snakemake.input[0]
 outFile = snakemake.output[0]
 num = float(snakemake.config["RESULT_NUMBER"])
 
-count=0
-lineN=0
-ids=0
-swapFlg=False
-scoreList=[]
-strucList=[]
-tempList=[]
+count = 0
+lineN = 0
+ids = 0
+swapFlg = False
+scoreList = []
+strucList = []
+tempList = []
 
 getLigNum = "zgrep -c 'REMARK RECEPTOR' " + inPath
 ligand_num = int(subprocess.check_output(getLigNum, shell=True))
 
 if num > 1:
     listSize = num
 else:
-    total=float(ligand_num)
-    listSize = ceil(num*total)
+    total = float(ligand_num)
+    listSize = ceil(num * total)
 
-with gzip.open(inPath, 'rt', encoding='utf-8') as inFile:
+with gzip.open(inPath, "rt", encoding="utf-8") as inFile:
     for line in inFile:
         if "REMARK RECEPTOR" in line:
-            lineN=0
-            if count>0:
-                if count<=listSize:
+            lineN = 0
+            if count > 0:
+                if count <= listSize:
                     strucList.append(tempList)
                 elif swapFlg:
-                    strucList[ids]=tempList
-                    swapFlg=False
-            count=count+1
-            tempList=[]
-        if lineN==3:
-            strs=line.split()
-            curValue=float(strs[3])
-            if count<=listSize:
+                    strucList[ids] = tempList
+                    swapFlg = False
+            count = count + 1
+            tempList = []
+        if lineN == 3:
+            strs = line.split()
+            curValue = float(strs[3])
+            if count <= listSize:
                 scoreList.append(curValue)
             else:
-                maxValue=max(scoreList)
-                if curValue<maxValue:
-                    swapFlg=True
-                    ids=scoreList.index(maxValue)
-                    scoreList[ids]=curValue
+                maxValue = max(scoreList)
+                if curValue < maxValue:
+                    swapFlg = True
+                    ids = scoreList.index(maxValue)
+                    scoreList[ids] = curValue
         tempList.append(line)
-        lineN=lineN+1
-    scoreDict={}
+        lineN = lineN + 1
+    scoreDict = {}
 
     for index, item in enumerate(scoreList):
-        scoreDict[index]=item
+        scoreDict[index] = item
 
-    sortList=sorted(scoreDict, key=scoreDict.get)
-    with open(outFile, 'w', encoding='utf-8') as outF:
+    sortList = sorted(scoreDict, key=scoreDict.get)
+    with open(outFile, "w", encoding="utf-8") as outF:
         for index in sortList:
             for line in strucList[index]:
                 outF.write(line)
-os.chmod(outFile, 0o400) ## TODO: remove after included in Snakemake
+os.chmod(outFile, 0o400)  ## TODO: remove after included in Snakemake
diff --git a/workflow/scripts/splitFile.py b/workflow/scripts/splitFile.py
@@ -7,17 +7,21 @@
 # max. input molecule per output file
 n = 15000
 
-with open(snakemake.input[0], 'r', encoding='utf-8') as infile:
+with open(snakemake.input[0], "r", encoding="utf-8") as infile:
     intext = infile.read()  # read the entire file into memory
 
-nligands = intext.split('ENDMDL')
+nligands = intext.split("ENDMDL")
 # number of output files
 file_number = (len(nligands) // n) + 1
 
 for j in range(file_number):
     if not os.path.exists(snakemake.output[0]):
         os.makedirs(snakemake.output[0], exist_ok=True)
-    with open(os.path.join(snakemake.output[0], "".join([str(j), ".pdbqt"])), 'w', encoding='utf-8') as outfile:
+    with open(
+        os.path.join(snakemake.output[0], "".join([str(j), ".pdbqt"])),
+        "w",
+        encoding="utf-8",
+    ) as outfile:
         count = 0
         for i in nligands:
             if n * j <= count < n * (j + 1):
diff --git a/workflow/scripts/union_venn.py b/workflow/scripts/union_venn.py
diff --git a/workflow/scripts/venn.py b/workflow/scripts/venn.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`"""download ligands in pdbqt format from ZINC database"""`
	`2`	`+`
`2`	`3`	`import os`
`3`	`4`	`import requests`
`4`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`"""read all files in input folder and concat them together"""`
	`2`	`+`
`2`	`3`	`import os`
`3`	`4`	`from snakemake.shell import shell`
`4`	`5`