-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup_databases.sh
More file actions
executable file
·318 lines (259 loc) · 9.66 KB
/
setup_databases.sh
File metadata and controls
executable file
·318 lines (259 loc) · 9.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/bin/bash -e
# Modified version of colabfold database setup script for UoD cluster
# This is intended to be qsubbed to an A40 node so the mmseqs2 indexes
# are built on the same nodes as used for execution, then distributed
# to the remaining nodes
#
# The '#$' lines below are SGE directives consumed by qsub (merged
# stdout/stderr, log path, rhel9 job class, pinned A40 host, one GPU);
# they are not ordinary shell comments and must not be reworded.
#$ -j y
#$ -o colabfold_db_logs/$JOB_NAME.$JOB_ID
#$ -jc rhel9
#$ -mods l_hard hostname gpu-36.compute.dundee.ac.uk
#$ -adds l_hard gpu 1
#$ -adds l_hard cuda.0.name 'NVIDIA A40'
# '-x' traces every command into the job log; '-e' duplicates the shebang
# flag so errexit also holds when run as 'bash setup_databases.sh'.
set -ex
VERSION="1.5.5"
# Current A40 nodes are gpu-34-gpu38 and gpu-40-gpu52
NODES_1=$(seq 34 38)
NODES_2=$(seq 40 52)
# Intentional word-splitting: flatten both seq outputs into one array
# of bare node numbers.
NODES=($NODES_1 $NODES_2)
if [[ "$USER" != "dbadmin" ]]; then
echo "Please submit this script as the dbadmin user"
exit 1
fi
source ~/miniconda3/etc/profile.d/conda.sh
conda activate mmseqs2
echo "CONDA_PREFIX=$CONDA_PREFIX"
echo "HOSTNAME=$HOSTNAME"
# Parallel connections per server for aria2c downloads.
ARIA_NUM_CONN=8
# NOTE(review): these read positional args $2-$4 but $1 is never used,
# and PDB_SERVER/PDB_PORT/PDB_AWS_* are not referenced later in this
# file -- confirm the intended calling convention / whether they are
# leftovers from the upstream ColabFold setup script.
PDB_SERVER="${2:-"rsync.wwpdb.org::ftp"}"
PDB_PORT="${3:-"33444"}"
PDB_AWS_DOWNLOAD="${4:-}"
PDB_AWS_SNAPSHOT="20240101"
UNIREF30DB="${UNIREF30DB:-"uniref30_2302"}"
CFDB="${CFDB:-"colabfold_envdb_202108"}"
db_dir="/opt/data/colabfold/${VERSION}"
# Wipe-and-rebuild is deliberately disabled; the *_READY marker files
# inside download_colabfold_db make re-runs resumable instead.
#if [ ! -z $db_dir ] && [ -d "$db_dir" ]; then
# rm -r ${db_dir}
#fi
######################################################################
#
# downloadFile
#
# Downloads the specified URL to the given output path with aria2c,
# honouring the http_proxy environment variable. A failed download is
# logged but does not abort the script ('set -e' is suspended for the
# duration of the attempt).
#
# Required params:
# URL: URL to download
# OUTPUT: Output filename (may include a directory component)
#
# Globals read: http_proxy, ARIA_NUM_CONN
#
# Returns: 0 always; prints an error message on failure
#
######################################################################
downloadFile() {
local URL="$1"
local OUTPUT="$2"
set +e
local FILENAME DIR
FILENAME=$(basename "${OUTPUT}")
DIR=$(dirname "${OUTPUT}")
# Log the exact command for the job log ('set -x' also traces it).
echo "aria2c --all-proxy=${http_proxy} --max-connection-per-server=${ARIA_NUM_CONN} --allow-overwrite=true -o ${FILENAME} -d ${DIR} ${URL}"
# BUG FIX: this previously read '--all-proxy={http_proxy}' (missing '$'),
# which passed the literal string '{http_proxy}' as the proxy URL and
# broke every download that needed the proxy.
aria2c --all-proxy="${http_proxy}" --max-connection-per-server="${ARIA_NUM_CONN}" --allow-overwrite=true -o "${FILENAME}" -d "${DIR}" "${URL}" && set -e && return 0
set -e
echo "Could not download $URL to $OUTPUT" >&2
}
######################################################################
#
# download_colabfold_db
#
# Downloads the prebuilt ColabFold/MMseqs2 databases into $db_dir and
# builds GPU+CPU-compatible search indexes. Each stage drops a *_READY
# marker file in $db_dir so a re-run after a failure skips the stages
# that already completed.
#
# Required args: None
# Globals read: db_dir, UNIREF30DB, CFDB, FAST_PREBUILT_DATABASES
# Returns: None
#
######################################################################
download_colabfold_db() {
mkdir -p $db_dir
cd $db_dir
# Make MMseqs2 merge the databases to avoid spamming the folder with files
export MMSEQS_FORCE_MERGE=1
GPU=1
# NOTE(review): GPU_PAR is assigned but never used below -- confirm
# whether a '--gpu 1' flag is missing from one of the mmseqs calls.
GPU_PAR="--gpu 1"
GPU_INDEX_PAR=" --split 1 --index-subset 2"
# Stage 1: fetch all archives (resumed/skipped if DOWNLOADS_READY exists).
if [ ! -f DOWNLOADS_READY ]; then
# new prebuilt GPU+CPU databases, that don't require calling tsv2exprofiledb
downloadFile "https://opendata.mmseqs.org/colabfold/${UNIREF30DB}.db.tar.gz" "${UNIREF30DB}.tar.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/${CFDB}.db.tar.gz" "${CFDB}.tar.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/uniref30_2302_newtaxonomy.tar.gz" "uniref30_2302_newtaxonomy.tar.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/pdb100_230517.fasta.gz" "pdb100_230517.fasta.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/pdb100_foldseek_230517.tar.gz" "pdb100_foldseek_230517.tar.gz"
touch DOWNLOADS_READY
fi
# Stage 2: mirror PDB mmCIF files from the local cluster mirror rather
# than downloading them from wwPDB.
if [ ! -f PDB_MMCIF_READY ]; then
mkdir -p pdb/divided
mkdir -p pdb/obsolete
rsync -av --delete /cluster/gjb_lab/db/NOBACK/mirrors/pdb/data/structures/divided/mmCIF pdb/divided
rsync -av --delete /cluster/gjb_lab/db/NOBACK/mirrors/pdb/data/structures/obsolete/mmCIF pdb/obsolete
touch PDB_MMCIF_READY
fi
# Stage 3: unpack UniRef30, build its index, and patch in the rebuilt
# taxonomy/mapping files.
if [ ! -f UNIREF30_READY ]; then
tar -xzvf "${UNIREF30DB}.tar.gz"
mmseqs createindex "${UNIREF30DB}_db" tmp1 --remove-tmp-files 1 ${GPU_INDEX_PAR}
# replace mapping and taxonomy with rebuilt versions, see:
# https://github.com/sokrypton/ColabFold/wiki/MSA-Server-Database-History#2025-08-04-updated-uniref100_2302-taxonomypairing-files
if [ -e "uniref30_2302_newtaxonomy.tar.gz" ]; then
tar -xzvf "uniref30_2302_newtaxonomy.tar.gz"
fi
if [ -e ${UNIREF30DB}_db_mapping ]; then
# create binary, mmap-able taxonomy mapping, saves a few seconds of load time during pairing
TAXHEADER=$(od -An -N4 -t x4 "${UNIREF30DB}_db_mapping" | tr -d ' ')
# check if the file is already binary, it has a binary-encoded header that spells TAXM if so
if [ "${TAXHEADER}" != "0c170013" ]; then
mmseqs createbintaxmapping "${UNIREF30DB}_db_mapping" "${UNIREF30DB}_db_mapping.bin"
mv -f -- "${UNIREF30DB}_db_mapping.bin" "${UNIREF30DB}_db_mapping"
fi
# expose mapping/taxonomy under the .idx_* names the index expects
ln -sf ${UNIREF30DB}_db_mapping ${UNIREF30DB}_db.idx_mapping
fi
if [ -e ${UNIREF30DB}_db_taxonomy ]; then
ln -sf ${UNIREF30DB}_db_taxonomy ${UNIREF30DB}_db.idx_taxonomy
fi
touch UNIREF30_READY
fi
# Stage 4: unpack and index the ColabFold environmental database.
if [ ! -f COLABDB_READY ]; then
tar -xzvf "${CFDB}.tar.gz"
mmseqs createindex "${CFDB}_db" tmp2 --remove-tmp-files 1 ${GPU_INDEX_PAR}
touch COLABDB_READY
fi
# Stage 5: build the pdb100 sequence database (padded variant for GPU).
if [ ! -f PDB_READY ]; then
# for consistency with the other prebuilt databases
# make pdb also compatible with both gpu and cpu
if [ -n "${GPU}" ] || [ "${FAST_PREBUILT_DATABASES}" = "1" ]; then
mmseqs createdb pdb100_230517.fasta.gz pdb100_230517_tmp
mmseqs makepaddedseqdb pdb100_230517_tmp pdb100_230517
mmseqs rmdb pdb100_230517_tmp
else
mmseqs createdb pdb100_230517.fasta.gz pdb100_230517
fi
touch PDB_READY
fi
# Stage 6: extract only the a3m ffindex pair from the foldseek archive.
if [ ! -f PDB100_READY ]; then
tar -xzvf pdb100_foldseek_230517.tar.gz pdb100_a3m.ffdata pdb100_a3m.ffindex
touch PDB100_READY
fi
# Clean up archives and mmseqs temp dirs once everything is built.
rm -f "${UNIREF30DB}.tar.gz"
rm -f "${CFDB}.tar.gz"
rm -f "pdb100_230517.fasta.gz"
rm -f "pdb100_foldseek_230517.tar.gz"
rm -f "uniref30_2302_newtaxonomy.tar.gz"
rm -rf tmp*
}
######################################################################
#
# create_wrapper
#
# Writes a self-contained job script (into $TMPDIR) that rsyncs the
# colabfold database tree from one node to another, for submission
# with qsub. The conda activation step makes this too complex for
# 'qsub -b y', hence a generated wrapper script.
#
# Required parameters:
# source_node: hostname to sync from
# target_node: hostname to sync to
# hold: jid to submit hold_jid for (may be empty)
#
# Returns:
# path to wrapper script (on stdout)
#
######################################################################
create_wrapper() {
local src_host=$1
local dst_host=$2
local hold_jid=$3
local hold_directive=""
if [[ -n "$hold_jid" ]]; then
hold_directive="## -hold_jid $hold_jid"
fi
local out_script="${TMPDIR}/sync_${dst_host}.sh"
# SGE directives are emitted as '##' (not '#$') so this generator can
# itself run under qsub; they are rewritten to '#$' by the sed below.
cat > "$out_script" <<EOF
#!/bin/env bash
## -mods l_hard hostname ${dst_host}
## -N colabfold_mirror
## -j y
## -jc rhel9
## -o $HOME/colabfold_db_logs/\$JOB_NAME.\$JOB_ID
$hold_directive
source ~/miniconda3/etc/profile.d/conda.sh
conda activate mmseqs2
rsync -e 'ssh -oStrictHostKeyChecking=no' --rsync-path=$CONDA_PREFIX/bin/rsync --delete -av $src_host:/opt/data/colabfold/ /opt/data/colabfold
find /opt/data/colabfold -type d -exec chmod 0755 {} \;
find /opt/data/colabfold -type f -exec chmod 0644 {} \;
EOF
# Turn the placeholder '##' markers into real '#$' SGE directives.
sed -i 's/##/#$/' "$out_script"
printf '%s\n' "$out_script"
}
######################################################################
#
# distribute_db
#
# Shares database across appropriate GPU nodes via qsubbed rsync jobs
# The database is pulled onto two nodes for each source node, so
# the initial transfer is submitted to run immediately, but subsequent
# dependent jobs which require a previous transfer to complete are
# submitted with '-hold_jid' so they will not run until the database has
# mirrored to their source node
#
# Required args: None
# Globals read: NODES
# Returns: None
#
######################################################################
distribute_db() {
# We need to exlude the current host since we already have the database
# and are going to use this as a starting point...
source_node=$(hostname -s)
cur_node=$(echo $source_node|sed -r 's/gpu-([0-9]+)/\1/')
# Pattern-substitution blanks the current node's entry rather than
# removing it; the empty element disappears later when the array is
# expanded unquoted in the for loops below.
nodes=( "${NODES[@]/$cur_node}" )
# Now we need to reorder the node list so that any nodes which are down
# appear last on the list, then the jobs assigned to them will hold until
# the nodes return, rather than having live nodes waiting on a job to complete
# on a node which is down
declare -a bad_nodes
for node in ${nodes[@]}; do
nodename=$(printf "gpu-%s.compute.dundee.ac.uk" "$node")
# qhost will return '-' in NLOAD field if a node is offline or uncommunicative...
status=$( qhost -h $nodename |tail -n +4|awk '{print $7}')
if [[ "$status" == '-' ]]; then
echo "Warning: $nodename is down..."
bad_nodes+=($node)
fi
done
# remove 'bad nodes' from the list and append them again
# at the end to ensure they are not dependancies for other jobs
for bad_node in ${bad_nodes[@]}; do
nodes=( "${nodes[@]/$bad_node}" )
nodes+=($bad_node)
done
declare -a submitted_nodes # list of nodes for which jobs have been prepared
declare -A node_jobs # associative array mapping node name to job id, for determining jid to hold
# 'submission' tracks the number of jobs submitted from each node
submission=-1
# sub_level tracks index of submitted nodes to identify correct source node
sub_level=1
for node in ${nodes[@]};do
submission=$((submission+1))
target_node=$(printf "gpu-%s.compute.dundee.ac.uk" "$node")
submitted_nodes+=($target_node)
# change the source node when we have prepared jobs for two nodes...
if [[ "$submission" == 2 ]]; then
submission=0
source_node=${submitted_nodes[0]}
# NOTE(review): this assignment collapses the submitted_nodes array to
# a single scalar (and ':-1' defaults it to '1' when the index is
# empty) -- confirm this is the intended fan-out bookkeeping and not
# meant to be an array slice.
submitted_nodes=$(echo ${submitted_nodes[$sub_level]:-1})
sub_level=$(($sub_level+1))
fi
echo "$source_node -> $target_node"
# hold on the job that populates our source node (empty for the first
# two transfers, which pull directly from the build host)
hold=${node_jobs[$source_node]}
script=$(create_wrapper $source_node $target_node $hold)
# 'return' here is an ordinary variable holding qsub's output line,
# e.g. "Your job 12345 (...) has been submitted"; field 3 is the jid.
return=$(qsub $script)
job_id=$(echo $return|cut -f3 -d' ')
echo "job_id=$job_id"
node_jobs[$target_node]=$job_id
done
}
# Build the databases on this node; distribution to the other A40 nodes
# is currently disabled -- uncomment once the local build is verified.
download_colabfold_db
#distribute_db