-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup_databases.sh
More file actions
executable file
·318 lines (259 loc) · 9.66 KB
/
setup_databases.sh
File metadata and controls
executable file
·318 lines (259 loc) · 9.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#!/bin/bash -e
# Modified version of colabfold database setup script for UoD cluster
# This is intended to be qsubbed to an A40 node so the mmseqs2 indexes
# are built on the same nodes as used for execution, then distributed
# to the remaining nodes
#
# The '#$' lines below are SGE directives consumed by qsub (merged
# stdout/stderr, log path, rhel9 job class, pinned A40 host, one GPU);
# they are not ordinary shell comments and must not be reworded.
#$ -j y
#$ -o colabfold_db_logs/$JOB_NAME.$JOB_ID
#$ -jc rhel9
#$ -mods l_hard hostname gpu-36.compute.dundee.ac.uk
#$ -adds l_hard gpu 1
#$ -adds l_hard cuda.0.name 'NVIDIA A40'
# '-x' traces every command into the job log; '-e' duplicates the shebang
# flag so errexit also holds when run as 'bash setup_databases.sh'.
set -ex
VERSION="1.5.5"
# Current A40 nodes are gpu-34-gpu38 and gpu-40-gpu52
NODES_1=$(seq 34 38)
NODES_2=$(seq 40 52)
# Intentional word-splitting: flatten both seq outputs into one array
# of bare node numbers.
NODES=($NODES_1 $NODES_2)
if [[ "$USER" != "dbadmin" ]]; then
echo "Please submit this script as the dbadmin user"
exit 1
fi
source ~/miniconda3/etc/profile.d/conda.sh
conda activate mmseqs2
echo "CONDA_PREFIX=$CONDA_PREFIX"
echo "HOSTNAME=$HOSTNAME"
# Parallel connections per server for aria2c downloads.
ARIA_NUM_CONN=8
# NOTE(review): these read positional args $2-$4 but $1 is never used,
# and PDB_SERVER/PDB_PORT/PDB_AWS_* are not referenced later in this
# file -- confirm the intended calling convention / whether they are
# leftovers from the upstream ColabFold setup script.
PDB_SERVER="${2:-"rsync.wwpdb.org::ftp"}"
PDB_PORT="${3:-"33444"}"
PDB_AWS_DOWNLOAD="${4:-}"
PDB_AWS_SNAPSHOT="20240101"
UNIREF30DB="${UNIREF30DB:-"uniref30_2302"}"
CFDB="${CFDB:-"colabfold_envdb_202108"}"
db_dir="/opt/data/colabfold/${VERSION}"
# Wipe-and-rebuild is deliberately disabled; the *_READY marker files
# inside download_colabfold_db make re-runs resumable instead.
#if [ ! -z $db_dir ] && [ -d "$db_dir" ]; then
# rm -r ${db_dir}
#fi
######################################################################
#
# downloadFile
#
# Downloads the specified URL to the given output path with aria2c,
# honouring the http_proxy environment variable. A failed download is
# logged but does not abort the script ('set -e' is suspended for the
# duration of the attempt).
#
# Required params:
# URL: URL to download
# OUTPUT: Output filename (may include a directory component)
#
# Globals read: http_proxy, ARIA_NUM_CONN
#
# Returns: 0 always; prints an error message on failure
#
######################################################################
downloadFile() {
local URL="$1"
local OUTPUT="$2"
set +e
local FILENAME DIR
FILENAME=$(basename "${OUTPUT}")
DIR=$(dirname "${OUTPUT}")
# Log the exact command for the job log ('set -x' also traces it).
echo "aria2c --all-proxy=${http_proxy} --max-connection-per-server=${ARIA_NUM_CONN} --allow-overwrite=true -o ${FILENAME} -d ${DIR} ${URL}"
# BUG FIX: this previously read '--all-proxy={http_proxy}' (missing '$'),
# which passed the literal string '{http_proxy}' as the proxy URL and
# broke every download that needed the proxy.
aria2c --all-proxy="${http_proxy}" --max-connection-per-server="${ARIA_NUM_CONN}" --allow-overwrite=true -o "${FILENAME}" -d "${DIR}" "${URL}" && set -e && return 0
set -e
echo "Could not download $URL to $OUTPUT" >&2
}
######################################################################
#
# download_colabfold_db
#
# Downloads the prebuilt ColabFold/MMseqs2 databases into $db_dir and
# builds GPU+CPU-compatible search indexes. Each stage drops a *_READY
# marker file in $db_dir so a re-run after a failure skips the stages
# that already completed.
#
# Required args: None
# Globals read: db_dir, UNIREF30DB, CFDB, FAST_PREBUILT_DATABASES
# Returns: None
#
######################################################################
download_colabfold_db() {
mkdir -p $db_dir
cd $db_dir
# Make MMseqs2 merge the databases to avoid spamming the folder with files
export MMSEQS_FORCE_MERGE=1
GPU=1
# NOTE(review): GPU_PAR is assigned but never used below -- confirm
# whether a '--gpu 1' flag is missing from one of the mmseqs calls.
GPU_PAR="--gpu 1"
GPU_INDEX_PAR=" --split 1 --index-subset 2"
# Stage 1: fetch all archives (resumed/skipped if DOWNLOADS_READY exists).
if [ ! -f DOWNLOADS_READY ]; then
# new prebuilt GPU+CPU databases, that don't require calling tsv2exprofiledb
downloadFile "https://opendata.mmseqs.org/colabfold/${UNIREF30DB}.db.tar.gz" "${UNIREF30DB}.tar.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/${CFDB}.db.tar.gz" "${CFDB}.tar.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/uniref30_2302_newtaxonomy.tar.gz" "uniref30_2302_newtaxonomy.tar.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/pdb100_230517.fasta.gz" "pdb100_230517.fasta.gz"
downloadFile "https://opendata.mmseqs.org/colabfold/pdb100_foldseek_230517.tar.gz" "pdb100_foldseek_230517.tar.gz"
touch DOWNLOADS_READY
fi
# Stage 2: mirror PDB mmCIF files from the local cluster mirror rather
# than downloading them from wwPDB.
if [ ! -f PDB_MMCIF_READY ]; then
mkdir -p pdb/divided
mkdir -p pdb/obsolete
rsync -av --delete /cluster/gjb_lab/db/NOBACK/mirrors/pdb/data/structures/divided/mmCIF pdb/divided
rsync -av --delete /cluster/gjb_lab/db/NOBACK/mirrors/pdb/data/structures/obsolete/mmCIF pdb/obsolete
touch PDB_MMCIF_READY
fi
# Stage 3: unpack UniRef30, build its index, and patch in the rebuilt
# taxonomy/mapping files.
if [ ! -f UNIREF30_READY ]; then
tar -xzvf "${UNIREF30DB}.tar.gz"
mmseqs createindex "${UNIREF30DB}_db" tmp1 --remove-tmp-files 1 ${GPU_INDEX_PAR}
# replace mapping and taxonomy with rebuilt versions, see:
# https://github.com/sokrypton/ColabFold/wiki/MSA-Server-Database-History#2025-08-04-updated-uniref100_2302-taxonomypairing-files
if [ -e "uniref30_2302_newtaxonomy.tar.gz" ]; then
tar -xzvf "uniref30_2302_newtaxonomy.tar.gz"
fi
if [ -e ${UNIREF30DB}_db_mapping ]; then
# create binary, mmap-able taxonomy mapping, saves a few seconds of load time during pairing
TAXHEADER=$(od -An -N4 -t x4 "${UNIREF30DB}_db_mapping" | tr -d ' ')
# check if the file is already binary, it has a binary-encoded header that spells TAXM if so
if [ "${TAXHEADER}" != "0c170013" ]; then
mmseqs createbintaxmapping "${UNIREF30DB}_db_mapping" "${UNIREF30DB}_db_mapping.bin"
mv -f -- "${UNIREF30DB}_db_mapping.bin" "${UNIREF30DB}_db_mapping"
fi
# expose mapping/taxonomy under the .idx_* names the index expects
ln -sf ${UNIREF30DB}_db_mapping ${UNIREF30DB}_db.idx_mapping
fi
if [ -e ${UNIREF30DB}_db_taxonomy ]; then
ln -sf ${UNIREF30DB}_db_taxonomy ${UNIREF30DB}_db.idx_taxonomy
fi
touch UNIREF30_READY
fi
# Stage 4: unpack and index the ColabFold environmental database.
if [ ! -f COLABDB_READY ]; then
tar -xzvf "${CFDB}.tar.gz"
mmseqs createindex "${CFDB}_db" tmp2 --remove-tmp-files 1 ${GPU_INDEX_PAR}
touch COLABDB_READY
fi
# Stage 5: build the pdb100 sequence database (padded variant for GPU).
if [ ! -f PDB_READY ]; then
# for consistency with the other prebuilt databases
# make pdb also compatible with both gpu and cpu
if [ -n "${GPU}" ] || [ "${FAST_PREBUILT_DATABASES}" = "1" ]; then
mmseqs createdb pdb100_230517.fasta.gz pdb100_230517_tmp
mmseqs makepaddedseqdb pdb100_230517_tmp pdb100_230517
mmseqs rmdb pdb100_230517_tmp
else
mmseqs createdb pdb100_230517.fasta.gz pdb100_230517
fi
touch PDB_READY
fi
# Stage 6: extract only the a3m ffindex pair from the foldseek archive.
if [ ! -f PDB100_READY ]; then
tar -xzvf pdb100_foldseek_230517.tar.gz pdb100_a3m.ffdata pdb100_a3m.ffindex
touch PDB100_READY
fi
# Clean up archives and mmseqs temp dirs once everything is built.
rm -f "${UNIREF30DB}.tar.gz"
rm -f "${CFDB}.tar.gz"
rm -f "pdb100_230517.fasta.gz"
rm -f "pdb100_foldseek_230517.tar.gz"
rm -f "uniref30_2302_newtaxonomy.tar.gz"
rm -rf tmp*
}
######################################################################
#
# create_wrapper
#
# Writes a self-contained job script (into $TMPDIR) that rsyncs the
# colabfold database tree from one node to another, for submission
# with qsub. The conda activation step makes this too complex for
# 'qsub -b y', hence a generated wrapper script.
#
# Required parameters:
# source_node: hostname to sync from
# target_node: hostname to sync to
# hold: jid to submit hold_jid for (may be empty)
#
# Returns:
# path to wrapper script (on stdout)
#
######################################################################
create_wrapper() {
local src_host=$1
local dst_host=$2
local hold_jid=$3
local hold_directive=""
if [[ -n "$hold_jid" ]]; then
hold_directive="## -hold_jid $hold_jid"
fi
local out_script="${TMPDIR}/sync_${dst_host}.sh"
# SGE directives are emitted as '##' (not '#$') so this generator can
# itself run under qsub; they are rewritten to '#$' by the sed below.
cat > "$out_script" <<EOF
#!/bin/env bash
## -mods l_hard hostname ${dst_host}
## -N colabfold_mirror
## -j y
## -jc rhel9
## -o $HOME/colabfold_db_logs/\$JOB_NAME.\$JOB_ID
$hold_directive
source ~/miniconda3/etc/profile.d/conda.sh
conda activate mmseqs2
rsync -e 'ssh -oStrictHostKeyChecking=no' --rsync-path=$CONDA_PREFIX/bin/rsync --delete -av $src_host:/opt/data/colabfold/ /opt/data/colabfold
find /opt/data/colabfold -type d -exec chmod 0755 {} \;
find /opt/data/colabfold -type f -exec chmod 0644 {} \;
EOF
# Turn the placeholder '##' markers into real '#$' SGE directives.
sed -i 's/##/#$/' "$out_script"
printf '%s\n' "$out_script"
}
######################################################################
#
# distribute_db
#
# Shares database across appropriate GPU nodes via qsubbed rsync jobs
# The database is pulled onto two nodes for each source node, so
# the initial transfer is submitted to run immediately, but subsequent
# dependent jobs which require a previous transfer to complete are
# submitted with '-hold_jid' so they will not run until the database has
# mirrored to their source node
#
# Required args: None
# Globals read: NODES
# Returns: None
#
######################################################################
distribute_db() {
# We need to exlude the current host since we already have the database
# and are going to use this as a starting point...
source_node=$(hostname -s)
cur_node=$(echo $source_node|sed -r 's/gpu-([0-9]+)/\1/')
# Pattern-substitution blanks the current node's entry rather than
# removing it; the empty element disappears later when the array is
# expanded unquoted in the for loops below.
nodes=( "${NODES[@]/$cur_node}" )
# Now we need to reorder the node list so that any nodes which are down
# appear last on the list, then the jobs assigned to them will hold until
# the nodes return, rather than having live nodes waiting on a job to complete
# on a node which is down
declare -a bad_nodes
for node in ${nodes[@]}; do
nodename=$(printf "gpu-%s.compute.dundee.ac.uk" "$node")
# qhost will return '-' in NLOAD field if a node is offline or uncommunicative...
status=$( qhost -h $nodename |tail -n +4|awk '{print $7}')
if [[ "$status" == '-' ]]; then
echo "Warning: $nodename is down..."
bad_nodes+=($node)
fi
done
# remove 'bad nodes' from the list and append them again
# at the end to ensure they are not dependancies for other jobs
for bad_node in ${bad_nodes[@]}; do
nodes=( "${nodes[@]/$bad_node}" )
nodes+=($bad_node)
done
declare -a submitted_nodes # list of nodes for which jobs have been prepared
declare -A node_jobs # associative array mapping node name to job id, for determining jid to hold
# 'submission' tracks the number of jobs submitted from each node
submission=-1
# sub_level tracks index of submitted nodes to identify correct source node
sub_level=1
for node in ${nodes[@]};do
submission=$((submission+1))
target_node=$(printf "gpu-%s.compute.dundee.ac.uk" "$node")
submitted_nodes+=($target_node)
# change the source node when we have prepared jobs for two nodes...
if [[ "$submission" == 2 ]]; then
submission=0
source_node=${submitted_nodes[0]}
# NOTE(review): this assignment collapses the submitted_nodes array to
# a single scalar (and ':-1' defaults it to '1' when the index is
# empty) -- confirm this is the intended fan-out bookkeeping and not
# meant to be an array slice.
submitted_nodes=$(echo ${submitted_nodes[$sub_level]:-1})
sub_level=$(($sub_level+1))
fi
echo "$source_node -> $target_node"
# hold on the job that populates our source node (empty for the first
# two transfers, which pull directly from the build host)
hold=${node_jobs[$source_node]}
script=$(create_wrapper $source_node $target_node $hold)
# 'return' here is an ordinary variable holding qsub's output line,
# e.g. "Your job 12345 (...) has been submitted"; field 3 is the jid.
return=$(qsub $script)
job_id=$(echo $return|cut -f3 -d' ')
echo "job_id=$job_id"
node_jobs[$target_node]=$job_id
done
}
# Build the databases on this node; distribution to the other A40 nodes
# is currently disabled -- uncomment once the local build is verified.
download_colabfold_db
#distribute_db