Skip to content

Commit 07644a8

Browse files
add environmental pairing feature (sokrypton#614)
* add environmental pairing feature
* Refactor env pairing

---------

Co-authored-by: Milot Mirdita <[email protected]>
1 parent 1653605 commit 07644a8

File tree

1 file changed

+44
-11
lines changed

1 file changed

+44
-11
lines changed

colabfold/mmseqs/search.py

Lines changed: 44 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -176,7 +176,9 @@ def mmseqs_search_pair(
176176
dbbase: Path,
177177
base: Path,
178178
uniref_db: Path = Path("uniref30_2302_db"),
179+
spire_db: Path = Path("spire_ctg10_2401_db"),
179180
mmseqs: Path = Path("mmseqs"),
181+
pair_env: bool = True,
180182
prefilter_mode: int = 0,
181183
s: float = 8,
182184
threads: int = 64,
@@ -200,6 +202,13 @@ def mmseqs_search_pair(
200202
dbSuffix1 = ".idx"
201203
dbSuffix2 = ".idx"
202204

205+
if pair_env:
206+
db = spire_db
207+
output = ".env.paired.a3m"
208+
else:
209+
db = uniref_db
210+
output = ".paired.a3m"
211+
203212
# fmt: off
204213
# @formatter:off
205214
search_param = ["--num-iterations", "3", "--db-load-mode", str(db_load_mode), "-a", "-e", "0.1", "--max-seqs", "10000",]
@@ -209,16 +218,14 @@ def mmseqs_search_pair(
209218
else:
210219
search_param += ["--k-score", "'seq:96,prof:80'"]
211220
expand_param = ["--expansion-mode", "0", "-e", "inf", "--expand-filter-clusters", "0", "--max-seq-id", "0.95",]
212-
run_mmseqs(mmseqs, ["search", base.joinpath("qdb"), dbbase.joinpath(uniref_db), base.joinpath("res"), base.joinpath("tmp"), "--threads", str(threads),] + search_param,)
213-
run_mmseqs(mmseqs, ["expandaln", base.joinpath("qdb"), dbbase.joinpath(f"{uniref_db}{dbSuffix1}"), base.joinpath("res"), dbbase.joinpath(f"{uniref_db}{dbSuffix2}"), base.joinpath("res_exp"), "--db-load-mode", str(db_load_mode), "--threads", str(threads),] + expand_param,)
214-
run_mmseqs(mmseqs, ["align", base.joinpath("qdb"), dbbase.joinpath(f"{uniref_db}{dbSuffix1}"), base.joinpath("res_exp"), base.joinpath("res_exp_realign"), "--db-load-mode", str(db_load_mode), "-e", "0.001", "--max-accept", "1000000", "--threads", str(threads), "-c", "0.5", "--cov-mode", "1",],)
215-
run_mmseqs(mmseqs, ["pairaln", base.joinpath("qdb"), dbbase.joinpath(f"{uniref_db}"), base.joinpath("res_exp_realign"), base.joinpath("res_exp_realign_pair"), "--db-load-mode", str(db_load_mode), "--pairing-mode", str(pairing_strategy), "--pairing-dummy-mode", "0", "--threads", str(threads), ],)
216-
run_mmseqs(mmseqs, ["align", base.joinpath("qdb"), dbbase.joinpath(f"{uniref_db}{dbSuffix1}"), base.joinpath("res_exp_realign_pair"), base.joinpath("res_exp_realign_pair_bt"), "--db-load-mode", str(db_load_mode), "-e", "inf", "-a", "--threads", str(threads), ],)
217-
run_mmseqs(mmseqs, ["pairaln", base.joinpath("qdb"), dbbase.joinpath(f"{uniref_db}"), base.joinpath("res_exp_realign_pair_bt"), base.joinpath("res_final"), "--db-load-mode", str(db_load_mode), "--pairing-mode", str(pairing_strategy), "--pairing-dummy-mode", "1", "--threads", str(threads),],)
218-
run_mmseqs(mmseqs, ["result2msa", base.joinpath("qdb"), dbbase.joinpath(f"{uniref_db}{dbSuffix1}"), base.joinpath("res_final"), base.joinpath("pair.a3m"), "--db-load-mode", str(db_load_mode), "--msa-format-mode", "5", "--threads", str(threads),],)
219-
run_mmseqs(mmseqs, ["unpackdb", base.joinpath("pair.a3m"), base.joinpath("."), "--unpack-name-mode", "0", "--unpack-suffix", ".paired.a3m",],)
220-
run_mmseqs(mmseqs, ["rmdb", base.joinpath("qdb")])
221-
run_mmseqs(mmseqs, ["rmdb", base.joinpath("qdb_h")])
221+
run_mmseqs(mmseqs, ["search", base.joinpath("qdb"), dbbase.joinpath(db), base.joinpath("res"), base.joinpath("tmp"), "--threads", str(threads),] + search_param,)
222+
run_mmseqs(mmseqs, ["expandaln", base.joinpath("qdb"), dbbase.joinpath(f"{db}{dbSuffix1}"), base.joinpath("res"), dbbase.joinpath(f"{db}{dbSuffix2}"), base.joinpath("res_exp"), "--db-load-mode", str(db_load_mode), "--threads", str(threads),] + expand_param,)
223+
run_mmseqs(mmseqs, ["align", base.joinpath("qdb"), dbbase.joinpath(f"{db}{dbSuffix1}"), base.joinpath("res_exp"), base.joinpath("res_exp_realign"), "--db-load-mode", str(db_load_mode), "-e", "0.001", "--max-accept", "1000000", "--threads", str(threads), "-c", "0.5", "--cov-mode", "1",],)
224+
run_mmseqs(mmseqs, ["pairaln", base.joinpath("qdb"), dbbase.joinpath(f"{db}"), base.joinpath("res_exp_realign"), base.joinpath("res_exp_realign_pair"), "--db-load-mode", str(db_load_mode), "--pairing-mode", str(pairing_strategy), "--pairing-dummy-mode", "0", "--threads", str(threads), ],)
225+
run_mmseqs(mmseqs, ["align", base.joinpath("qdb"), dbbase.joinpath(f"{db}{dbSuffix1}"), base.joinpath("res_exp_realign_pair"), base.joinpath("res_exp_realign_pair_bt"), "--db-load-mode", str(db_load_mode), "-e", "inf", "-a", "--threads", str(threads), ],)
226+
run_mmseqs(mmseqs, ["pairaln", base.joinpath("qdb"), dbbase.joinpath(f"{db}"), base.joinpath("res_exp_realign_pair_bt"), base.joinpath("res_final"), "--db-load-mode", str(db_load_mode), "--pairing-mode", str(pairing_strategy), "--pairing-dummy-mode", "1", "--threads", str(threads),],)
227+
run_mmseqs(mmseqs, ["result2msa", base.joinpath("qdb"), dbbase.joinpath(f"{db}{dbSuffix1}"), base.joinpath("res_final"), base.joinpath("pair.a3m"), "--db-load-mode", str(db_load_mode), "--msa-format-mode", "5", "--threads", str(threads),],)
228+
run_mmseqs(mmseqs, ["unpackdb", base.joinpath("pair.a3m"), base.joinpath("."), "--unpack-name-mode", "0", "--unpack-suffix", output,],)
222229
run_mmseqs(mmseqs, ["rmdb", base.joinpath("res")])
223230
run_mmseqs(mmseqs, ["rmdb", base.joinpath("res_exp")])
224231
run_mmseqs(mmseqs, ["rmdb", base.joinpath("res_exp_realign")])
@@ -230,7 +237,6 @@ def mmseqs_search_pair(
230237
# @formatter:on
231238
# fmt: on
232239

233-
234240
def main():
235241
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
236242
parser.add_argument(
@@ -271,11 +277,15 @@ def main():
271277
default=Path("colabfold_envdb_202108_db"),
272278
help="Environmental database",
273279
)
280+
parser.add_argument("--db4", type=Path, default=Path("spire_ctg10_2401_db"), help="Environmental pairing database")
274281

275282
# poor man's boolean arguments
276283
parser.add_argument(
277284
"--use-env", type=int, default=1, choices=[0, 1], help="Use --db3"
278285
)
286+
parser.add_argument(
287+
"--use-env-pairing", type=int, default=0, choices=[0, 1], help="Use --db4"
288+
)
279289
parser.add_argument(
280290
"--use-templates", type=int, default=0, choices=[0, 1], help="Use --db2"
281291
)
@@ -418,7 +428,22 @@ def main():
418428
db_load_mode=args.db_load_mode,
419429
threads=args.threads,
420430
pairing_strategy=args.pairing_strategy,
431+
pair_env=False,
421432
)
433+
if args.use_env_pairing:
434+
mmseqs_search_pair(
435+
mmseqs=args.mmseqs,
436+
dbbase=args.dbbase,
437+
base=args.base,
438+
uniref_db=args.db1,
439+
spire_db=args.db4,
440+
prefilter_mode=args.prefilter_mode,
441+
s=args.s,
442+
db_load_mode=args.db_load_mode,
443+
threads=args.threads,
444+
pairing_strategy=args.pairing_strategy,
445+
pair_env=True,
446+
)
422447

423448
id = 0
424449
for job_number, (
@@ -434,6 +459,14 @@ def main():
434459
with args.base.joinpath(f"{id}.a3m").open("r") as f:
435460
unpaired_msa.append(f.read())
436461
args.base.joinpath(f"{id}.a3m").unlink()
462+
463+
if args.use_env_pairing:
464+
with open(args.base.joinpath(f"{id}.paired.a3m"), 'a') as file_pair:
465+
with open(args.base.joinpath(f"{id}.env.paired.a3m"), 'r') as file_pair_env:
466+
while chunk := file_pair_env.read(10 * 1024 * 1024):
467+
file_pair.write(chunk)
468+
args.base.joinpath(f"{id}.env.paired.a3m").unlink()
469+
437470
if len(query_seqs_cardinality) > 1:
438471
with args.base.joinpath(f"{id}.paired.a3m").open("r") as f:
439472
paired_msa.append(f.read())

0 commit comments

Comments
 (0)