Skip to content

Commit 5973fae

Browse files
authored
Merge pull request #375 from PaddlePaddle/msa_patch
fix(msa): fix typo for readme and remove unused parameters of genetic databases.
2 parents 22c8450 + f96cf93 commit 5973fae

File tree

7 files changed

+16
-31
lines changed

7 files changed

+16
-31
lines changed

apps/protein_folding/helixfold3/README.md

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ HelixFold3 depends on [PaddlePaddle](https://github.com/paddlepaddle/paddle). Py
6262
is provided in `requirements.txt`. `kalign`, the [`HH-suite`](https://github.com/soedinglab/hh-suite) and `jackhmmer` are
6363
also needed to produce multiple sequence alignments. The download scripts require `aria2c`.
6464

65-
Locate to the directory of `helixfold` then run:
65+
Navigate to the `helixfold3` directory, then run:
6666

6767
```bash
6868
# install msa env
@@ -89,15 +89,6 @@ please place the downloaded checkpoint in ```./init_models/ ```directory.
8989

9090
The script `scripts/download_all_data.sh` can be used to download and set up all genetic databases with the following configs:
9191

92-
* By default:
93-
94-
```bash
95-
scripts/download_all_data.sh ./data
96-
```
97-
98-
will download the complete databases. The total download size for the complete databases is around 415 GB,
99-
and the total size when unzipped is 2.2 TB.
100-
10192
* With `reduced_dbs`:
10293

10394
```bash
@@ -107,6 +98,10 @@ The script `scripts/download_all_data.sh` can be used to download and set up all
10798
will download a reduced version of the databases to be used with the `reduced_dbs` preset. The total download
10899
size for the reduced databases is around 190 GB, and the total unzipped size is around 530 GB.
109100

101+
* With `full_dbs`:
102+
103+
NOTE: ***Support for full_dbs is not available yet and will be introduced in a future update.***
104+
110105
#### 🤔 Understanding Model Input
111106

112107
There are some demo inputs under `./data/` for testing and reference. Data input is in the form of JSON containing several entities such as `protein`, `ligand`, `dna`, `rna` and `ion`. Protein and nucleic acid inputs are given as their sequences.
@@ -185,9 +180,7 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
185180
--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
186181
--nhmmer_binary_path "$ENV_BIN/nhmmer" \
187182
--preset='reduced_dbs' \
188-
--bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
189183
--reduced_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
190-
--uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
191184
--uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
192185
--pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
193186
--uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \

apps/protein_folding/helixfold3/inference.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -133,8 +133,7 @@ def get_msa_templates_pipeline(args: argparse.Namespace) -> Dict:
133133
if use_reduced_bfd:
134134
assert args.reduced_bfd_database_path is not None
135135
else:
136-
assert args.bfd_database_path is not None
137-
assert args.uniclust30_database_path is not None
136+
raise NotImplementedError("Full dbs is not supported yet.")
138137

139138
template_searcher = hmmsearch.Hmmsearch(
140139
binary_path=args.hmmsearch_binary_path,
@@ -154,8 +153,8 @@ def get_msa_templates_pipeline(args: argparse.Namespace) -> Dict:
154153
hhblits_binary_path=args.hhblits_binary_path,
155154
uniref90_database_path=args.uniref90_database_path,
156155
mgnify_database_path=args.mgnify_database_path,
157-
bfd_database_path=args.bfd_database_path,
158-
uniclust30_database_path=args.uniclust30_database_path,
156+
bfd_database_path=None,
157+
uniclust30_database_path=None,
159158
reduced_bfd_database_path=args.reduced_bfd_database_path,
160159
uniprot_database_path=args.uniprot_database_path,
161160
template_searcher=template_searcher,
@@ -607,14 +606,9 @@ def main(args):
607606
default=None, required=True,
608607
help='Path to the MGnify database for use by '
609608
'JackHMMER.')
610-
parser.add_argument('--bfd_database_path', type=str, default=None,
611-
help='Path to the BFD database for use by HHblits.')
612609
parser.add_argument('--reduced_bfd_database_path', type=str, default=None,
613610
help='Path to the reduced version of BFD used '
614611
'with the "reduced_dbs" preset.')
615-
parser.add_argument('--uniclust30_database_path', type=str, default=None,
616-
help='Path to the Uniclust30 database for use '
617-
'by HHblits.')
618612
# RNA MSA searching databases
619613
parser.add_argument('--rfam_database_path', type=str,
620614
default=None, required=True,

apps/protein_folding/helixfold3/run_infer.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
1313
--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
1414
--nhmmer_binary_path "$ENV_BIN/nhmmer" \
1515
--preset='reduced_dbs' \
16-
--bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
1716
--reduced_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
18-
--uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
1917
--uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
2018
--pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
2119
--uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \

apps/protein_folding/helixfold3/scripts/download_mgnify.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ BASENAME=$(basename "${SOURCE_URL}")
2222
mkdir --parents "${ROOT_DIR}"
2323
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
2424
pushd "${ROOT_DIR}"
25-
gunzip "${ROOT_DIR}/${BASENAME}"
25+
gunzip "./${BASENAME}"
2626
popd

apps/protein_folding/helixfold3/scripts/download_small_bfd.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@ BASENAME=$(basename "${SOURCE_URL}")
2020
mkdir --parents "${ROOT_DIR}"
2121
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
2222
pushd "${ROOT_DIR}"
23-
gunzip "${ROOT_DIR}/${BASENAME}"
23+
gunzip "./${BASENAME}"
2424
popd

apps/protein_folding/helixfold3/scripts/download_uniprot.sh

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,11 @@ mkdir --parents "${ROOT_DIR}"
2727
aria2c "${TREMBL_SOURCE_URL}" --dir="${ROOT_DIR}"
2828
aria2c "${SPROT_SOURCE_URL}" --dir="${ROOT_DIR}"
2929
pushd "${ROOT_DIR}"
30-
gunzip "${ROOT_DIR}/${TREMBL_BASENAME}"
31-
gunzip "${ROOT_DIR}/${SPROT_BASENAME}"
30+
gunzip "./${TREMBL_BASENAME}"
31+
gunzip "./${SPROT_BASENAME}"
3232

3333
# Concatenate TrEMBL and SwissProt, rename to uniprot and clean up.
34-
cat "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}" >> "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}"
35-
mv "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}" "${ROOT_DIR}/uniprot.fasta"
36-
rm "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}"
34+
cat "./${SPROT_UNZIPPED_BASENAME}" >> "./${TREMBL_UNZIPPED_BASENAME}"
35+
mv "./${TREMBL_UNZIPPED_BASENAME}" "./uniprot.fasta"
36+
rm "./${SPROT_UNZIPPED_BASENAME}"
3737
popd

apps/protein_folding/helixfold3/scripts/download_uniref90.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@ BASENAME=$(basename "${SOURCE_URL}")
2020
mkdir --parents "${ROOT_DIR}"
2121
aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
2222
pushd "${ROOT_DIR}"
23-
gunzip "${ROOT_DIR}/${BASENAME}"
23+
gunzip "./${BASENAME}"
2424
popd

0 commit comments

Comments
 (0)