Merge pull request #375 from PaddlePaddle/msa_patch

leaves520 · web-flow · commit 5973fae6f0d0 · 2025-08-21T11:34:17.000+08:00
fix(msa): fix typo in README and remove unused genetic database parameters.
diff --git a/apps/protein_folding/helixfold3/README.md b/apps/protein_folding/helixfold3/README.md
@@ -62,7 +62,7 @@ HelixFold3 depends on [PaddlePaddle](https://github.com/paddlepaddle/paddle). Py
 is provided in `requirements.txt`. `kalign`, the [`HH-suite`](https://github.com/soedinglab/hh-suite) and `jackhmmer` are 
 also needed to produce multiple sequence alignments. The download scripts require `aria2c`. 
 
-Locate to the directory of `helixfold` then run:
+Navigate to the `helixfold3` directory, then run:
 
 ```bash
 # install msa env
@@ -89,15 +89,6 @@ please place the downloaded checkpoint in ```./init_models/ ```directory.
 
 The script `scripts/download_all_data.sh` can be used to download and set up all genetic databases with the following configs:
 
-*   By default:
-
-    ```bash
-    scripts/download_all_data.sh ./data
-    ```
-
-   will download the complete databases. The total download size for the complete databases is around 415 GB, 
-   and the total size when unzipped is 2.2 TB.  
-
 *   With `reduced_dbs`:
 
     ```bash
@@ -107,6 +98,10 @@ The script `scripts/download_all_data.sh` can be used to download and set up all
     will download a reduced version of the databases to be used with the `reduced_dbs` preset. The total download 
     size for the reduced databases is around 190 GB, and the total unzipped size is around 530 GB.
 
+*   With `full_dbs`:
+
+    NOTE: ***Support for `full_dbs` is not available yet and will be introduced in a future update.***
+
 #### 🤔 Understanding Model Input
 
 There are some demo input under `./data/` for your test and reference. Data input is in the form of JSON containing several entities such as `protein`, `ligand`, `dna`, `rna` and `ion`. Proteins and nucleic acids inputs are their sequence.
@@ -185,9 +180,7 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
 	--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
     --nhmmer_binary_path "$ENV_BIN/nhmmer" \
     --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
     --reduced_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
     --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
     --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
     --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
diff --git a/apps/protein_folding/helixfold3/inference.py b/apps/protein_folding/helixfold3/inference.py
@@ -133,8 +133,7 @@ def get_msa_templates_pipeline(args: argparse.Namespace) -> Dict:
     if use_reduced_bfd:
         assert args.reduced_bfd_database_path is not None
     else:
-        assert args.bfd_database_path is not None
-        assert args.uniclust30_database_path is not None
+        raise NotImplementedError("Full dbs is not supported yet.")
 
     template_searcher = hmmsearch.Hmmsearch(
         binary_path=args.hmmsearch_binary_path,
@@ -154,8 +153,8 @@ def get_msa_templates_pipeline(args: argparse.Namespace) -> Dict:
         hhblits_binary_path=args.hhblits_binary_path,
         uniref90_database_path=args.uniref90_database_path,
         mgnify_database_path=args.mgnify_database_path,
-        bfd_database_path=args.bfd_database_path,
-        uniclust30_database_path=args.uniclust30_database_path,
+        bfd_database_path=None,
+        uniclust30_database_path=None,
         reduced_bfd_database_path=args.reduced_bfd_database_path,
         uniprot_database_path=args.uniprot_database_path,
         template_searcher=template_searcher,
@@ -607,14 +606,9 @@ def main(args):
                         default=None, required=True,
                         help='Path to the MGnify database for use by '
                         'JackHMMER.')
-    parser.add_argument('--bfd_database_path', type=str, default=None,
-                        help='Path to the BFD database for use by HHblits.')
     parser.add_argument('--reduced_bfd_database_path', type=str, default=None,
                         help='Path to the reduced version of BFD used '
                         'with the "reduced_dbs" preset.')
-    parser.add_argument('--uniclust30_database_path', type=str, default=None,
-                        help='Path to the Uniclust30 database for use '
-                        'by HHblits.')
     # RNA MSA searching databases
     parser.add_argument('--rfam_database_path', type=str,
                         default=None, required=True,
diff --git a/apps/protein_folding/helixfold3/run_infer.sh b/apps/protein_folding/helixfold3/run_infer.sh
@@ -13,9 +13,7 @@ CUDA_VISIBLE_DEVICES=0 "$PYTHON_BIN" inference.py \
 	--hmmbuild_binary_path "$ENV_BIN/hmmbuild" \
     --nhmmer_binary_path "$ENV_BIN/nhmmer" \
     --preset='reduced_dbs' \
-    --bfd_database_path "$DATA_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt" \
     --reduced_bfd_database_path "$DATA_DIR/small_bfd/bfd-first_non_consensus_sequences.fasta" \
-    --uniclust30_database_path "$DATA_DIR/uniclust30/uniclust30_2018_08/uniclust30_2018_08" \
     --uniprot_database_path "$DATA_DIR/uniprot/uniprot.fasta" \
     --pdb_seqres_database_path "$DATA_DIR/pdb_seqres/pdb_seqres.txt" \
     --uniref90_database_path "$DATA_DIR/uniref90/uniref90.fasta" \
diff --git a/apps/protein_folding/helixfold3/scripts/download_mgnify.sh b/apps/protein_folding/helixfold3/scripts/download_mgnify.sh
@@ -22,5 +22,5 @@ BASENAME=$(basename "${SOURCE_URL}")
 mkdir --parents "${ROOT_DIR}"
 aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
 pushd "${ROOT_DIR}"
-gunzip "${ROOT_DIR}/${BASENAME}"
+gunzip "./${BASENAME}"
 popd
diff --git a/apps/protein_folding/helixfold3/scripts/download_small_bfd.sh b/apps/protein_folding/helixfold3/scripts/download_small_bfd.sh
@@ -20,5 +20,5 @@ BASENAME=$(basename "${SOURCE_URL}")
 mkdir --parents "${ROOT_DIR}"
 aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
 pushd "${ROOT_DIR}"
-gunzip "${ROOT_DIR}/${BASENAME}"
+gunzip "./${BASENAME}"
 popd
diff --git a/apps/protein_folding/helixfold3/scripts/download_uniprot.sh b/apps/protein_folding/helixfold3/scripts/download_uniprot.sh
@@ -27,11 +27,11 @@ mkdir --parents "${ROOT_DIR}"
 aria2c "${TREMBL_SOURCE_URL}" --dir="${ROOT_DIR}"
 aria2c "${SPROT_SOURCE_URL}" --dir="${ROOT_DIR}"
 pushd "${ROOT_DIR}"
-gunzip "${ROOT_DIR}/${TREMBL_BASENAME}"
-gunzip "${ROOT_DIR}/${SPROT_BASENAME}"
+gunzip "./${TREMBL_BASENAME}"
+gunzip "./${SPROT_BASENAME}"
 
 # Concatenate TrEMBL and SwissProt, rename to uniprot and clean up.
-cat "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}" >> "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}"
-mv "${ROOT_DIR}/${TREMBL_UNZIPPED_BASENAME}" "${ROOT_DIR}/uniprot.fasta"
-rm "${ROOT_DIR}/${SPROT_UNZIPPED_BASENAME}"
+cat "./${SPROT_UNZIPPED_BASENAME}" >> "./${TREMBL_UNZIPPED_BASENAME}"
+mv "./${TREMBL_UNZIPPED_BASENAME}" "./uniprot.fasta"
+rm "./${SPROT_UNZIPPED_BASENAME}"
 popd
diff --git a/apps/protein_folding/helixfold3/scripts/download_uniref90.sh b/apps/protein_folding/helixfold3/scripts/download_uniref90.sh
@@ -20,5 +20,5 @@ BASENAME=$(basename "${SOURCE_URL}")
 mkdir --parents "${ROOT_DIR}"
 aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}"
 pushd "${ROOT_DIR}"
-gunzip "${ROOT_DIR}/${BASENAME}"
+gunzip "./${BASENAME}"
 popd