steineggerlab
diff --git a/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 16 additions & 5 deletions b/‎README.md‎
Lines changed: 16 additions & 5 deletions
diff --git a/‎lib/fasta_validator‎ b/‎lib/fasta_validator‎
diff --git a/‎lib/fastq_utils/CMakeLists.txt‎
Lines changed: 9 additions & 0 deletions b/‎lib/fastq_utils/CMakeLists.txt‎
Lines changed: 9 additions & 0 deletions
@@ -5,3 +5,6 @@
 [submodule "lib/mmseqs"]
 	path = lib/mmseqs
 	url = https://github.com/jaebeom-kim/MMseqs2.git
+[submodule "lib/fasta_validator"]
+	path = lib/fasta_validator
+	url = https://github.com/jaebeom-kim/fasta_validator.git
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
 # keep old policy for setting implicit link libraries
 # zlib causes issues in static builds otherwise
 cmake_policy(SET CMP0060 OLD)
-project(metabuli CXX)
+project(metabuli CXX C)
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/lib/mmseqs/cmake")
@@ -15,6 +15,8 @@ set(FRAMEWORK_ONLY 1 CACHE INTERNAL "" FORCE)
 include(MMseqsSetupDerivedTarget)
 add_subdirectory(lib/mmseqs)
 add_subdirectory(lib/prodigal)
+add_subdirectory(lib/fasta_validator)
+add_subdirectory(lib/fastq_utils)
 add_subdirectory(src)
 add_subdirectory(data)
 include_directories(lib)
 
@@ -143,6 +143,8 @@ Downloaded files are stored in `OUTDIR/DB_NAME` directory, which can be provided
 ---
 
 ## Classification
+> [!NOTE] 
+> We recommend running software such as `fastp` or `fastplong` to remove adapters and low-quality reads before classification.
 ```
 metabuli classify <i:FASTA/Q> <i:DBDIR> <o:OUTDIR> <Job ID> [options]
 - INPUT : FASTA/Q file of reads you want to classify. (gzip supported)
@@ -160,6 +162,7 @@ metabuli classify --seq-mode 1 read.fna dbdir outdir jobid
 metabuli classify --seq-mode 3 read.fna dbdir outdir jobid
 
   * Important parameters:
+   --validate-input : Validate query file format (0 by default)
    --threads : The number of threads used (all by default)
    --max-ram : The maximum RAM usage. (128 GiB by default)
    --min-score : The minimum score to be classified 
@@ -241,13 +244,15 @@ metabuli classifiedRefiner <i:read-by-read classification> <i:DBDIR> [options]
 
 * Options
    --threads : The number of threads to utilize (all by default)
+   --min-score : Remove classifications with score below this value
    --remove-unclassified : Remove unclassified reads
-   --exclude-taxid : Remove list of taxids as well as its children (e.g., 1758,9685,1234)
-   --select-taxid : Select list of taxids as well as its children (e.g., 1758,9685,1234)
+   --exclude-taxid : Remove list of taxids as well as its children (e.g., 1758,9685)
+   --select-taxid : Select list of taxids as well as its children (e.g., 1758,9685)
    --select-columns : Select list of columns with number and handle full lineage as 7 (generated if absent) (e.g., 2,5,7,3)
-   --report : Write report of refined classification file
+   --report : Write report of refined classification results
    --rank : Adjust classification to the specified rank
-   --rank-file-type : Choose how to handle reads assigned to higher taxonomic ranks when using the --rank option. [0: exclude higher rank, 1: include higher rank, 2: make separate file for higher rank classification]
+   --rank-file-type : Choose how to handle classifications at higher ranks when using the --rank option.
+        [0: exclude them, 1: include them, 2: make separate file for them]
 
 ```
 #### Output
@@ -307,6 +312,7 @@ metabuli build --gtdb 1 <DBDIR> <FASTA_LIST> <GTDB_TAXDUMP/taxid.map> --taxonomy
    --max-ram : The maximum RAM usage. (128 GiB by default)
    --accession-level : Set 1 to create a DB for accession level classification (0 by default).
    --cds-info : List of absolute paths to CDS files.
+   --validate-input : Validate FASTA file format (0 by default)
   
 ```
 This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete `*_diffIdx` and `*_info` files.
@@ -333,6 +339,7 @@ metabuli updateDB --gtdb 1 <NEW DBDIR> <FASTA_LIST> <GTDB_TAXDUMP/taxid.map> <OL
   --max-ram: The maximum RAM usage. (128 GiB by default)
   --accession-level: Set 1 to add new sequences for accession level classification (0 by default).
   --cds-info: List of absolute paths to CDS files.
+  --validate-input : Validate FASTA file format (0 by default)
 ```
 
 #### \<Add sequences of new taxa>
@@ -421,6 +428,7 @@ metabuli build <DBDIR> <FASTA_LIST> <accession2taxid> --taxonomy-path <TAXDUMP>
   --max-ram: The maximum RAM usage. (128 GiB by default)
   --accession-level: Set 1 to create a DB for accession level classification (0 by default).
   --cds-info: List of absolute paths to CDS files.
+  --validate-input : Validate FASTA file format (0 by default)
 ```
 This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete `*_diffIdx` and `*_info` files and `DATE-TIME` folder (e.g., `2025-1-24-10-32`) if generated.
 
@@ -456,6 +464,7 @@ metabuli updateDB <NEW DBDIR> <FASTA_LIST> <accession2taxid> <OLD DBDIR> [option
   --accession-level : Set 1 to create a DB for accession level classification (0 by default).
   --make-library : Make species library for faster execution (1 by default).
   --new-taxa : List of new taxa to be added.
+  --validate-input : Validate FASTA file format (0 by default)
 ```
 
 #### \<Add sequences of new taxa> - Please refer [this section](#add-sequences-of-new-taxa).
@@ -489,4 +498,6 @@ fasterq-dump --split-files SRR14484345
   ```
 
 ## Reference
-Shen, W., Ren, H., TaxonKit: a practical and efficient NCBI Taxonomy toolkit, Journal of Genetics and Genomics, https://doi.org/10.1016/j.jgg.2021.03.006
+- **Taxonomy dump**: [Shen W, Ren H. TaxonKit: a practical and efficient NCBI Taxonomy toolkit. Journal of Genetics and Genomics (2021).](https://doi.org/10.1016/j.jgg.2021.03.006)
+- **FASTA format validation**: [Edwards R.A. fasta_validate: a fast and efficient fasta validator written in pure C. Zenodo.](https://doi.org/10.5281/zenodo.2532044) 
+- **FASTQ format validation**: [Fonseca N, Manning J. nunofonseca/fastq_utils: 0.25.2. Zenodo.](https://doi.org/10.5281/zenodo.7755574)
@@ -0,0 +1,9 @@
+# Build the bundled fastq_utils sources as the `fastq_utils` library target.
+add_library(fastq_utils
+    fastq_info.cpp hash.cpp hash.h
+    fastq.cpp fastq.h
+)
+
+# MMSEQS_CXX_FLAGS is defined outside this file; -w is appended to silence compiler warnings.
+set_property(TARGET fastq_utils PROPERTY COMPILE_FLAGS "${MMSEQS_CXX_FLAGS} -w")
+set_property(TARGET fastq_utils PROPERTY LINK_FLAGS "${MMSEQS_CXX_FLAGS} -w")