Skip to content

Commit ad5cf6b

Browse files
authored
Merge pull request #143 from jaebeom-kim/master
Input FASTA/FASTQ format validation option
2 parents 25c0961 + 70c258a commit ad5cf6b

21 files changed

+2077
-18
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,6 @@
55
[submodule "lib/mmseqs"]
66
path = lib/mmseqs
77
url = https://github.com/jaebeom-kim/MMseqs2.git
8+
[submodule "lib/fasta_validator"]
9+
path = lib/fasta_validator
10+
url = https://github.com/jaebeom-kim/fasta_validator.git

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.15 FATAL_ERROR)
22
# keep old policy for setting implicit link libraries
33
# zlib causes issues in static builds otherwise
44
cmake_policy(SET CMP0060 OLD)
5-
project(metabuli CXX)
5+
project(metabuli CXX C)
66
set(CMAKE_CXX_STANDARD 14)
77
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
88
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/lib/mmseqs/cmake")
@@ -15,6 +15,8 @@ set(FRAMEWORK_ONLY 1 CACHE INTERNAL "" FORCE)
1515
include(MMseqsSetupDerivedTarget)
1616
add_subdirectory(lib/mmseqs)
1717
add_subdirectory(lib/prodigal)
18+
add_subdirectory(lib/fasta_validator)
19+
add_subdirectory(lib/fastq_utils)
1820
add_subdirectory(src)
1921
add_subdirectory(data)
2022
include_directories(lib)

README.md

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ Downloaded files are stored in `OUTDIR/DB_NAME` directory, which can be provided
143143
---
144144

145145
## Classification
146+
> [!NOTE]
147+
> We recommend running software like `fastp` or `fastplong` to remove adapters and low-quality reads before classification.
146148
```
147149
metabuli classify <i:FASTA/Q> <i:DBDIR> <o:OUTDIR> <Job ID> [options]
148150
- INPUT : FASTA/Q file of reads you want to classify. (gzip supported)
@@ -160,6 +162,7 @@ metabuli classify --seq-mode 1 read.fna dbdir outdir jobid
160162
metabuli classify --seq-mode 3 read.fna dbdir outdir jobid
161163
162164
* Important parameters:
165+
--validate-input : Validate query file format (0 by default)
163166
--threads : The number of threads used (all by default)
164167
--max-ram : The maximum RAM usage. (128 GiB by default)
165168
--min-score : The minimum score to be classified
@@ -241,13 +244,15 @@ metabuli classifiedRefiner <i:read-by-read classification> <i:DBDIR> [options]
241244
242245
* Options
243246
--threads : The number of threads to utilize (all by default)
247+
--min-score : Remove classifications with score below this value
244248
--remove-unclassified : Remove unclassified reads
245-
--exclude-taxid : Remove list of taxids as well as its children (e.g., 1758,9685,1234)
246-
--select-taxid : Select list of taxids as well as its children (e.g., 1758,9685,1234)
249+
--exclude-taxid : Remove list of taxids as well as its children (e.g., 1758,9685)
250+
--select-taxid : Select list of taxids as well as its children (e.g., 1758,9685)
247251
--select-columns : Select list of columns with number and handle full lineage as 7 (generated if absent) (e.g., 2,5,7,3)
248-
--report : Write report of refined classification file
252+
--report : Write report of refined classification results
249253
--rank : Adjust classification to the specified rank
250-
--rank-file-type : Choose how to handle reads assigned to higher taxonomic ranks when using the --rank option. [0: exclude higher rank, 1: include higher rank, 2: make separate file for higher rank classification]
254+
--rank-file-type : Choose how to handle classifications at higher ranks when using --rank option.
255+
[0: exclude them, 1: include them, 2: make separate file for them]
251256
252257
```
253258
#### Output
@@ -307,6 +312,7 @@ metabuli build --gtdb 1 <DBDIR> <FASTA_LIST> <GTDB_TAXDUMP/taxid.map> --taxonomy
307312
--max-ram : The maximum RAM usage. (128 GiB by default)
308313
--accession-level : Set 1 to create a DB for accession level classification (0 by default).
309314
--cds-info : List of absolute paths to CDS files.
315+
--validate-input : Validate FASTA file format (0 by default)
310316
311317
```
312318
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete `*_diffIdx` and `*_info` files.
@@ -333,6 +339,7 @@ metabuli updateDB --gtdb 1 <NEW DBDIR> <FASTA_LIST> <GTDB_TAXDUMP/taxid.map> <OL
333339
--max-ram: The maximum RAM usage. (128 GiB by default)
334340
--accession-level: Set 1 to add new sequences for accession level classification (0 by default).
335341
--cds-info: List of absolute paths to CDS files.
342+
--validate-input : Validate FASTA file format (0 by default)
336343
```
337344

338345
#### \<Add sequences of new taxa>
@@ -421,6 +428,7 @@ metabuli build <DBDIR> <FASTA_LIST> <accession2taxid> --taxonomy-path <TAXDUMP>
421428
--max-ram: The maximum RAM usage. (128 GiB by default)
422429
--accession-level: Set 1 to create a DB for accession level classification (0 by default).
423430
--cds-info: List of absolute paths to CDS files.
431+
--validate-input : Validate FASTA file format (0 by default)
424432
```
425433
This will generate **diffIdx**, **info**, **split**, and **taxID_list** and some other files. You can delete `*_diffIdx` and `*_info` files and `DATE-TIME` folder (e.g., `2025-1-24-10-32`) if generated.
426434

@@ -456,6 +464,7 @@ metabuli updateDB <NEW DBDIR> <FASTA_LIST> <accession2taxid> <OLD DBDIR> [option
456464
--accession-level : Set 1 to create a DB for accession level classification (0 by default).
457465
--make-library : Make species library for faster execution (1 by default).
458466
--new-taxa : List of new taxa to be added.
467+
--validate-input : Validate FASTA file format (0 by default)
459468
```
460469
461470
#### \<Add sequences of new taxa> - Please refer to [this section](#add-sequences-of-new-taxa).
@@ -489,4 +498,6 @@ fasterq-dump --split-files SRR14484345
489498
```
490499
491500
## Reference
492-
Shen, W., Ren, H., TaxonKit: a practical and efficient NCBI Taxonomy toolkit, Journal of Genetics and Genomics, https://doi.org/10.1016/j.jgg.2021.03.006
501+
- **Taxonomy dump**: [Shen W, Ren H. TaxonKit: a practical and efficient NCBI Taxonomy toolkit. Journal of Genetics and Genomics (2021).](https://doi.org/10.1016/j.jgg.2021.03.006)
502+
- **FASTA format validation**: [Edwards R.A. fasta_validate: a fast and efficient fasta validator written in pure C. Zenodo.](https://doi.org/10.5281/zenodo.2532044)
503+
- **FASTQ format validation**: [Fonseca N, Manning J. nunofonseca/fastq_utils: 0.25.2. Zenodo.](https://doi.org/10.5281/zenodo.7755574)

lib/fasta_validator

Submodule fasta_validator added at 0beaee9

lib/fastq_utils/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
add_library(fastq_utils
2+
fastq_info.cpp
3+
hash.cpp
4+
hash.h
5+
fastq.cpp
6+
fastq.h
7+
)
8+
9+
set_target_properties(fastq_utils PROPERTIES COMPILE_FLAGS "${MMSEQS_CXX_FLAGS} -w" LINK_FLAGS "${MMSEQS_CXX_FLAGS} -w")

0 commit comments

Comments
 (0)