steineggerlab
diff --git a/‎Makefile‎
Lines changed: 40 additions & 1 deletion b/‎Makefile‎
Lines changed: 40 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 147 additions & 4 deletions b/‎README.md‎
Lines changed: 147 additions & 4 deletions
@@ -12,9 +12,10 @@ LIPO ?= lipo
 endif
 
+# Platform aggregate targets: each lists the binaries bundled for that platform.
+# NOTE(review): only `mac` lists fastp/fastplong binaries; the rules for
+# resources/linux/x64/fastp and resources/win/x64/fastp are not prerequisites of
+# `linux`/`win` here — confirm whether that is intentional.
 win: resources/win/x64/${FRONTEND_APP}.bat 
-mac: resources/mac/x64/${FRONTEND_APP} resources/mac/arm64/${FRONTEND_APP} 
+mac: resources/mac/x64/${FRONTEND_APP} resources/mac/arm64/${FRONTEND_APP} resources/mac/x64/fastp resources/mac/arm64/fastp resources/mac/x64/fastplong resources/mac/arm64/fastplong
 linux: resources/linux/arm64/${FRONTEND_APP} resources/linux/x64/${FRONTEND_APP}
 
+# macOS
 resources/mac/${FRONTEND_APP}:
 	mkdir -p resources/mac
 	wget -nv -q -O - https://mmseqs.com/metabuli/metabuli-osx-universal.tar.gz | tar -xOf - ${FRONTEND_APP}/bin/${FRONTEND_APP} > resources/mac/${FRONTEND_APP}
@@ -28,6 +29,34 @@ resources/mac/arm64/${FRONTEND_APP}: resources/mac/${FRONTEND_APP}
 	mkdir -p resources/mac/arm64
 	$(LIPO) resources/mac/${FRONTEND_APP} -thin arm64 -output resources/mac/arm64/${FRONTEND_APP} || cp -f -- resources/mac/${FRONTEND_APP} resources/mac/arm64/${FRONTEND_APP}
 
+# Download the universal macOS fastp build (fastp-osx-universal.gz).
+# The stream is decompressed into a temporary file that is renamed to the
+# target only after the download and chmod succeeded, so a failed or
+# interrupted transfer never leaves an empty/truncated resources/mac/fastp
+# that make would consider up to date.
+resources/mac/fastp:
+	mkdir -p resources/mac
+	wget -nv -q -O - https://github.com/jaebeom-kim/fastp/releases/download/v0.0.1/fastp-osx-universal.gz | gunzip > $@.tmp
+	chmod +x $@.tmp
+	mv -f $@.tmp $@
+
+# x64 fastp: drop the arm64 slice from the universal binary; if lipo fails,
+# fall back to copying the universal binary unchanged.
+resources/mac/x64/fastp: resources/mac/fastp
+	mkdir -p $(@D)
+	$(LIPO) $< -remove arm64 -output $@ || cp -f -- $< $@
+
+# arm64 fastp: keep only the arm64 slice of the universal binary; if lipo
+# fails, fall back to copying the universal binary unchanged.
+resources/mac/arm64/fastp: resources/mac/fastp
+	mkdir -p $(@D)
+	$(LIPO) $< -thin arm64 -output $@ || cp -f -- $< $@
+
+# Download the universal macOS fastplong build (fastplong-osx-universal.gz).
+# The stream is decompressed into a temporary file that is renamed to the
+# target only after the download and chmod succeeded, so a failed or
+# interrupted transfer never leaves an empty/truncated resources/mac/fastplong
+# that make would consider up to date.
+resources/mac/fastplong:
+	mkdir -p resources/mac
+	wget -nv -q -O - https://github.com/jaebeom-kim/fastplong/releases/download/v0.0.1/fastplong-osx-universal.gz | gunzip > $@.tmp
+	chmod +x $@.tmp
+	mv -f $@.tmp $@
+
+# x64 fastplong: drop the arm64 slice from the universal binary; if lipo
+# fails, fall back to copying the universal binary unchanged.
+resources/mac/x64/fastplong: resources/mac/fastplong
+	mkdir -p $(@D)
+	$(LIPO) $< -remove arm64 -output $@ || cp -f -- $< $@
+
+# arm64 fastplong: keep only the arm64 slice of the universal binary; if lipo
+# fails, fall back to copying the universal binary unchanged.
+resources/mac/arm64/fastplong: resources/mac/fastplong
+	mkdir -p $(@D)
+	$(LIPO) $< -thin arm64 -output $@ || cp -f -- $< $@
+
+
+# Linux
 resources/linux/x64/${FRONTEND_APP}-sse2:
 	mkdir -p resources/linux/x64
 	wget -nv -q -O - https://mmseqs.com/metabuli/metabuli-linux-sse2.tar.gz | tar -xOf - ${FRONTEND_APP}/bin/${FRONTEND_APP} > resources/linux/x64/${FRONTEND_APP}-sse2
@@ -47,11 +76,21 @@ resources/linux/arm64/${FRONTEND_APP}:
 	wget -nv -q -O - https://mmseqs.com/metabuli/metabuli-linux-arm64.tar.gz | tar -xOf - ${FRONTEND_APP}/bin/${FRONTEND_APP} > resources/linux/arm64/${FRONTEND_APP}
 	chmod +x resources/linux/arm64/${FRONTEND_APP}
 
+# Download the fastp binary from opengene.org for the linux/x64 bundle.
+# wget writes to a temporary file next to the target (-O) instead of into the
+# current directory: without -O, an existing ./fastp makes wget save the
+# download as fastp.1, and the stale ./fastp would then be moved into place.
+# The temporary file is renamed only after download and chmod succeeded, so a
+# failed transfer cannot leave a broken target behind.
+# NOTE(review): fetched over plain HTTP with no checksum — consider verifying.
+resources/linux/x64/fastp:
+	mkdir -p resources/linux/x64
+	wget -nv -O $@.tmp http://opengene.org/fastp/fastp
+	chmod a+x $@.tmp
+	mv -f $@.tmp $@
+
+
+
+# Windows bundle: download metabuli-win64.zip into resources/win/x64, unzip it,
+# move the contents of the extracted ${FRONTEND_APP} directory up one level,
+# remove the emptied directory and the archive, then mark the .bat launcher
+# and everything under bin/ as executable.
 resources/win/x64/${FRONTEND_APP}.bat:
 	mkdir -p resources/win/x64
 	cd resources/win/x64 && wget -nv -O ${FRONTEND_APP}-win64.zip https://mmseqs.com/metabuli/metabuli-win64.zip \
 		&& unzip ${FRONTEND_APP}-win64.zip && mv ${FRONTEND_APP}/* . && rmdir ${FRONTEND_APP} && rm ${FRONTEND_APP}-win64.zip
 	chmod -R +x resources/win/x64/${FRONTEND_APP}.bat resources/win/x64/bin/*
 
+# Download the Windows fastp executable (release asset fastp-windows.exe) and
+# install it as resources/win/x64/fastp.
+# wget writes to a temporary file next to the target (-O) instead of into the
+# current directory: without -O, an existing ./fastp-windows.exe makes wget
+# save the download as fastp-windows.exe.1, and the stale file would then be
+# moved into place. The temporary file is renamed only after download and
+# chmod succeeded, so a failed transfer cannot leave a broken target behind.
+# NOTE(review): the installed file has no .exe extension — confirm the app
+# looks it up under this name.
+resources/win/x64/fastp:
+	mkdir -p resources/win/x64
+	wget -nv -O $@.tmp https://github.com/jaebeom-kim/fastp/releases/download/v0.0.1/fastp-windows.exe
+	chmod a+x $@.tmp
+	mv -f $@.tmp $@
+
 clean:
 	@rm -rf resources/mac/* resources/linux/* resources/win/* 
@@ -72,19 +72,136 @@ We will make a button for GTDB soon.
 
 ---
 
+# Raw Read Quality Control
+> You can preprocess raw reads either in the separate `Quality Control` tab or in the `Search Settings` tab as part of the classification process. 
+
+Metabuli App supports `fastp` and `fastplong` for raw read quality control of short and long reads, respectively.
+You can upload one or more (gzipped) FASTQ files for quality control. 
+
+For each sample, `fastp`/`fastplong` will generate the following files:
+- Pre-processed FASTQ files
+- Quality control and filtering report files in HTML format
+- JSON format report files for further analysis
+
+## Parameter settings for short read QC using fastp
+Default settings are generally suitable for most datasets, but you can adjust them as needed.
+Below are the parameters adjustable in the GUI. Other parameters can be provided as a text file (Please see "Advanced Settings" below).
+For more details, please refer to the [fastp GitHub repository](https://github.com/OpenGene/fastp).
+
+### Quality Filtering (Enabled by default)
+- `--disable_quality_filtering`: Disable quality filtering.
+- `--qualified_quality_phred`: Minimum per-base Phred quality score (default 15).
+- `--unqualified_percent_limit`: Maximum percentage of "low-quality" bases allowed (default 40%).
+- `--average_qual`: Minimum average quality score for the read (default none).
+
+### Length Filtering (Enabled by default)
+- `--disable_length_filtering`: Disable length filtering.
+- `--length_required`: Minimum read length required (default 50). Reads shorter than this are discarded.
+- `--length_limit`: Maximum read length allowed (default none). Reads longer than this are discarded.
+
+### Adapter trimming (Enabled by default)
+- Adapter sequences are automatically detected if not specified.
+- `--disable_adapter_trimming`: Disable adapter trimming.
+- `--adapter_sequence`: Adapter for read 1. It disables auto-detection for SE reads.
+- `--adapter_sequence_r2`: Adapter for read 2 (for PE data). For PE data, the specified adapter sequences are used only when auto-detection fails.  
+- `--adapter_fasta`: FASTA file of adapter sequences. They are used after trimming adapters that are either auto-detected or specified with `--adapter_sequence` or `--adapter_sequence_r2`.
+
+### Low complexity filtering (*Disabled* by default)
+- `--low_complexity_filter`: Enable low complexity filtering.
+- `--complexity_threshold`: Reads with complexity below this value are discarded. Range: 0~100. (default 30)
+
+### Per read cutting by quality (*Disabled* by default)
+- `--cut_front`: Enable cutting reads from the front (5') based on quality.
+- `--cut_front_window_size`: Size of the window for cutting from the front (default 4).
+- `--cut_front_mean_quality`: Minimum mean quality for the front window (default 20).
+- `--cut_tail`: Enable cutting reads from the tail (3') based on quality.
+- `--cut_tail_window_size`: Size of the window for cutting from the tail (default 4).
+- `--cut_tail_mean_quality`: Minimum mean quality for the tail window (default 20).
+
+### Other Parameters
+- `--thread`: Number of threads to use (default max(all, 16)).
+- `--compression`: Output compression level (default 4).
+
+
+## Parameter settings for long read QC using fastplong
+Default settings are generally suitable for most datasets, but you can adjust them as needed.
+Below are the parameters adjustable in the GUI. Other parameters can be provided as a text file (Please see "Advanced Settings" below).
+For more details, please refer to the [fastplong GitHub repository](https://github.com/OpenGene/fastplong).
+
+### Quality Filtering (Enabled by default)
+- `--disable_quality_filtering`: Disable quality filtering.
+- `--qualified_quality_phred`: Minimum per-base Phred quality score (default 15).
+- `--unqualified_percent_limit`: Maximum percentage of "low-quality" bases allowed (default 40%).
+- `--mean_qual`: Minimum average quality score for the read (default none).
+
+### Length Filtering (Enabled by default)
+- `--disable_length_filtering`: Disable length filtering.
+- `--length_required`: Minimum read length required (default 1000). Reads shorter than this are discarded.
+- `--length_limit`: Maximum read length allowed (default none). Reads longer than this are discarded.
+
+### Adapter trimming (Enabled by default)
+- Adapter sequences are automatically detected if not specified.
+- It's recommended to specify adapters if they are known using `--start_adapter` and `--end_adapter`. 
+- `--disable_adapter_trimming`: Disable adapter trimming.
+- `--start_adapter`: Read start adapter sequence.
+- `--end_adapter`: Read end adapter sequence.
+- `--adapter_fasta`: FASTA file of adapter sequences. 
+
+### Low complexity filtering (*Disabled* by default)
+- `--low_complexity_filter`: Enable low complexity filtering.
+- `--complexity_threshold`: Reads with complexity below this value are discarded. Range: 0~100. (default 30)
+
+### Per read cutting by quality (*Disabled* by default)
+- `--cut_front`: Enable cutting reads from the front (5') based on quality.
+- `--cut_front_window_size`: Size of the window for cutting from the front (default 4).
+- `--cut_front_mean_quality`: Minimum mean quality for the front window (default 20).
+- `--cut_tail`: Enable cutting reads from the tail (3') based on quality.
+- `--cut_tail_window_size`: Size of the window for cutting from the tail (default 4).
+- `--cut_tail_mean_quality`: Minimum mean quality for the tail window (default 20).
+
+### Other Parameters
+- `--thread`: Number of threads to use (default max(all, 16)).
+- `--compression`: Output compression level (default 4).
+
+## Advanced Settings
+You can provide additional parameters in a text file. The file should contain one parameter per line, and each line should start with the parameter name followed by its value. Parameters here will override the GUI settings.
+Check the [fastp](https://github.com/OpenGene/fastp) and [fastplong](https://github.com/OpenGene/fastplong) GitHub repositories for the full parameter lists.
+Please use long options (e.g., `--disable_quality_filtering`) instead of short options (e.g., `-Q`).
+For example:
+```
+--disable_quality_filtering
+--qualified_quality_phred 20
+--unqualified_percent_limit 30
+```
+
+
+
+
+
+
+
+
 # Classification
 Metabuli App provides two taxonomic profiling modes in **Search Settings** panel: **New Search** and **Upload Report**.
-<img alt="SearchPage_Demo_Image" src="https://github.com/user-attachments/assets/9ab5a86c-5603-4dc7-be3b-baf2ed490ef0" style="max-height: 600px; width: auto;">
+<!-- <img alt="SearchPage_Demo_Image" src="https://github.com/user-attachments/assets/9ab5a86c-5603-4dc7-be3b-baf2ed490ef0" style="max-height: 600px; width: auto;"> -->
 
 ## New Classification
+#### You can perform taxonomic classification on one or more samples using a specified database.
 ### Required Fields:
 1. **Mode:** Select the analysis mode among single-end, paired-end, or long-read.
-2. **Job ID:** Enter a unique identifier for the job.
-3. **Select Files:** Upload the necessary files and directories.
+2. **Enable Quality Control:** Check it to enable quality control for the input reads. 
+    - `fastp` and `fastplong` are used for short and long reads, respectively.
+    - Please see QC documentation for more details.
+3. **Job ID:** Enter a unique identifier for the job.
+4. **Select Files:** Upload the necessary files and directories.
     - Read 1 File (and Read 2 File if Paired-end is selected)
+        - FASTA/FASTQ and their gzipped versions are supported.
+        - `ADD ENTRY` to upload **multiple samples** to process using the same settings.
     - Database Directory
     - Output Directory
-4. **Max RAM:** Specify the maximum RAM (in GiB) to allocate for the job.
+        - Result files are saved in `Job ID` directory under the specified output directory.
+        - When **multiple samples** are processed, results are saved in `Job ID/sample_name` directories.
+5. **Max RAM:** Specify the maximum RAM (in GiB) to allocate for the job.
 
 ### Advanced Settings (Optional): 
 - **Threads:** Specify thread count for the job.
@@ -111,6 +228,27 @@ Metabuli App provides two taxonomic profiling modes in **Search Settings** panel
      - **Sankey Diagram**: A flow diagram representing the lineage information of the displayed taxa.
      - **Krona Chart**: A hierarchical interactive chart that visualizes classification results.
 
+### Generated Result Files:
+#### 1. JobID_classifications.tsv: It contains the classification results for each read. The columns are as follows.
+1. `is_classified`: Classified or not
+2. `name`: Read ID
+3. `taxID`: Tax. ID in the tax. dump files used in database creation
+4. `query_length`: Effective read length
+5. `score`: DNA level identity score
+6. `rank`: Taxonomic rank of the taxon
+7. `taxID:match_count`: List of "taxID : k-mer match count"
+
+#### 2. JobID_report.tsv: It follows Kraken2's report format. The first line is a header, and the rest of the lines are tab-separated values. The columns are as follows.
+
+1. `clade_proportion`: Percentage of reads classified to the clade rooted at this taxon
+2. `clade_count`: Number of reads classified to the clade rooted at this taxon
+3. `taxon_count`: Number of reads classified directly to this taxon
+4. `rank`: Taxonomic rank of the taxon
+5. `taxID`: Tax ID according to the taxonomy dump files used in the database creation
+6. `name`: Taxonomic name of the taxon
+
+#### 3. JobID_krona.html: It is for an interactive Krona plot. You can use any modern web browser to open `JobID_krona.html`.
+
 ## Upload Report
 
 To visualize results from a previously completed job:
@@ -123,6 +261,11 @@ To visualize results from a previously completed job:
 
 ---
 
+# Database Curation
+
+## Download Database
+You can download pre-built databases [here](https://metabuli.steineggerlab.workers.dev/).
+
 ## Create New Database
 You can create a new database in "NEW DATABASE" tab by providing these three files:
 1. **FASTA files** : Each sequence must have a unique `>accession.version` or `>accession` header (e.g., `>CP001849.1` or `>CP001849`).