Skip to content

Commit 0a3434d

Browse files
authored
Merge pull request #75 from hasindu2008/dev
Dev
2 parents 327ff84 + c305604 commit 0a3434d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1555
-197
lines changed

docs/commands.md

100644100755
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,8 @@ If multiple samples (different run ids) are detected, the header and the *read_g
8989
The batch size. This is the number of records on the memory at once [default value: 4096]. An increased batch size improves multi-threaded performance at cost of higher RAM.
9090
* `--lossless STR`:
9191
Retain information in auxiliary fields during file merging [default value: true]. This information is generally not required for downstream analysis can be optionally discarded to reduce file size. *IMPORTANT: Generated files are only to be used for intermediate analysis and NOT for archiving. You will not be able to convert lossy files back to FAST5*.
92-
* `--allow STR`:
93-
Continue to merge the files despite the WARNINGS about the differences in run_id groups [default value: true].
92+
* `-a, --allow`:
93+
Allow merging despite attribute differences in the same run_id.
9494
* `-h, --help`:
9595
Prints the help menu.
9696

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/bin/bash
2+
3+
# An example script that properly converts a set of multi-fast5 generated
4+
# using ONT's multi_to_single_fast5 utility.
5+
# ONT's multi_to_single_fast5 packs reads with mutltiple run_ids in to an individual multi-fast5 file
6+
# slow5tools f2s does directly support converting such files and this script is an example of how to do it.
7+
# First this script uses ONT's single_to_multi_fast5 utility to convert multi-fast5 back to single-fast5 files
8+
# Then we classify the single-fast5 files into multiple directories based on the run_id
9+
# Then we use slow5tools f2s to convert each of those directory separately to blow5
10+
# Next we merge all the blow5 files into a single blow5 file
11+
# Finally we do a sanity check to see if number of records in the blow5 file matches the number of records in the original multi-fast5
12+
# as well as a slow5tools index to see if all the read IDS in the merged BLOW5 file are unique
13+
14+
# Dependencies
15+
# - multi_to_single_fast5 in ont_fast5_api
16+
# - h5dump
17+
# - parallel
18+
# - slow5tools
19+
20+
21+
set -e
22+
23+
FAST5_DIR=$1
24+
TMP_FAST5=tmp_fast5
25+
SINGLE_FAST5=tmp_single_fast5
26+
TMP_BLOW5=tmp_blow5
27+
MERGED_BLOW5=reads.blow5
28+
[ -z ${SLOW5TOOLS} ] && SLOW5TOOLS=slow5tools
29+
[ -z ${NUM_THREADS} ] && NUM_THREADS=$(nproc)
30+
31+
# terminate script
32+
die() {
33+
echo "$1" >&2
34+
echo
35+
exit 1
36+
}
37+
38+
if [ "$#" -ne 1 ]; then
39+
die "Usage: $0 <fast5_dir>"
40+
fi
41+
42+
test -d $FAST5_DIR || die "Directory $FAST5_DIR does not exist"
43+
test -d $SINGLE_FAST5 && die "Directory $SINGLE_FAST5 already exists. Please delete that first."
44+
test -d $TMP_FAST5 && die "Directory $TMP_FAST5 already exists. Please delete that first."
45+
test -d $TMP_BLOW5 && die "Directory $TMP_BLOW5 already exists. Please delete that first."
46+
test -e $MERGED_BLOW5 && die "$MERGED_BLOW5 already exists. Please delete that first."
47+
48+
multi_to_single_fast5 --version || die "Could not find single_to_multi_fast5. Check if ont_fast5_api [https://github.com/nanoporetech/ont_fast5_api] is installed"
49+
h5dump --version || die "Could not find h5dump. Install using sudo apt-get install hdf5-tools"
50+
parallel --version || die "Could not find command 'parallel'. On Ubuntu, install using sudo apt-get install parallel"
51+
$SLOW5TOOLS --version || die "Could not find $SLOW5TOOLS. Add slow5tools to PATH or set SLOW5TOOLS environment variable"
52+
53+
mkdir $TMP_FAST5 $TMP_BLOW5 || die "Creating directory $TMP_FAST5 and $TMP_BLOW5 failed"
54+
55+
multi_to_single_fast5 -i $FAST5_DIR -s $SINGLE_FAST5 --recursive -t $NUM_THREADS || die "Converting multi-fast5 to single-fast5 failed"
56+
57+
read_id_based_classify_func(){
58+
59+
file=$1
60+
TMP_FAST5=$2
61+
# while read file;
62+
# do
63+
#RUN_ID=$(strings $file | grep -w -A1 "run_id" | tail -1)
64+
RUN_ID=$(h5dump -A $file | grep -w "run_id" -A20 | grep "(0):" | head -1 | awk '{print $NF}' | tr -d '"')
65+
#echo "file $file has runID $RUN_ID"
66+
test -z $RUN_ID && { echo "Could not deduce run_id in $file"; exit 1; }
67+
68+
mkdir -p $TMP_FAST5/$RUN_ID
69+
mv -i $file $TMP_FAST5/$RUN_ID || { echo "moving $file to $TMP_FAST5/$RUN_ID failed"; exit 1;}
70+
# done
71+
72+
}
73+
export -f read_id_based_classify_func
74+
75+
#classify
76+
find $SINGLE_FAST5 -name '*.fast5' | parallel -I% --max-args 1 read_id_based_classify_func % $TMP_FAST5 || die "Classifying single-fast5 based on read ID failed"
77+
78+
#convert
79+
for each in $TMP_FAST5/*; do
80+
echo "Converting $each";
81+
DIR=$(basename $each)
82+
slow5tools f2s $TMP_FAST5/$DIR/ -d $TMP_BLOW5/$DIR -p $NUM_THREADS || die "Converting $each to BLOW5 failed"
83+
done
84+
85+
#merge
86+
slow5tools merge $TMP_BLOW5/ -o $MERGED_BLOW5 -t $NUM_THREADS -K 1000 || die "Merging to a single BLOW5 failed"
87+
88+
#sanity check
89+
NUM_SLOW5_READS=$(slow5tools stats $MERGED_BLOW5 | grep "number of records" | awk '{print $NF}')
90+
NUM_READS=$(find $FAST5_DIR -name '*.fast5' | parallel -I% --max-args 1 strings % | grep "read_.*-.*-.*" | wc -l | awk 'BEGIN {count=0;} {count=count+$0} END {print count;}')
91+
92+
if [ ${NUM_READS} -ne ${NUM_SLOW5_READS} ]
93+
then
94+
echo "WARNING: Sanity check also failed. $NUM_READS in FAST5, but $NUM_SLOW5_READS reads in SLOW5"
95+
exit 1
96+
else
97+
echo "$NUM_READS in FAST5, $NUM_SLOW5_READS reads in SLOW5"
98+
fi
99+
100+
#check if unique read id
101+
slow5tools index $MERGED_BLOW5 || die "Indexing BLOW5 failed"
102+
103+
rm -r $SINGLE_FAST5 $TMP_FAST5 $TMP_BLOW5 || die "Removing temporary directories failed"

src/cat.c

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,87 @@
2626
extern int slow5tools_verbosity_level;
2727
int close_files_and_exit(slow5_file_t *slow5_file, slow5_file_t *slow5_file_i, char *arg_fname_out);
2828

29+
// return 0 if no warnings
30+
// return 1 if warnings are found
31+
int compare_headers(slow5_hdr_t *output_header, slow5_hdr_t *input_header, int64_t output_g, int64_t input_g, const char *i_file_path, const char *j_run_id) {
32+
int flag_warnings = 0;
33+
khash_t(slow5_s2s) *rg_o = slow5_hdr_get_data(output_g, output_header);
34+
khash_t(slow5_s2s) *rg_i = slow5_hdr_get_data(input_g, input_header);
35+
36+
size_t size_rg_o = kh_size(rg_o);
37+
size_t size_rg_i = kh_size(rg_i);
38+
39+
// fprintf(stderr,"size_rg_i,size_rg_i\t%zu,%zu\n",size_rg_i,size_rg_i);
40+
if(size_rg_i != size_rg_o){
41+
flag_warnings = 1;
42+
WARNING("Input file %s (run_id-%s) has a different number of attributes (%zu) than seen in the previous files processed so far (%zu)", i_file_path, j_run_id, size_rg_i, size_rg_o);
43+
}
44+
45+
for (khint_t itr = kh_begin(rg_o); itr != kh_end(rg_o); ++itr) { // traverse hash_table_o
46+
if (kh_exist(rg_o, itr)) {
47+
int flag_set_attr_value_to_empty = 0;
48+
const char* key_o = kh_key(rg_o, itr);
49+
khint_t pos_i = kh_get(slow5_s2s, rg_i, key_o);
50+
if(pos_i == kh_end(rg_i)){
51+
WARNING("Attribute '%s' is not available in input file %s (run_id-%s)", key_o, i_file_path, j_run_id);
52+
flag_warnings = 1;
53+
flag_set_attr_value_to_empty = 1;
54+
} else{
55+
const char *value_o = kh_value(rg_o, itr);
56+
const char *value_i = kh_value(rg_i, pos_i);
57+
if(strcmp(value_o,value_i)){
58+
WARNING("Attribute '%s' in input file %s (run_id-%s) has a different value (%s) than what has been seen so far (%s)", key_o, i_file_path, j_run_id, value_i, value_o);
59+
flag_warnings = 1;
60+
flag_set_attr_value_to_empty = 1;
61+
}
62+
}
63+
if(flag_set_attr_value_to_empty){
64+
INFO("Setting output header's attribute '%s' (run_id-%s) to empty", key_o, j_run_id);
65+
int ret_hdr_attr_set = slow5_hdr_set(key_o, "", output_g, output_header);
66+
if(ret_hdr_attr_set<0){
67+
ERROR("Could not set attribute '%s' value to empty", key_o);
68+
exit(EXIT_FAILURE);
69+
}
70+
}
71+
}
72+
}
73+
74+
for (khint_t itr = kh_begin(rg_i); itr != kh_end(rg_i); ++itr) { // traverse hash_table_o
75+
if (kh_exist(rg_i, itr)) {
76+
const char* key_i = kh_key(rg_i, itr);
77+
khint_t pos_o = kh_get(slow5_s2s, rg_o, key_i);
78+
if(pos_o == kh_end(rg_o)){
79+
const char *value_i = kh_value(rg_i, itr);
80+
flag_warnings = 1;
81+
INFO("Attribute '%s' in input file %s (run_id-%s) is not seen in previous files. It will be added to the output header but its value (%s) will not be set in the output header.", key_i, i_file_path, j_run_id, value_i);
82+
int ret_attr_header = slow5_hdr_add_attr(key_i, output_header);
83+
if(ret_attr_header==-1){
84+
ERROR("Could not add the new attribute %s to the output header", key_i);
85+
exit(EXIT_FAILURE);
86+
}
87+
}
88+
}
89+
}
90+
return flag_warnings;
91+
}
92+
93+
int compare_aux_attrs(slow5_hdr_t *output_header, slow5_hdr_t *input_header, const char *i_file_path) {
94+
uint32_t num_axu_o = output_header->aux_meta->num;
95+
uint32_t num_axu_i = input_header->aux_meta->num;
96+
if(num_axu_o != num_axu_i){
97+
ERROR("Input file %s has a different number of auxiliary attributes (%" PRIu32 ") than seen in the previous files processed so far (%" PRIu32 ")", i_file_path, num_axu_o, num_axu_i);
98+
return -1;
99+
}
100+
for(uint32_t i=0; i<num_axu_o; i++){
101+
DEBUG("%s\t%s\n", output_header->aux_meta->attrs[i], input_header->aux_meta->attrs[i]);
102+
if(strcmp(output_header->aux_meta->attrs[i],input_header->aux_meta->attrs[i])){
103+
ERROR("Input file %s has a different order of auxiliary attributes from the order seen in the previous files processed so far", i_file_path);
104+
return -1;
105+
}
106+
}
107+
return 0;
108+
}
109+
29110
int cat_main(int argc, char **argv, struct program_meta *meta){
30111

31112
print_args(argc,argv);
@@ -181,7 +262,7 @@ int cat_main(int argc, char **argv, struct program_meta *meta){
181262
}
182263
//set run_ids
183264
for(uint32_t j=0; j<num_read_groups; j++){
184-
char* temp = slow5_hdr_get("run_id", 0, slow5File_i->header);
265+
char* temp = slow5_hdr_get("run_id", j, slow5File_i->header);
185266
if(!temp){
186267
ERROR("No run_id information found in %s.", slow5_files[i].c_str());
187268
return close_files_and_exit(slow5File, slow5File_i, user_opts.arg_fname_out);
@@ -236,7 +317,7 @@ int cat_main(int argc, char **argv, struct program_meta *meta){
236317
return close_files_and_exit(slow5File, slow5File_i, user_opts.arg_fname_out);
237318
}
238319
for (uint32_t j = 0; j < num_read_groups; j++) {
239-
char *temp = slow5_hdr_get("run_id", 0, slow5File_i->header);
320+
char *temp = slow5_hdr_get("run_id", j, slow5File_i->header);
240321
if (!temp) {
241322
ERROR("No run_id information found in %s.", slow5_files[i].c_str());
242323
return close_files_and_exit(slow5File, slow5File_i, user_opts.arg_fname_out);
@@ -246,6 +327,18 @@ int cat_main(int argc, char **argv, struct program_meta *meta){
246327
slow5_files[i].c_str());
247328
return close_files_and_exit(slow5File, slow5File_i, user_opts.arg_fname_out);
248329
}
330+
/// compare headers
331+
int ret_compare_headers = compare_headers(slow5File->header, slow5File_i->header, j, j, slow5_files[i].c_str(), run_ids[j].c_str());
332+
if(ret_compare_headers){
333+
return close_files_and_exit(slow5File, slow5File_i, user_opts.arg_fname_out);
334+
}
335+
336+
}
337+
if(lossy == 0){
338+
int ret_compare_aux_attrs = compare_aux_attrs(slow5File->header, slow5File_i->header, slow5_files[i].c_str());
339+
if(ret_compare_aux_attrs){
340+
return close_files_and_exit(slow5File, slow5File_i, user_opts.arg_fname_out);
341+
}
249342
}
250343
}
251344

src/cmd.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,12 @@
5151
//for f2s
5252
#define HELP_MSG_RETAIN_DIR_STRUCTURE \
5353
" --retain retain the same directory structure in the converted output as the input (experimental)\n"
54-
54+
//for f2s
55+
#define HELP_MSG_CONTINUE_F2S \
56+
" -a, --allow allow run id mismatches in a multi-fast5 file or in a single-fast5 directory\n" \
57+
//for merge
5558
#define HELP_MSG_CONTINUE_MERGE \
56-
" --allow continue to merge the files despite the WARNINGS about the differences in run_id groups [false].\n"
59+
" -a, --allow allow merging despite attribute differences in the same run_id\n"
5760

5861
#define HELP_MSG_HELP \
5962
" -h, --help display this message and exit\n" \

src/f2s.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
HELP_MSG_PRESS \
3333
HELP_MSG_PROCESSES \
3434
HELP_MSG_LOSSLESS \
35-
" -a, --allow allow run id mismatches in a multi-fast5 file or in a single-fast5 directory\n" \
35+
HELP_MSG_CONTINUE_F2S \
3636
HELP_MSG_RETAIN_DIR_STRUCTURE \
3737
HELP_MSG_HELP \
3838
HELP_FORMATS_METHODS
@@ -355,6 +355,7 @@ int f2s_main(int argc, char **argv, struct program_meta *meta) {
355355
break;
356356
case 'a':
357357
user_opts.flag_allow_run_id_mismatch = 1;
358+
WARNING("%s", "You have requested to allow run ID mismatches. Generated files are only to be used for intermediate analysis and are not recommended for archiving.");
358359
break;
359360
case 'h':
360361
DEBUG("Displaying the large help message%s","");
@@ -437,6 +438,13 @@ int f2s_main(int argc, char **argv, struct program_meta *meta) {
437438
double init_realtime = slow5_realtime();
438439
std::vector<std::string> fast5_files;
439440
int i = optind;
441+
int flag_one_input = i==(argc-1);
442+
443+
if(user_opts.arg_dir_out && user_opts.flag_retain_dir_structure==1 && flag_one_input==0){
444+
ERROR("Cannot retain the directory structure when there are multiple inputs. Please provide one input path%s",".");
445+
return EXIT_FAILURE;
446+
}
447+
440448
for (; i < argc; ++ i) {
441449
list_all_items(argv[i], fast5_files, 0, ".fast5");
442450
}
@@ -447,7 +455,6 @@ int f2s_main(int argc, char **argv, struct program_meta *meta) {
447455
return EXIT_FAILURE;
448456
}
449457

450-
int flag_one_input = i==optind+1;
451458

452459
if(fast5_files.size()==1){
453460
user_opts.num_processes = 1;

src/merge.c

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,15 @@ int merge_main(int argc, char **argv, struct program_meta *meta){
8282
}
8383

8484
static struct option long_opts[] = {
85-
{"help", no_argument, NULL, 'h'}, //0
86-
{"threads", required_argument, NULL, 't' }, //1
87-
{"to", required_argument, NULL, 'b'}, //2
88-
{"compress", required_argument, NULL, 'c'}, //3
89-
{"sig-compress", required_argument, NULL, 's'}, //4
90-
{ "lossless", required_argument, NULL, 'l'}, //5
91-
{ "allow", required_argument, NULL, 'a'}, //6
92-
{"output", required_argument, NULL, 'o'}, //7
93-
{"batchsize", required_argument, NULL, 'K'}, //8
85+
{"help", no_argument, NULL, 'h'}, //0
86+
{"threads", required_argument, NULL, 't' }, //1
87+
{"to", required_argument, NULL, 0}, //2
88+
{"compress", required_argument, NULL, 'c'}, //3
89+
{"sig-compress", required_argument, NULL, 's'}, //4
90+
{"lossless", required_argument, NULL, 0}, //5
91+
{"allow", no_argument, NULL, 'a'}, //6
92+
{"output", required_argument, NULL, 'o'}, //7
93+
{"batchsize", required_argument, NULL, 'K'}, //8
9494
{NULL, 0, NULL, 0 }
9595
};
9696

@@ -101,10 +101,20 @@ int merge_main(int argc, char **argv, struct program_meta *meta){
101101
int longindex = 0;
102102

103103
// Parse options
104-
while ((opt = getopt_long(argc, argv, "b:c:s:hl:t:o:a:K:", long_opts, &longindex)) != -1) {
104+
while ((opt = getopt_long(argc, argv, "c:s:ht:o:aK:", long_opts, &longindex)) != -1) {
105105
DEBUG("opt='%c', optarg=\"%s\", optind=%d, opterr=%d, optopt='%c'",
106106
opt, optarg, optind, opterr, optopt);
107107
switch (opt) {
108+
case 'c':
109+
user_opts.arg_record_press_out = optarg;
110+
break;
111+
case 's':
112+
user_opts.arg_signal_press_out = optarg;
113+
break;
114+
case 'a':
115+
user_opts.flag_continue_merge = 1;
116+
WARNING("%s", "You have requested to merge files despite attribute differences in same read ID. Generated files are for intermediate analysis and are not recommended for archiving.");
117+
break;
108118
case 'h':
109119
DEBUG("displaying large help message%s","");
110120
fprintf(stdout, HELP_LARGE_MSG, argv[0]);
@@ -113,26 +123,21 @@ int merge_main(int argc, char **argv, struct program_meta *meta){
113123
case 't':
114124
user_opts.arg_num_threads = optarg;
115125
break;
116-
case 'b':
117-
user_opts.arg_fmt_out = optarg;
126+
case 'o':
127+
user_opts.arg_fname_out = optarg;
118128
break;
119129
case 'K':
120130
user_opts.arg_batch = optarg;
121131
break;
122-
case 'c':
123-
user_opts.arg_record_press_out = optarg;
124-
break;
125-
case 's':
126-
user_opts.arg_signal_press_out = optarg;
127-
break;
128-
case 'l':
129-
user_opts.arg_lossless = optarg;
130-
break;
131-
case 'a':
132-
user_opts.arg_continue_merge = optarg;
133-
break;
134-
case 'o':
135-
user_opts.arg_fname_out = optarg;
132+
case 0 :
133+
switch (longindex) {
134+
case 2:
135+
user_opts.arg_fmt_out = optarg;
136+
break;
137+
case 5:
138+
user_opts.arg_lossless = optarg;
139+
break;
140+
}
136141
break;
137142
default: // case '?'
138143
fprintf(stderr, HELP_SMALL_MSG, argv[0]);
@@ -152,10 +157,6 @@ int merge_main(int argc, char **argv, struct program_meta *meta){
152157
EXIT_MSG(EXIT_FAILURE, argv, meta);
153158
return EXIT_FAILURE;
154159
}
155-
if(parse_arg_continue_merge(&user_opts, argc, argv, meta) < 0){
156-
EXIT_MSG(EXIT_FAILURE, argv, meta);
157-
return EXIT_FAILURE;
158-
}
159160
if(parse_format_args(&user_opts,argc,argv,meta) < 0){
160161
EXIT_MSG(EXIT_FAILURE, argv, meta);
161162
return EXIT_FAILURE;
@@ -335,7 +336,7 @@ int merge_main(int argc, char **argv, struct program_meta *meta){
335336
}
336337

337338
if(flag_warnings_occured == 1 && user_opts.flag_continue_merge == DEFAULT_CONTINUE_MERGE){
338-
ERROR("There were warning about attribute differences for the same run_id(s). Please set flag '--continue' to 'true' if you still want to merge files%s", ".");
339+
ERROR("Attributes are different for the same run_id(s). Set -a of you still want to merge files%s", ".");
339340
return EXIT_FAILURE;
340341
}
341342

0 commit comments

Comments
 (0)