@@ -474,194 +474,3 @@ ReadBEDFile(const string &filename, vector<SimpleGenomicRegion> &the_regions) {
474474 if (!is_header_line (line) && !is_track_line (line))
475475 the_regions.push_back (SimpleGenomicRegion (line));
476476}
477-
478-
479- static const char *digits = " 0987654321" ;
480- static const char *whitespace = " \t " ;
481-
482- // define what to do if parts are missing: if no ':' or '-', is whole
483- // thing chrom?
484-
485- void
486- parse_region_name (string region_name,
487- string& chrom, size_t &start, size_t &end) {
488-
489- const size_t colon_offset = region_name.find (" :" );
490-
491- // get the chromosome
492- size_t chr_offset = region_name.find_last_of (whitespace, colon_offset);
493- if (chr_offset == string::npos)
494- chr_offset = 0 ;
495- else
496- chr_offset += 1 ;
497- chrom = region_name.substr (chr_offset, colon_offset - chr_offset);
498-
499- // get the start
500- const size_t start_end = region_name.find (" -" , colon_offset + 1 );
501- const string start_string = region_name.substr (colon_offset + 1 ,
502- start_end - colon_offset + 1 );
503- start = static_cast <size_t >(atoi (start_string.c_str ()));
504-
505- // get the end
506- const size_t end_end =
507- region_name.find_first_not_of (digits, start_end + 1 );
508- const string end_string = region_name.substr (start_end + 1 ,
509- end_end - start_end - 1 );
510- end = static_cast <size_t >(atoi (end_string.c_str ()));
511- }
512-
513- static size_t adjust_start_pos (const size_t orig_start,
514- const string &chrom_name) {
515- static const double LINE_WIDTH = 50.0 ;
516- const size_t name_offset = chrom_name.length () + 2 ; // For the '>' and the '\n';
517- const size_t preceding_newlines =
518- static_cast <size_t >(std::floor (orig_start / LINE_WIDTH));
519- return orig_start + preceding_newlines + name_offset;
520- }
521-
522- static size_t
523- adjust_region_size (const size_t orig_start, const size_t orig_size) {
524- static const double LINE_WIDTH = 50.0 ;
525- const size_t preceding_newlines_start =
526- static_cast <size_t >(std::floor (orig_start / LINE_WIDTH));
527- const size_t preceding_newlines_end =
528- static_cast <size_t >(std::floor ((orig_start + orig_size) / LINE_WIDTH));
529- return (orig_size + (preceding_newlines_end - preceding_newlines_start));
530- }
531-
532-
533- void
534- extract_regions_chrom_fasta (const string &chrom_name,
535- const string &filename,
536- const vector<SimpleGenomicRegion> ®ions,
537- vector<string> &sequences) {
538-
539- std::ifstream in (filename.c_str ());
540- if (!in)
541- throw runtime_error (" cannot read file: " + filename);
542-
543- for (auto i = begin (regions); i != end (regions); ++i) {
544-
545- const size_t orig_start_pos = i->get_start ();
546- const size_t orig_end_pos = i->get_end ();
547- const size_t orig_region_size = orig_end_pos - orig_start_pos;
548-
549- const size_t start_pos = adjust_start_pos (orig_start_pos, chrom_name);
550- const size_t region_size =
551- adjust_region_size (orig_start_pos, orig_region_size);
552-
553- in.seekg (start_pos);
554- char buffer[region_size + 1 ];
555- buffer[region_size] = ' \0 ' ;
556- in.read (buffer, region_size);
557-
558- std::remove_if (buffer, buffer + region_size,
559- std::bind2nd (std::equal_to<char >(), ' \n ' ));
560- buffer[orig_region_size] = ' \0 ' ;
561-
562- sequences.push_back (buffer);
563- std::transform (sequences.back ().begin (), sequences.back ().end (),
564- sequences.back ().begin (), std::ptr_fun (&toupper));
565- assert (i->get_width () == sequences.back ().length ());
566- }
567- }
568-
569- void
570- extract_regions_chrom_fasta (const string &chrom_name,
571- const string &filename,
572- const vector<GenomicRegion> ®ions,
573- vector<string> &sequences) {
574-
575- std::ifstream in (filename.c_str ());
576- if (!in)
577- throw runtime_error (" cannot read file: " + filename);
578-
579- for (auto i (regions.begin ()); i != regions.end (); ++i) {
580-
581- const size_t orig_start_pos = i->get_start ();
582- const size_t orig_end_pos = i->get_end ();
583- const size_t orig_region_size = orig_end_pos - orig_start_pos;
584-
585- const size_t start_pos = adjust_start_pos (orig_start_pos, chrom_name);
586- const size_t region_size =
587- adjust_region_size (orig_start_pos, orig_region_size);
588-
589- in.seekg (start_pos);
590- char buffer[region_size + 1 ];
591- buffer[region_size] = ' \0 ' ;
592- in.read (buffer, region_size);
593-
594- std::remove_if (buffer, buffer + region_size,
595- std::bind2nd (std::equal_to<char >(), ' \n ' ));
596- buffer[orig_region_size] = ' \0 ' ;
597-
598- sequences.push_back (buffer);
599- std::transform (sequences.back ().begin (), sequences.back ().end (),
600- sequences.back ().begin (), std::ptr_fun (&toupper));
601- if (i->neg_strand ())
602- revcomp_inplace (sequences.back ());
603- assert (i->get_width () == sequences.back ().length ());
604- }
605- }
606-
607- void
608- extract_regions_fasta (const string &dirname,
609- const vector<GenomicRegion> ®ions_in,
610- vector<string> &sequences) {
611-
612- static const string FASTA_SUFFIX (" .fa" );
613- assert (check_sorted (regions_in));
614-
615- vector<string> filenames;
616- read_dir (dirname, filenames);
617-
618- vector<vector<GenomicRegion> > regions;
619- separate_chromosomes (regions_in, regions);
620-
621- unordered_map<string, size_t > chrom_regions_map;
622- for (size_t i = 0 ; i < filenames.size (); ++i)
623- chrom_regions_map[strip_path (filenames[i])] = i;
624-
625- for (size_t i = 0 ; i < regions.size (); ++i) {
626- // GET THE RIGHT FILE
627- const string chrom_name (regions[i].front ().get_chrom ());
628- const string chrom_file (chrom_name + FASTA_SUFFIX);
629- unordered_map<string, size_t >::const_iterator f_idx =
630- chrom_regions_map.find (chrom_file);
631- if (f_idx == chrom_regions_map.end ())
632- throw SMITHLABException (" chrom not found:\t " + chrom_file);
633- extract_regions_chrom_fasta (chrom_name, filenames[f_idx->second ],
634- regions[i], sequences);
635- }
636- }
637-
638- void
639- extract_regions_fasta (const string &dirname,
640- const vector<SimpleGenomicRegion> ®ions_in,
641- vector<string> &sequences) {
642-
643- static const string FASTA_SUFFIX (" .fa" );
644- assert (check_sorted (regions_in));
645-
646- vector<string> filenames;
647- read_dir (dirname, filenames);
648-
649- vector<vector<SimpleGenomicRegion> > regions;
650- separate_chromosomes (regions_in, regions);
651-
652- unordered_map<string, size_t > chrom_regions_map;
653- for (size_t i = 0 ; i < filenames.size (); ++i)
654- chrom_regions_map[strip_path (filenames[i])] = i;
655-
656- for (size_t i = 0 ; i < regions.size (); ++i) {
657- // GET THE RIGHT FILE
658- const string chrom_name (regions[i].front ().get_chrom ());
659- const string chrom_file (chrom_name + FASTA_SUFFIX);
660- unordered_map<string, size_t >::const_iterator f_idx =
661- chrom_regions_map.find (chrom_file);
662- if (f_idx == chrom_regions_map.end ())
663- throw SMITHLABException (" chrom not found:\t " + chrom_file);
664- extract_regions_chrom_fasta (chrom_name, filenames[f_idx->second ],
665- regions[i], sequences);
666- }
667- }
0 commit comments