Skip to content

Commit 509a031

Browse files
Removing functions from GenomicRegion that are defined in chromosome_utils. These should probably be redesigned.
1 parent a695f7e commit 509a031

File tree

2 files changed

+0
-219
lines changed

2 files changed

+0
-219
lines changed

GenomicRegion.cpp

Lines changed: 0 additions & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -474,194 +474,3 @@ ReadBEDFile(const string &filename, vector<SimpleGenomicRegion> &the_regions) {
474474
if (!is_header_line(line) && !is_track_line(line))
475475
the_regions.push_back(SimpleGenomicRegion(line));
476476
}
477-
478-
479-
static const char *digits = "0987654321";
480-
static const char *whitespace = " \t";
481-
482-
// define what to do if parts are missing: if no ':' or '-', is whole
483-
// thing chrom?
484-
485-
void
486-
parse_region_name(string region_name,
487-
string& chrom, size_t &start, size_t &end) {
488-
489-
const size_t colon_offset = region_name.find(":");
490-
491-
// get the chromosome
492-
size_t chr_offset = region_name.find_last_of(whitespace, colon_offset);
493-
if (chr_offset == string::npos)
494-
chr_offset = 0;
495-
else
496-
chr_offset += 1;
497-
chrom = region_name.substr(chr_offset, colon_offset - chr_offset);
498-
499-
// get the start
500-
const size_t start_end = region_name.find("-", colon_offset + 1);
501-
const string start_string = region_name.substr(colon_offset + 1,
502-
start_end - colon_offset + 1);
503-
start = static_cast<size_t>(atoi(start_string.c_str()));
504-
505-
// get the end
506-
const size_t end_end =
507-
region_name.find_first_not_of(digits, start_end + 1);
508-
const string end_string = region_name.substr(start_end + 1,
509-
end_end - start_end - 1);
510-
end = static_cast<size_t>(atoi(end_string.c_str()));
511-
}
512-
513-
static size_t adjust_start_pos(const size_t orig_start,
514-
const string &chrom_name) {
515-
static const double LINE_WIDTH = 50.0;
516-
const size_t name_offset = chrom_name.length() + 2; // For the '>' and the '\n';
517-
const size_t preceding_newlines =
518-
static_cast<size_t>(std::floor(orig_start / LINE_WIDTH));
519-
return orig_start + preceding_newlines + name_offset;
520-
}
521-
522-
static size_t
523-
adjust_region_size(const size_t orig_start, const size_t orig_size) {
524-
static const double LINE_WIDTH = 50.0;
525-
const size_t preceding_newlines_start =
526-
static_cast<size_t>(std::floor(orig_start / LINE_WIDTH));
527-
const size_t preceding_newlines_end =
528-
static_cast<size_t>(std::floor((orig_start + orig_size) / LINE_WIDTH));
529-
return (orig_size + (preceding_newlines_end - preceding_newlines_start));
530-
}
531-
532-
533-
void
534-
extract_regions_chrom_fasta(const string &chrom_name,
535-
const string &filename,
536-
const vector<SimpleGenomicRegion> &regions,
537-
vector<string> &sequences) {
538-
539-
std::ifstream in(filename.c_str());
540-
if (!in)
541-
throw runtime_error("cannot read file: " + filename);
542-
543-
for (auto i = begin(regions); i != end(regions); ++i) {
544-
545-
const size_t orig_start_pos = i->get_start();
546-
const size_t orig_end_pos = i->get_end();
547-
const size_t orig_region_size = orig_end_pos - orig_start_pos;
548-
549-
const size_t start_pos = adjust_start_pos(orig_start_pos, chrom_name);
550-
const size_t region_size =
551-
adjust_region_size(orig_start_pos, orig_region_size);
552-
553-
in.seekg(start_pos);
554-
char buffer[region_size + 1];
555-
buffer[region_size] = '\0';
556-
in.read(buffer, region_size);
557-
558-
std::remove_if(buffer, buffer + region_size,
559-
std::bind2nd(std::equal_to<char>(), '\n'));
560-
buffer[orig_region_size] = '\0';
561-
562-
sequences.push_back(buffer);
563-
std::transform(sequences.back().begin(), sequences.back().end(),
564-
sequences.back().begin(), std::ptr_fun(&toupper));
565-
assert(i->get_width() == sequences.back().length());
566-
}
567-
}
568-
569-
void
570-
extract_regions_chrom_fasta(const string &chrom_name,
571-
const string &filename,
572-
const vector<GenomicRegion> &regions,
573-
vector<string> &sequences) {
574-
575-
std::ifstream in(filename.c_str());
576-
if (!in)
577-
throw runtime_error("cannot read file: " + filename);
578-
579-
for (auto i(regions.begin()); i != regions.end(); ++i) {
580-
581-
const size_t orig_start_pos = i->get_start();
582-
const size_t orig_end_pos = i->get_end();
583-
const size_t orig_region_size = orig_end_pos - orig_start_pos;
584-
585-
const size_t start_pos = adjust_start_pos(orig_start_pos, chrom_name);
586-
const size_t region_size =
587-
adjust_region_size(orig_start_pos, orig_region_size);
588-
589-
in.seekg(start_pos);
590-
char buffer[region_size + 1];
591-
buffer[region_size] = '\0';
592-
in.read(buffer, region_size);
593-
594-
std::remove_if(buffer, buffer + region_size,
595-
std::bind2nd(std::equal_to<char>(), '\n'));
596-
buffer[orig_region_size] = '\0';
597-
598-
sequences.push_back(buffer);
599-
std::transform(sequences.back().begin(), sequences.back().end(),
600-
sequences.back().begin(), std::ptr_fun(&toupper));
601-
if (i->neg_strand())
602-
revcomp_inplace(sequences.back());
603-
assert(i->get_width() == sequences.back().length());
604-
}
605-
}
606-
607-
void
608-
extract_regions_fasta(const string &dirname,
609-
const vector<GenomicRegion> &regions_in,
610-
vector<string> &sequences) {
611-
612-
static const string FASTA_SUFFIX(".fa");
613-
assert(check_sorted(regions_in));
614-
615-
vector<string> filenames;
616-
read_dir(dirname, filenames);
617-
618-
vector<vector<GenomicRegion> > regions;
619-
separate_chromosomes(regions_in, regions);
620-
621-
unordered_map<string, size_t> chrom_regions_map;
622-
for (size_t i = 0; i < filenames.size(); ++i)
623-
chrom_regions_map[strip_path(filenames[i])] = i;
624-
625-
for (size_t i = 0; i < regions.size(); ++i) {
626-
// GET THE RIGHT FILE
627-
const string chrom_name(regions[i].front().get_chrom());
628-
const string chrom_file(chrom_name + FASTA_SUFFIX);
629-
unordered_map<string, size_t>::const_iterator f_idx =
630-
chrom_regions_map.find(chrom_file);
631-
if (f_idx == chrom_regions_map.end())
632-
throw SMITHLABException("chrom not found:\t" + chrom_file);
633-
extract_regions_chrom_fasta(chrom_name, filenames[f_idx->second],
634-
regions[i], sequences);
635-
}
636-
}
637-
638-
void
639-
extract_regions_fasta(const string &dirname,
640-
const vector<SimpleGenomicRegion> &regions_in,
641-
vector<string> &sequences) {
642-
643-
static const string FASTA_SUFFIX(".fa");
644-
assert(check_sorted(regions_in));
645-
646-
vector<string> filenames;
647-
read_dir(dirname, filenames);
648-
649-
vector<vector<SimpleGenomicRegion> > regions;
650-
separate_chromosomes(regions_in, regions);
651-
652-
unordered_map<string, size_t> chrom_regions_map;
653-
for (size_t i = 0; i < filenames.size(); ++i)
654-
chrom_regions_map[strip_path(filenames[i])] = i;
655-
656-
for (size_t i = 0; i < regions.size(); ++i) {
657-
// GET THE RIGHT FILE
658-
const string chrom_name(regions[i].front().get_chrom());
659-
const string chrom_file(chrom_name + FASTA_SUFFIX);
660-
unordered_map<string, size_t>::const_iterator f_idx =
661-
chrom_regions_map.find(chrom_file);
662-
if (f_idx == chrom_regions_map.end())
663-
throw SMITHLABException("chrom not found:\t" + chrom_file);
664-
extract_regions_chrom_fasta(chrom_name, filenames[f_idx->second],
665-
regions[i], sequences);
666-
}
667-
}

GenomicRegion.hpp

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -419,11 +419,6 @@ WriteBEDFile(const std::string filename,
419419
out.close();
420420
}
421421

422-
423-
void
424-
parse_region_name(std::string region_name,
425-
std::string &chrom, size_t &start, size_t &end);
426-
427422
template <class T>
428423
std::string
429424
assemble_region_name(const T &region) {
@@ -438,28 +433,5 @@ assemble_region_name(const T &region, const std::string sep) {
438433
smithlab::toa(region.get_end()));
439434
}
440435

441-
void
442-
extract_regions_chrom_fasta(const std::string &chrom_name,
443-
const std::string &filename,
444-
const std::vector<GenomicRegion> &regions,
445-
std::vector<std::string> &sequences);
446-
447-
448-
void
449-
extract_regions_chrom_fasta(const std::string &chrom_name,
450-
const std::string &filename,
451-
const std::vector<SimpleGenomicRegion> &regions,
452-
std::vector<std::string> &sequences);
453-
454-
void
455-
extract_regions_fasta(const std::string &dirname,
456-
const std::vector<SimpleGenomicRegion> &regions_in,
457-
std::vector<std::string> &sequences);
458-
459-
void
460-
extract_regions_fasta(const std::string &dirname,
461-
const std::vector<GenomicRegion> &regions_in,
462-
std::vector<std::string> &sequences);
463-
464436

465437
#endif

0 commit comments

Comments
 (0)