Skip to content

Commit b391683

Browse files
authored
Beta 19 (#61)
* init commit
* Storing p values as doubles; created a score-to-p-value function (not log space); changed how sequence names are written to out
* First draft of TabFileWriter written
* TabFileWriter is passing internal tests
* Multiple outputs and new output formats seem to be working
* Added show_seq support to json and ultra formats, removed score from BED
1 parent a32e775 commit b391683

15 files changed

+354
-123
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ set(
3232
src/umodel.hpp
3333
src/umodeldriver.hpp
3434
src/BEDFileWriter.hpp
35+
src/TabFileWriter.hpp
3536
src/RepeatFileWriter.hpp
3637
src/JSONFileWriter.hpp
3738
src/RepeatSplitter.hpp
@@ -52,10 +53,11 @@ set(
5253
src/umodel.cpp
5354
src/umodeldriver.cpp
5455
src/BEDFileWriter.cpp
56+
src/TabFileWriter.cpp
5557
src/JSONFileWriter.cpp
5658
src/RepeatSplitter.cpp
5759
src/mask.cpp
58-
src/cli.cpp)
60+
src/cli.cpp src/TabFileWriter.hpp)
5961

6062
find_package(Threads REQUIRED)
6163

src/BEDFileWriter.cpp

Lines changed: 10 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
#include "ultra.hpp"
88
#include <algorithm>
99
#include <iostream>
10-
void BEDFileWriter::InitializeWriter(Ultra *ultra) { owner = ultra; }
10+
// Stores the owning Ultra instance and the stream that WriteRepeat prints to.
// Nothing is written here: the BED output produced by this writer has no
// header line.
void BEDFileWriter::InitializeWriter(Ultra *ultra, FILE *out_file) {
  out = out_file;
  owner = ultra;
}
1111

1212
void BEDFileWriter::WriteRepeat(RepeatRegion *repeat) {
1313

@@ -16,69 +16,29 @@ void BEDFileWriter::WriteRepeat(RepeatRegion *repeat) {
1616
for (int i = 0; i < name.size(); ++i) {
1717
if ((name[i] >= 'a' && name[i] <= 'z') ||
1818
(name[i] >= 'A' && name[i] <= 'Z') ||
19-
(name[i] >= '0' && name[i] <= '9')) {
19+
(name[i] >= '0' && name[i] <= '9') || name[i] == '-' ||
20+
name[i] == '_' || name[i] == '.' || name[i] == ':' || name[i] == '*' ||
21+
name[i] == '#') {
2022
continue;
2123
}
2224

23-
else if (name[i] == ' ') {
24-
name[i] = '\0';
25-
}
26-
2725
else {
28-
name[i] = '_';
26+
name = name.substr(0, i);
27+
break;
2928
}
3029
}
3130

3231
// Columns 1 (name) 2(start) 3 (end)
33-
fprintf(owner->out, "%s\t%lu\t%lu", name.c_str(), repeat->sequenceStart,
32+
fprintf(out, "%s\t%lu\t%lu", name.c_str(), repeat->sequenceStart,
3433
repeat->sequenceStart + repeat->repeatLength);
3534

3635
// We need to decide what to do with the overall sequence
3736

38-
std::string rep_con = ".";
39-
if (!repeat->string_consensus.empty())
37+
std::string rep_con = std::to_string(repeat->repeatPeriod);
38+
if (owner->settings->max_consensus_period >= repeat->repeatPeriod && !repeat->string_consensus.empty())
4039
rep_con = repeat->string_consensus;
41-
// Columns 4 (name) 5 (score) 6 (strand=.) 7 thickstart 8 thickend 9 rgb
42-
fprintf(owner->out, "\t%s\t%f\t.\t%lu\t%lu\t0,0,0\t", rep_con.c_str(), repeat->regionScore,
43-
repeat->sequenceStart, repeat->sequenceStart + repeat->repeatLength);
44-
45-
if (owner->settings->max_split > 0) {
46-
47-
std::string sizes = "";
48-
std::string starts = "0";
49-
int numberOfValidSplits = 0;
50-
51-
int cstart = 0;
52-
if (repeat->splits != nullptr && !repeat->splits->empty()) {
53-
for (int i = 0; i < repeat->splits->size(); ++i) {
54-
int split_i = repeat->splits->at(i);
55-
if (split_i > 0) {
56-
if (numberOfValidSplits > 0)
57-
sizes.push_back(',');
58-
sizes += std::to_string(split_i - cstart);
59-
starts.push_back(',');
60-
starts += std::to_string(split_i);
61-
62-
++numberOfValidSplits;
63-
cstart = split_i;
64-
}
65-
}
66-
67-
if (numberOfValidSplits > 0)
68-
sizes.push_back(',');
69-
sizes += std::to_string(repeat->repeatLength - cstart);
70-
71-
fprintf(owner->out, "%i\t%s\t%s", numberOfValidSplits + 1, sizes.c_str(),
72-
starts.c_str());
73-
74-
}
75-
76-
else {
77-
fprintf(owner->out, "1\t%lu\t0", repeat->repeatLength);
78-
}
79-
}
8040

81-
fprintf(owner->out, "\n");
41+
fprintf(out, "\t%s\n", rep_con.c_str());
8242
}
8343

8444
// Called when output is finished; this writer emits nothing at the end.
void BEDFileWriter::EndWriter() {}

src/BEDFileWriter.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ class Ultra;
1111
class BEDFileWriter : virtual public RepeatFileWriter {
1212
public:
1313
Ultra *owner;
14-
14+
FILE *out;
1515
bool outputMultilineSplitRepeats = false;
1616

17-
void InitializeWriter(Ultra *ultra);
17+
void InitializeWriter(Ultra *ultra, FILE *out);
1818
void WriteRepeat(RepeatRegion *repeat);
1919
void EndWriter();
2020
};

src/JSONFileWriter.cpp

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,15 @@
99
// Appends one more member to the JSON object currently being written: a
// leading ",\n", the quoted key, then the value. When `quotes` is true the
// value is wrapped in double quotes; otherwise it is printed bare. Neither
// key nor value is escaped here, so callers must pass text that is already
// valid inside a JSON document.
void JSONFileWriter::OutputJSONKeyValue(std::string key, std::string value,
                                        bool quotes) {
  if (quotes) {
    fprintf(out, ",\n\"%s\": \"%s\"", key.c_str(), value.c_str());
    return;
  }

  fprintf(out, ",\n\"%s\": %s", key.c_str(), value.c_str());
}
1616

17-
void JSONFileWriter::InitializeWriter(Ultra *ultra) {
17+
void JSONFileWriter::InitializeWriter(Ultra *ultra, FILE *out_f) {
1818
owner = ultra;
19-
fprintf(owner->out, "{\"Repeats\": [\n");
19+
out = out_f;
20+
fprintf(out, "{\"Repeats\": [\n");
2021
}
2122

2223
std::string JSONFileWriter::StringForSubRepeat(RepeatRegion *r, int split_index,
@@ -77,33 +78,29 @@ std::string JSONFileWriter::SubRepeatsString(RepeatRegion *r) {
7778
void JSONFileWriter::WriteRepeat(RepeatRegion *repeat) {
7879

7980
if (this->repeatsOutput > 0) {
80-
fprintf(owner->out, ",\n\n");
81+
fprintf(out, ",\n\n");
8182
}
8283

8384
++this->repeatsOutput;
8485

85-
fprintf(owner->out, "{\"SequenceName\": \"%s\"",
86+
fprintf(out, "{\"SequenceName\": \"%s\"",
8687
repeat->sequenceName.c_str());
8788

8889
this->OutputJSONKeyValue("Start", std::to_string(repeat->sequenceStart));
8990
this->OutputJSONKeyValue("Length", std::to_string(repeat->repeatLength));
9091
this->OutputJSONKeyValue("Period", std::to_string(repeat->repeatPeriod));
9192
this->OutputJSONKeyValue("Score", std::to_string(repeat->regionScore));
9293
if (owner->settings->pval) {
93-
this->OutputJSONKeyValue("Log2PVal", std::to_string(repeat->logPVal));
94+
double pval = owner->PvalForScore(repeat->regionScore);
95+
this->OutputJSONKeyValue("PVal", std::to_string(pval));
9496
}
9597

9698
this->OutputJSONKeyValue("Substitutions", std::to_string(repeat->mismatches));
9799
this->OutputJSONKeyValue("Insertions", std::to_string(repeat->insertions));
98100
this->OutputJSONKeyValue("Deletions", std::to_string(repeat->deletions));
99101
this->OutputJSONKeyValue("Consensus", repeat->string_consensus, true);
100102

101-
if (owner->outputReadID) {
102-
this->OutputJSONKeyValue("ReadID", std::to_string(repeat->readID));
103-
this->OutputJSONKeyValue("OC", std::to_string(repeat->overlapCorrection));
104-
}
105-
106-
if (owner->outputRepeatSequence) {
103+
if (owner->settings->show_seq) {
107104
this->OutputJSONKeyValue("Sequence", repeat->sequence, true);
108105
}
109106

@@ -147,7 +144,7 @@ void JSONFileWriter::WriteRepeat(RepeatRegion *repeat) {
147144
std::string subRepeats = SubRepeatsString(repeat);
148145
this->OutputJSONKeyValue("SubRepeats", subRepeats);
149146
}
150-
fprintf(owner->out, "}");
147+
fprintf(out, "}");
151148
}
152149

153-
void JSONFileWriter::EndWriter() { fprintf(owner->out, "]\n}"); }
150+
// Closes the "Repeats" array and the top-level object opened by
// InitializeWriter.
void JSONFileWriter::EndWriter() {
  fputs("]\n}", out);
}

src/JSONFileWriter.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ class RepeatRegion;
1111
class Ultra;
1212
class JSONFileWriter : virtual public RepeatFileWriter {
1313
int repeatsOutput = 0;
14-
14+
FILE *out;
1515
void OutputJSONKeyValue(std::string key, std::string value,
1616
bool quotes = false);
1717

1818
public:
1919
Ultra *owner;
20-
void InitializeWriter(Ultra *ultra);
20+
void InitializeWriter(Ultra *ultra, FILE *out);
2121
void WriteRepeat(RepeatRegion *repeat);
2222
void EndWriter();
2323
std::string StringForSubRepeat(RepeatRegion *r, int split_index,

src/RepeatFileWriter.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@
44

55
#ifndef ULTRA_REPEATFILEWRITER_HPP
66
#define ULTRA_REPEATFILEWRITER_HPP
7-
7+
#include <stdio.h>
88
class Ultra;
class RepeatRegion;

// Interface implemented by every output-format writer. All hooks default to
// doing nothing, so a concrete writer overrides only the ones it needs.
class RepeatFileWriter {
public:
  // Writers are used polymorphically; a virtual destructor makes it safe to
  // destroy a concrete writer through a RepeatFileWriter pointer.
  virtual ~RepeatFileWriter() = default;

  // Called once before any repeat is written; `out` is the destination stream.
  virtual void InitializeWriter(Ultra *ultra, FILE *out) {}
  // Called once per repeat to be written.
  virtual void WriteRepeat(RepeatRegion *repeat) {}
  // Called once after the last repeat has been written.
  virtual void EndWriter() {}
};

src/TabFileWriter.cpp

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
//
2+
// Created by Daniel Olson on 08/07/24.
3+
//
4+
5+
#include "TabFileWriter.hpp"
6+
#include "repeat.hpp"
7+
#include "ultra.hpp"
8+
#include <algorithm>
9+
#include <iostream>
10+
11+
// Stores the owning Ultra instance and the output stream, then prints the
// header line. Optional column names are emitted only when the matching
// setting is enabled, using the same conditions WriteRepeat applies to the
// data fields. Note that "PValue" is attached to "Score" with a comma rather
// than a tab, mirroring the ",%g" that WriteRepeat prints after the score.
void TabFileWriter::InitializeWriter(Ultra *ultra, FILE *out_f) {
  out = out_f;
  owner = ultra;

  const bool with_consensus = owner->settings->max_consensus_period != 0;
  auto emit = [this](const char *text) { fputs(text, out); };

  emit("SeqID");
  emit("\tStart");
  emit("\tEnd");
  emit("\tPeriod");
  emit("\tScore");

  if (owner->settings->pval) {
    emit(",PValue");
  }

  if (with_consensus) {
    emit("\tConsensus");
  }

  if (owner->settings->max_split > 0) {
    emit("\t#Subrepeats");
    emit("\tSubrepeatStarts");
    if (with_consensus) {
      emit("\tSubrepeatConsensi");
    }
  }

  if (owner->settings->show_seq) {
    emit("\tSequence");
  }

  emit("\n");
}
37+
38+
// Writes one tab-separated record for `repeat` to `out`.
//
// Fields, in order: sequence name, start, end (start + length), period and
// score. Optional fields follow, each gated by the same setting that gates
// its header text in InitializeWriter: the p-value (appended to the score
// after a comma), the consensus, the sub-repeat count / starts / consensi,
// and the repeat sequence.
void TabFileWriter::WriteRepeat(RepeatRegion *repeat) {
  const bool show_consensus = owner->settings->max_consensus_period != 0;
  const bool period_has_consensus =
      owner->settings->max_consensus_period >= repeat->repeatPeriod;

  // We need a better behavior here - check with Travis
  // Keep the leading run of letters, digits and - _ . : * # ; the name is cut
  // at the first character outside that set.
  static const char kNameChars[] = "abcdefghijklmnopqrstuvwxyz"
                                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                   "0123456789"
                                   "-_.:*#";
  std::string name = repeat->sequenceName;
  const std::string::size_type cut = name.find_first_not_of(kNameChars);
  if (cut != std::string::npos) {
    name.erase(cut);
  }

  // Columns 1 (name) 2 (start) 3 (end) 4 (period) 5 (score)
  fprintf(out, "%s\t%lu\t%lu\t%i\t%f", name.c_str(), repeat->sequenceStart,
          repeat->sequenceStart + repeat->repeatLength, repeat->repeatPeriod,
          repeat->regionScore);
  if (owner->settings->pval) {
    fprintf(out, ",%g", owner->PvalForScore(repeat->regionScore));
  }

  // "." is printed when no consensus is available or when the repeat's period
  // exceeds max_consensus_period.
  std::string rep_con = ".";
  if (show_consensus) {
    if (period_has_consensus && !repeat->string_consensus.empty()) {
      rep_con = repeat->string_consensus;
    }

    fprintf(out, "\t%s", rep_con.c_str());
  }

  if (owner->settings->max_split > 0) {
    if (repeat->splits != nullptr && !repeat->splits->empty()) {
      // The first sub-repeat always starts at offset 0; every positive split
      // position starts another one. Non-positive entries are ignored.
      std::string starts = "0";
      int numberOfValidSplits = 0;
      for (const int split_i : *repeat->splits) {
        if (split_i > 0) {
          starts.push_back(',');
          starts += std::to_string(split_i);
          ++numberOfValidSplits;
        }
      }

      // Test the pointer before the loop condition dereferences it; a null
      // consensi list yields an empty consensi field instead of a crash.
      std::string consensi = "";
      if (show_consensus && repeat->consensi != nullptr) {
        for (std::size_t i = 0; i < repeat->consensi->size(); ++i) {
          std::string con = ".";
          if (period_has_consensus) {
            con = repeat->consensi->at(i);
          }
          if (i > 0) {
            consensi.push_back(',');
          }
          consensi += con;
        }
      }

      fprintf(out, "\t%i\t%s", numberOfValidSplits + 1, starts.c_str());
      if (show_consensus) {
        fprintf(out, "\t%s", consensi.c_str());
      }
    }

    else {
      // No split information: report the whole repeat as a single sub-repeat
      // starting at offset 0 and reuse the overall consensus for it.
      fprintf(out, "\t1\t0");
      if (show_consensus) {
        fprintf(out, "\t%s", rep_con.c_str());
      }
    }
  }

  if (owner->settings->show_seq) {
    fprintf(out, "\t%s", repeat->sequence.c_str());
  }
  fprintf(out, "\n");
}
141+
142+
// Called when output is finished; this writer emits nothing at the end.
void TabFileWriter::EndWriter() {}

src/TabFileWriter.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
//
2+
// Created by Daniel Olson on 08/07/24.
3+
//
4+
5+
#ifndef ULTRA_TABFILEWRITER_HPP
6+
#define ULTRA_TABFILEWRITER_HPP
7+
#include "RepeatFileWriter.hpp"
8+
9+
class RepeatRegion;
10+
class Ultra;
11+
class TabFileWriter : virtual public RepeatFileWriter {
12+
public:
13+
Ultra *owner;
14+
FILE *out;
15+
16+
void InitializeWriter(Ultra *ultra, FILE *out);
17+
void WriteRepeat(RepeatRegion *repeat);
18+
void EndWriter();
19+
};
20+
21+
#endif // ULTRA_TABFILEWRITER_HPP

0 commit comments

Comments
 (0)