1616#include < omp.h>
1717#endif
1818
19+ #ifndef SIZE_T_MAX
20+ #define SIZE_T_MAX ((size_t ) -1 )
21+ #endif
22+
23+
1924#define ABS_DIFF (a, b ) ( ((a) > (b)) ? ((a) - (b)) : ((b) - (a)) )
2025
2126class UniProtConverter {
2227public:
23- size_t to_structured_number (std::string &uniprot_id ) const {
24- if (uniprot_id .find (' -' ) != std::string::npos) {
25- uniprot_id = uniprot_id .substr (0 , uniprot_id .find (' -' ));
28+ size_t toStructuredNumber (std::string &uniprotId ) const {
29+ if (uniprotId .find (' -' ) != std::string::npos) {
30+ uniprotId = uniprotId .substr (0 , uniprotId .find (' -' ));
2631 }
2732
28- if (uniprot_id .empty ()) return 0 ;
33+ if (uniprotId .empty ()) return 0 ;
2934
30- const size_t len = uniprot_id .length ();
31- const char first_char = static_cast <char >(std::toupper (uniprot_id [0 ]));
35+ const size_t len = uniprotId .length ();
36+ const char firstChar = static_cast <char >(std::toupper (uniprotId [0 ]));
3237
33- if (len == 6 && (first_char == ' O' || first_char == ' P' || first_char == ' Q' )) {
34- return convertOpqPattern (uniprot_id );
38+ if (len == 6 && (firstChar == ' O' || firstChar == ' P' || firstChar == ' Q' )) {
39+ return convertOpqPattern (uniprotId );
3540 }
3641
3742 if ((len == 6 || len == 10 )) {
38- return convertAnrzPattern (uniprot_id );
43+ return convertAnrzPattern (uniprotId );
3944 }
4045
41- if (uniprot_id [0 ]== ' U' && uniprot_id [1 ]== ' P' && uniprot_id [2 ] ==' I' ) {
42- std::string hex_part (uniprot_id .substr (3 ));
46+ if (uniprotId [0 ] == ' U' && uniprotId [1 ] == ' P' && uniprotId [2 ] == ' I' ) {
47+ std::string hex_part (uniprotId .substr (3 ));
4348 return std::stoll (hex_part, nullptr , 16 );
4449 }
4550
46- return 0 ; // Invalid length or pattern
51+ return 0 ;
4752 }
4853
4954private:
@@ -52,16 +57,18 @@ class UniProtConverter {
5257 size_t multiplier = 1 ;
5358
5459 for (int i = 5 ; i >= 0 ; --i) {
55- const char current_char = static_cast <char >(std::toupper (id[i]));
56- int char_val = -1 ;
60+ const char currentChar = static_cast <char >(std::toupper (id[i]));
61+ int charVal = -1 ;
5762 int radix = 0 ;
5863 switch (i) {
59- case 0 : char_val = getOpqValue (current_char ); radix = 3 ; break ;
60- case 1 : case 5 : char_val = getDigitValue (current_char ); radix = 10 ; break ;
61- case 2 : case 3 : case 4 : char_val = getAlphanumValue (current_char ); radix = 36 ; break ;
64+ case 0 : charVal = getOpqValue (currentChar ); radix = 3 ; break ;
65+ case 1 : case 5 : charVal = getDigitValue (currentChar ); radix = 10 ; break ;
66+ case 2 : case 3 : case 4 : charVal = getAlphanumValue (currentChar ); radix = 36 ; break ;
6267 }
63- if (char_val == -1 ) return 0 ;
64- number += static_cast <size_t >(char_val) * multiplier;
68+ if (charVal == -1 ) {
69+ return 0 ;
70+ }
71+ number += static_cast <size_t >(charVal) * multiplier;
6572 multiplier *= radix;
6673 }
6774 return number;
@@ -73,30 +80,30 @@ class UniProtConverter {
7380 size_t len = id.length ();
7481
7582 for (int i = len - 1 ; i >= 0 ; --i) {
76- const char current_char = static_cast <char >(std::toupper (id[i]));
77- int char_val = -1 ;
83+ const char currentChar = static_cast <char >(std::toupper (id[i]));
84+ int charVal = -1 ;
7885 int radix = 0 ;
7986
8087 // --- Logic using a switch statement ---
8188 switch (i) {
8289 case 0 :
83- char_val = getAnrzValue (current_char ); radix = 23 ;
90+ charVal = getAnrzValue (currentChar ); radix = 23 ;
8491 break ;
8592 case 1 : case 5 : case 9 :
86- char_val = getDigitValue (current_char ); radix = 10 ;
93+ charVal = getDigitValue (currentChar ); radix = 10 ;
8794 break ;
8895 case 2 : case 6 :
89- char_val = getAlphaValue (current_char ); radix = 26 ;
96+ charVal = getAlphaValue (currentChar ); radix = 26 ;
9097 break ;
9198 case 3 : case 4 : case 7 : case 8 :
92- char_val = getAlphanumValue (current_char ); radix = 36 ;
99+ charVal = getAlphanumValue (currentChar ); radix = 36 ;
93100 break ;
94101 default :
95102 return 0 ;
96103 }
97- if (char_val == -1 ) return 0 ;
104+ if (charVal == -1 ) return 0 ;
98105
99- number += static_cast <size_t >(char_val ) * multiplier;
106+ number += static_cast <size_t >(charVal ) * multiplier;
100107 multiplier *= radix;
101108 }
102109 return number;
@@ -127,48 +134,36 @@ struct CompareByTaxon {
127134size_t findNearestPartner (
128135 const unsigned int taxon_id, const Matcher::result_t & query, const std::vector<Matcher::result_t >& results2)
129136{
130- // Use a typedef to simplify the long iterator type name, a common C++98 practice.
131137 typedef std::vector<Matcher::result_t >::const_iterator ResultConstIterator;
132-
133- // 1. Find the sub-range for the given taxon in both vectors.
134138 std::pair<ResultConstIterator, ResultConstIterator> range;
135139 range = std::equal_range (results2.begin (), results2.end (), taxon_id, CompareByTaxon ());
136140
137- // If the taxon isn't in both lists, no pair is possible.
138141 if (range.first == range.second ) {
139- // Use std::make_pair instead of modern brace initialization.
140142 return SIZE_T_MAX;
141143 }
142144
143145 size_t bestIdx = SIZE_T_MAX;
144146 size_t min_dist = SIZE_T_MAX;
145147
146-
147- // 2. For each result in the first taxon range...
148- uint64_t query_uniprot_num = CompareUniProt::getUniProtNumber (query);
149-
150- // 3. ...find the closest partner in the second taxon range using binary search.
148+ size_t query_uniprot_num = CompareUniProt::getUniProtNumber (query);
151149 ResultConstIterator it2 = std::lower_bound (range.first , range.second , query_uniprot_num, CompareUniProt ());
152-
153- // 4. Check the element at the found position.
154150 if (it2 != range.second ) {
155- uint64_t target_uniprot_num = CompareUniProt::getUniProtNumber (*it2);
156- // Calculate absolute difference for unsigned integers without using std::abs on signed types.
157- uint64_t dist = ABS_DIFF (target_uniprot_num, query_uniprot_num);
151+ size_t target_uniprot_num = CompareUniProt::getUniProtNumber (*it2);
152+ size_t dist = ABS_DIFF (target_uniprot_num, query_uniprot_num);
158153
159154 if (dist < min_dist) {
160155 min_dist = dist;
161156 bestIdx = std::distance (results2.begin (), it2);
162157 }
163158 }
164159
165- // 5. Check the element just before the found position.
160+ // Check the element just before the found position.
166161 if (it2 != range.first ) {
167162 ResultConstIterator prev_it2 = it2;
168163 --prev_it2; // Use pre-decrement to get the previous iterator.
169164
170- uint64_t target_uniprot_num = CompareUniProt::getUniProtNumber (*prev_it2);
171- uint64_t dist = ABS_DIFF (query_uniprot_num, target_uniprot_num);
165+ size_t target_uniprot_num = CompareUniProt::getUniProtNumber (*prev_it2);
166+ size_t dist = ABS_DIFF (query_uniprot_num, target_uniprot_num);
172167
173168 if (dist < min_dist) {
174169 bestIdx = std::distance (results2.begin (), prev_it2);
@@ -177,18 +172,15 @@ size_t findNearestPartner(
177172 return bestIdx;
178173}
179174
180- // need for sorting the results
181175static bool compareByTaxId (const Matcher::result_t &first, const Matcher::result_t &second) {
182176 return (first.dbOrfStartPos < second.dbOrfStartPos );
183177}
184178
185179static bool compareByTaxIdAndUniProtNum (const Matcher::result_t &first, const Matcher::result_t &second) {
186- // 1. Primary Sort Key: Compare the taxon ID.
187180 if (first.dbOrfStartPos != second.dbOrfStartPos ) {
188181 return first.dbOrfStartPos < second.dbOrfStartPos ;
189182 }
190183
191- // 2. Secondary Sort Key: Compare the 64-bit UniProt number.
192184 if (first.queryOrfStartPos != second.queryOrfStartPos ) {
193185 return first.queryOrfStartPos < second.queryOrfStartPos ;
194186 }
@@ -212,12 +204,14 @@ int pairaln(int argc, const char **argv, const Command& command) {
212204 fileToIds[lookup[i].fileNumber ].push_back (lookup[i].id );
213205 }
214206 IndexReader *targetHeaderReaderIdx = NULL ;
215- uint16_t extended = DBReader<unsigned int >::getExtendedDbtype (FileUtil::parseDbType (par.db3 .c_str ()));
216- bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
217- targetHeaderReaderIdx = new IndexReader (par.db2 , par.threads ,
218- extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC ? IndexReader::SRC_HEADERS : IndexReader::HEADERS,
219- (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 );
220-
207+ if (par.pairfilter == Parameters::PAIRALN_FILTER_PROXIMITY) {
208+ uint16_t extended = DBReader<unsigned int >::getExtendedDbtype (FileUtil::parseDbType (par.db3 .c_str ()));
209+ bool touch = (par.preloadMode != Parameters::PRELOAD_MODE_MMAP);
210+ targetHeaderReaderIdx = new IndexReader (par.db2 , par.threads ,
211+ extended & Parameters::DBTYPE_EXTENDED_INDEX_NEED_SRC
212+ ? IndexReader::SRC_HEADERS : IndexReader::HEADERS,
213+ (touch) ? (IndexReader::PRELOAD_INDEX | IndexReader::PRELOAD_DATA) : 0 );
214+ }
221215
222216 std::string db2NoIndexName = PrefilteringIndexReader::dbPathWithoutIndex (par.db2 );
223217 MappingReader* mapping = new MappingReader (db2NoIndexName);
@@ -315,11 +309,10 @@ int pairaln(int argc, const char **argv, const Command& command) {
315309 unsigned int taxon = mapping->lookup (resultPerId[i][resIdx].dbKey );
316310 // we don't want to introduce a new field, reuse existing unused field here
317311 resultPerId[i][resIdx].dbOrfStartPos = taxon;
318-
319312 size_t headerId = targetHeaderReaderIdx->sequenceReader ->getId (resultPerId[i][resIdx].dbKey );
320313 char *headerData = targetHeaderReaderIdx->sequenceReader ->getData (headerId, thread_idx);
321314 std::string targetAccession = Util::parseFastaHeader (headerData);
322- size_t uniProtNumber = converter.to_structured_number (targetAccession);
315+ size_t uniProtNumber = converter.toStructuredNumber (targetAccession);
323316 resultPerId[i][resIdx].queryOrfStartPos = static_cast <int >(uniProtNumber >> 32 );
324317 resultPerId[i][resIdx].queryOrfEndPos = static_cast <int >(uniProtNumber & 0xFFFFFFFF );
325318 }
@@ -417,7 +410,6 @@ int pairaln(int argc, const char **argv, const Command& command) {
417410
418411
419412 unsigned int prevTaxon = UINT_MAX;
420- // iterate over taxonToPair
421413 size_t resIdxStart = 0 ;
422414 for (size_t taxonInList: taxonToPair) {
423415 bool taxonFound = false ;
@@ -453,10 +445,14 @@ int pairaln(int argc, const char **argv, const Command& command) {
453445 }
454446 resultWriter.close ();
455447 qdbr.close ();
456- // clean up
448+ // clean up
457449 delete mapping;
458- delete targetHeaderReaderIdx;
450+ if (par.pairfilter == Parameters::PAIRALN_FILTER_PROXIMITY) {
451+ delete targetHeaderReaderIdx;
452+ }
459453 alnDbr.close ();
460454 return EXIT_SUCCESS;
461455}
462456
457+ #undef ABS_DIFF
458+ #undef SIZE_T_MAX
0 commit comments