|
| 1 | +#ifndef EDLIB_H |
| 2 | +#define EDLIB_H |
| 3 | + |
| 4 | +/** |
| 5 | + * @file |
| 6 | + * @author Martin Sosic |
| 7 | + * @brief Main header file, containing all public functions and structures. |
| 8 | + */ |
| 9 | + |
| 10 | +// Define EDLIB_API macro to properly export symbols |
| 11 | +#ifdef EDLIB_SHARED |
| 12 | +# ifdef _WIN32 |
| 13 | +# ifdef EDLIB_BUILD |
| 14 | +# define EDLIB_API __declspec(dllexport) |
| 15 | +# else |
| 16 | +# define EDLIB_API __declspec(dllimport) |
| 17 | +# endif |
| 18 | +# else |
| 19 | +# define EDLIB_API __attribute__ ((visibility ("default"))) |
| 20 | +# endif |
| 21 | +#else |
| 22 | +# define EDLIB_API |
| 23 | +#endif |
| 24 | + |
| 25 | +#ifdef __cplusplus |
| 26 | +extern "C" { |
| 27 | +#endif |
| 28 | + |
| 29 | +// Status codes |
| 30 | +#define EDLIB_STATUS_OK 0 |
| 31 | +#define EDLIB_STATUS_ERROR 1 |
| 32 | + |
| 33 | + /** |
| 34 | + * Alignment methods - how should Edlib treat gaps before and after query? |
| 35 | + */ |
| 36 | + typedef enum { |
| 37 | + /** |
| 38 | + * Global method. This is the standard method. |
| 39 | + * Useful when you want to find out how similar is first sequence to second sequence. |
| 40 | + */ |
| 41 | + EDLIB_MODE_NW, |
| 42 | + /** |
| 43 | + * Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized. |
| 44 | + * What that means is that deleting elements from the end of second sequence is "free"! |
| 45 | + * For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end |
| 46 | + * of second sequence is "free" and does not count into total edit distance. This method is appropriate |
| 47 | + * when you want to find out how well first sequence fits at the beginning of second sequence. |
| 48 | + */ |
| 49 | + EDLIB_MODE_SHW, |
| 50 | + /** |
| 51 | + * Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are |
| 52 | + * not penalized. What that means is that deleting elements from the start and end of second sequence is "free"! |
| 53 | + * For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start |
| 54 | + * and GAC from the end of second sequence is "free" and does not count into total edit distance. |
| 55 | + * This method is appropriate when you want to find out how well first sequence fits at any part of |
| 56 | + * second sequence. |
| 57 | + * For example, if your second sequence was a long text and your first sequence was a sentence from that text, |
| 58 | + * but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in |
| 59 | + * that text. In bioinformatics, this method is appropriate for aligning read to a sequence. |
| 60 | + */ |
| 61 | + EDLIB_MODE_HW |
| 62 | + } EdlibAlignMode; |
| 63 | + |
| 64 | + /** |
| 65 | + * Alignment tasks - what do you want Edlib to do? |
| 66 | + */ |
| 67 | + typedef enum { |
| 68 | + EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations. |
| 69 | + EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations. |
| 70 | + EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path. |
| 71 | + } EdlibAlignTask; |
| 72 | + |
| 73 | + /** |
| 74 | + * Describes cigar format. |
| 75 | + * @see http://samtools.github.io/hts-specs/SAMv1.pdf |
| 76 | + * @see http://drive5.com/usearch/manual/cigar.html |
| 77 | + */ |
| 78 | + typedef enum { |
| 79 | + EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'. |
| 80 | + EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'. |
| 81 | + } EdlibCigarFormat; |
| 82 | + |
| 83 | +// Edit operations. |
| 84 | +#define EDLIB_EDOP_MATCH 0 //!< Match. |
| 85 | +#define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query. |
| 86 | +#define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query. |
| 87 | +#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch. |
| 88 | + |
| 89 | + /** |
| 90 | + * @brief Defines two given characters as equal. |
| 91 | + */ |
| 92 | + typedef struct { |
| 93 | + char first; |
| 94 | + char second; |
| 95 | + } EdlibEqualityPair; |
| 96 | + |
| 97 | + /** |
| 98 | + * @brief Configuration object for edlibAlign() function. |
| 99 | + */ |
| 100 | + typedef struct { |
| 101 | + /** |
| 102 | + * Set k to non-negative value to tell edlib that edit distance is not larger than k. |
| 103 | + * Smaller k can significantly improve speed of computation. |
| 104 | + * If edit distance is larger than k, edlib will set edit distance to -1. |
| 105 | + * Set k to negative value and edlib will internally auto-adjust k until score is found. |
| 106 | + */ |
| 107 | + int k; |
| 108 | + |
| 109 | + /** |
| 110 | + * Alignment method. |
| 111 | + * EDLIB_MODE_NW: global (Needleman-Wunsch) |
| 112 | + * EDLIB_MODE_SHW: prefix. Gap after query is not penalized. |
| 113 | + * EDLIB_MODE_HW: infix. Gaps before and after query are not penalized. |
| 114 | + */ |
| 115 | + EdlibAlignMode mode; |
| 116 | + |
| 117 | + /** |
| 118 | + * Alignment task - tells Edlib what to calculate. Less to calculate, faster it is. |
| 119 | + * EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target. |
| 120 | + * EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target. |
| 121 | + * EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target). |
| 122 | + */ |
| 123 | + EdlibAlignTask task; |
| 124 | + |
| 125 | + /** |
| 126 | + * List of pairs of characters, where each pair defines two characters as equal. |
| 127 | + * This way you can extend edlib's definition of equality (which is that each character is equal only |
| 128 | + * to itself). |
| 129 | + * This can be useful if you have some wildcard characters that should match multiple other characters, |
| 130 | + * or e.g. if you want edlib to be case insensitive. |
| 131 | + * Can be set to NULL if there are none. |
| 132 | + */ |
| 133 | + const EdlibEqualityPair* additionalEqualities; |
| 134 | + |
| 135 | + /** |
| 136 | + * Number of additional equalities, which is non-negative number. |
| 137 | + * 0 if there are none. |
| 138 | + */ |
| 139 | + int additionalEqualitiesLength; |
| 140 | + } EdlibAlignConfig; |
| 141 | + |
| 142 | + /** |
| 143 | + * Helper method for easy construction of configuration object. |
| 144 | + * @return Configuration object filled with given parameters. |
| 145 | + */ |
| 146 | + EDLIB_API EdlibAlignConfig edlibNewAlignConfig( |
| 147 | + int k, EdlibAlignMode mode, EdlibAlignTask task, |
| 148 | + const EdlibEqualityPair* additionalEqualities, |
| 149 | + int additionalEqualitiesLength |
| 150 | + ); |
| 151 | + |
| 152 | + /** |
| 153 | + * @return Default configuration object, with following defaults: |
| 154 | + * k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities. |
| 155 | + */ |
| 156 | + EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void); |
| 157 | + |
| 158 | + |
| 159 | + /** |
| 160 | + * Container for results of alignment done by edlibAlign() function. |
| 161 | + */ |
| 162 | + typedef struct { |
| 163 | + /** |
| 164 | + * EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values. |
| 165 | + */ |
| 166 | + int status; |
| 167 | + |
| 168 | + /** |
| 169 | + * -1 if k is non-negative and edit distance is larger than k. |
| 170 | + */ |
| 171 | + int editDistance; |
| 172 | + |
| 173 | + /** |
| 174 | + * Array of zero-based positions in target where optimal alignment paths end. |
| 175 | + * If gap after query is penalized, gap counts as part of query (NW), otherwise not. |
| 176 | + * Set to NULL if edit distance is larger than k. |
| 177 | + * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). |
| 178 | + */ |
| 179 | + int* endLocations; |
| 180 | + |
| 181 | + /** |
| 182 | + * Array of zero-based positions in target where optimal alignment paths start, |
| 183 | + * they correspond to endLocations. |
| 184 | + * If gap before query is penalized, gap counts as part of query (NW), otherwise not. |
| 185 | + * Set to NULL if not calculated or if edit distance is larger than k. |
| 186 | + * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). |
| 187 | + */ |
| 188 | + int* startLocations; |
| 189 | + |
| 190 | + /** |
| 191 | + * Number of end (and start) locations. |
| 192 | + */ |
| 193 | + int numLocations; |
| 194 | + |
| 195 | + /** |
| 196 | + * Alignment is found for first pair of start and end locations. |
| 197 | + * Set to NULL if not calculated. |
| 198 | + * Alignment is sequence of numbers: 0, 1, 2, 3. |
| 199 | + * 0 stands for match. |
| 200 | + * 1 stands for insertion to target. |
| 201 | + * 2 stands for insertion to query. |
| 202 | + * 3 stands for mismatch. |
| 203 | + * Alignment aligns query to target from beginning of query till end of query. |
| 204 | + * If gaps are not penalized, they are not in alignment. |
| 205 | + * If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free(). |
| 206 | + */ |
| 207 | + unsigned char* alignment; |
| 208 | + |
| 209 | + /** |
| 210 | + * Length of alignment. |
| 211 | + */ |
| 212 | + int alignmentLength; |
| 213 | + |
| 214 | + /** |
| 215 | + * Number of different characters in query and target together. |
| 216 | + */ |
| 217 | + int alphabetLength; |
| 218 | + } EdlibAlignResult; |
| 219 | + |
| 220 | + /** |
| 221 | + * Frees memory in EdlibAlignResult that was allocated by edlib. |
| 222 | + * If you do not use it, make sure to free needed members manually using free(). |
| 223 | + */ |
| 224 | + EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result); |
| 225 | + |
| 226 | + |
| 227 | + /** |
| 228 | + * Aligns two sequences (query and target) using edit distance (levenshtein distance). |
| 229 | + * Through config parameter, this function supports different alignment methods (global, prefix, infix), |
| 230 | + * as well as different modes of search (tasks). |
| 231 | + * It always returns edit distance and end locations of optimal alignment in target. |
| 232 | + * It optionally returns start locations of optimal alignment in target and alignment path, |
| 233 | + * if you choose appropriate tasks. |
| 234 | + * @param [in] query First sequence. |
| 235 | + * @param [in] queryLength Number of characters in first sequence. |
| 236 | + * @param [in] target Second sequence. |
| 237 | + * @param [in] targetLength Number of characters in second sequence. |
| 238 | + * @param [in] config Additional alignment parameters, like alignment method and wanted results. |
| 239 | + * @return Result of alignment, which can contain edit distance, start and end locations and alignment path. |
| 240 | + * Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members. |
| 241 | + */ |
| 242 | + EDLIB_API EdlibAlignResult edlibAlign( |
| 243 | + const char* query, int queryLength, |
| 244 | + const char* target, int targetLength, |
| 245 | + const EdlibAlignConfig config |
| 246 | + ); |
| 247 | + |
| 248 | + |
| 249 | + /** |
| 250 | + * Builds cigar string from given alignment sequence. |
| 251 | + * @param [in] alignment Alignment sequence. |
| 252 | + * 0 stands for match. |
| 253 | + * 1 stands for insertion to target. |
| 254 | + * 2 stands for insertion to query. |
| 255 | + * 3 stands for mismatch. |
| 256 | + * @param [in] alignmentLength |
| 257 | + * @param [in] cigarFormat Cigar will be returned in specified format. |
| 258 | + * @return Cigar string. |
| 259 | + * I stands for insertion. |
| 260 | + * D stands for deletion. |
| 261 | + * X stands for mismatch. (used only in extended format) |
| 262 | + * = stands for match. (used only in extended format) |
| 263 | + * M stands for (mis)match. (used only in standard format) |
| 264 | + * String is null terminated. |
| 265 | + * Needed memory is allocated and given pointer is set to it. |
| 266 | + * Do not forget to free it later using free()! |
| 267 | + */ |
| 268 | + EDLIB_API char* edlibAlignmentToCigar( |
| 269 | + const unsigned char* alignment, int alignmentLength, |
| 270 | + EdlibCigarFormat cigarFormat |
| 271 | + ); |
| 272 | + |
| 273 | +#ifdef __cplusplus |
| 274 | +} |
| 275 | +#endif |
| 276 | + |
| 277 | +#endif // EDLIB_H |
0 commit comments