Skip to content

Commit 6ccc6e2

Browse files
committed
add edlib
1 parent 880cb91 commit 6ccc6e2

File tree

4 files changed

+1763
-6
lines changed

4 files changed

+1763
-6
lines changed

.gitmodules

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,4 @@
1111
[submodule "WFA2-lib"]
1212
path = WFA2-lib
1313
url = https://github.com/smarco/WFA2-lib.git
14-
ignore = dirty
15-
16-
[submodule "edlib"]
17-
path = edlib
18-
url = https://github.com/Martinsos/edlib.git
14+
ignore = dirty

edlib

Lines changed: 0 additions & 1 deletion
This file was deleted.

edlib/include/edlib.h

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
#ifndef EDLIB_H
2+
#define EDLIB_H
3+
4+
/**
5+
* @file
6+
* @author Martin Sosic
7+
* @brief Main header file, containing all public functions and structures.
8+
*/
9+
10+
// Define EDLIB_API macro to properly export symbols
11+
#ifdef EDLIB_SHARED
12+
# ifdef _WIN32
13+
# ifdef EDLIB_BUILD
14+
# define EDLIB_API __declspec(dllexport)
15+
# else
16+
# define EDLIB_API __declspec(dllimport)
17+
# endif
18+
# else
19+
# define EDLIB_API __attribute__ ((visibility ("default")))
20+
# endif
21+
#else
22+
# define EDLIB_API
23+
#endif
24+
25+
#ifdef __cplusplus
26+
extern "C" {
27+
#endif
28+
29+
// Status codes
30+
#define EDLIB_STATUS_OK 0
31+
#define EDLIB_STATUS_ERROR 1
32+
33+
/**
34+
* Alignment methods - how should Edlib treat gaps before and after query?
35+
*/
36+
typedef enum {
37+
/**
38+
* Global method. This is the standard method.
39+
* Useful when you want to find out how similar is first sequence to second sequence.
40+
*/
41+
EDLIB_MODE_NW,
42+
/**
43+
* Prefix method. Similar to global method, but with a small twist - gap at query end is not penalized.
44+
* What that means is that deleting elements from the end of second sequence is "free"!
45+
* For example, if we had "AACT" and "AACTGGC", edit distance would be 0, because removing "GGC" from the end
46+
* of second sequence is "free" and does not count into total edit distance. This method is appropriate
47+
* when you want to find out how well first sequence fits at the beginning of second sequence.
48+
*/
49+
EDLIB_MODE_SHW,
50+
/**
51+
* Infix method. Similar as prefix method, but with one more twist - gaps at query end and start are
52+
* not penalized. What that means is that deleting elements from the start and end of second sequence is "free"!
53+
* For example, if we had ACT and CGACTGAC, edit distance would be 0, because removing CG from the start
54+
* and GAC from the end of second sequence is "free" and does not count into total edit distance.
55+
* This method is appropriate when you want to find out how well first sequence fits at any part of
56+
* second sequence.
57+
* For example, if your second sequence was a long text and your first sequence was a sentence from that text,
58+
* but slightly scrambled, you could use this method to discover how scrambled it is and where it fits in
59+
* that text. In bioinformatics, this method is appropriate for aligning read to a sequence.
60+
*/
61+
EDLIB_MODE_HW
62+
} EdlibAlignMode;
63+
64+
/**
65+
* Alignment tasks - what do you want Edlib to do?
66+
*/
67+
typedef enum {
68+
EDLIB_TASK_DISTANCE, //!< Find edit distance and end locations.
69+
EDLIB_TASK_LOC, //!< Find edit distance, end locations and start locations.
70+
EDLIB_TASK_PATH //!< Find edit distance, end locations and start locations and alignment path.
71+
} EdlibAlignTask;
72+
73+
/**
74+
* Describes cigar format.
75+
* @see http://samtools.github.io/hts-specs/SAMv1.pdf
76+
* @see http://drive5.com/usearch/manual/cigar.html
77+
*/
78+
typedef enum {
79+
EDLIB_CIGAR_STANDARD, //!< Match: 'M', Insertion: 'I', Deletion: 'D', Mismatch: 'M'.
80+
EDLIB_CIGAR_EXTENDED //!< Match: '=', Insertion: 'I', Deletion: 'D', Mismatch: 'X'.
81+
} EdlibCigarFormat;
82+
83+
// Edit operations.
84+
#define EDLIB_EDOP_MATCH 0 //!< Match.
85+
#define EDLIB_EDOP_INSERT 1 //!< Insertion to target = deletion from query.
86+
#define EDLIB_EDOP_DELETE 2 //!< Deletion from target = insertion to query.
87+
#define EDLIB_EDOP_MISMATCH 3 //!< Mismatch.
88+
89+
/**
90+
* @brief Defines two given characters as equal.
91+
*/
92+
typedef struct {
93+
char first;
94+
char second;
95+
} EdlibEqualityPair;
96+
97+
/**
98+
* @brief Configuration object for edlibAlign() function.
99+
*/
100+
typedef struct {
101+
/**
102+
* Set k to non-negative value to tell edlib that edit distance is not larger than k.
103+
* Smaller k can significantly improve speed of computation.
104+
* If edit distance is larger than k, edlib will set edit distance to -1.
105+
* Set k to negative value and edlib will internally auto-adjust k until score is found.
106+
*/
107+
int k;
108+
109+
/**
110+
* Alignment method.
111+
* EDLIB_MODE_NW: global (Needleman-Wunsch)
112+
* EDLIB_MODE_SHW: prefix. Gap after query is not penalized.
113+
* EDLIB_MODE_HW: infix. Gaps before and after query are not penalized.
114+
*/
115+
EdlibAlignMode mode;
116+
117+
/**
118+
* Alignment task - tells Edlib what to calculate. Less to calculate, faster it is.
119+
* EDLIB_TASK_DISTANCE - find edit distance and end locations of optimal alignment paths in target.
120+
* EDLIB_TASK_LOC - find edit distance and start and end locations of optimal alignment paths in target.
121+
* EDLIB_TASK_PATH - find edit distance, alignment path (and start and end locations of it in target).
122+
*/
123+
EdlibAlignTask task;
124+
125+
/**
126+
* List of pairs of characters, where each pair defines two characters as equal.
127+
* This way you can extend edlib's definition of equality (which is that each character is equal only
128+
* to itself).
129+
* This can be useful if you have some wildcard characters that should match multiple other characters,
130+
* or e.g. if you want edlib to be case insensitive.
131+
* Can be set to NULL if there are none.
132+
*/
133+
const EdlibEqualityPair* additionalEqualities;
134+
135+
/**
136+
* Number of additional equalities, which is non-negative number.
137+
* 0 if there are none.
138+
*/
139+
int additionalEqualitiesLength;
140+
} EdlibAlignConfig;
141+
142+
/**
143+
* Helper method for easy construction of configuration object.
144+
* @return Configuration object filled with given parameters.
145+
*/
146+
EDLIB_API EdlibAlignConfig edlibNewAlignConfig(
147+
int k, EdlibAlignMode mode, EdlibAlignTask task,
148+
const EdlibEqualityPair* additionalEqualities,
149+
int additionalEqualitiesLength
150+
);
151+
152+
/**
153+
* @return Default configuration object, with following defaults:
154+
* k = -1, mode = EDLIB_MODE_NW, task = EDLIB_TASK_DISTANCE, no additional equalities.
155+
*/
156+
EDLIB_API EdlibAlignConfig edlibDefaultAlignConfig(void);
157+
158+
159+
/**
160+
* Container for results of alignment done by edlibAlign() function.
161+
*/
162+
typedef struct {
163+
/**
164+
* EDLIB_STATUS_OK or EDLIB_STATUS_ERROR. If error, all other fields will have undefined values.
165+
*/
166+
int status;
167+
168+
/**
169+
* -1 if k is non-negative and edit distance is larger than k.
170+
*/
171+
int editDistance;
172+
173+
/**
174+
* Array of zero-based positions in target where optimal alignment paths end.
175+
* If gap after query is penalized, gap counts as part of query (NW), otherwise not.
176+
* Set to NULL if edit distance is larger than k.
177+
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
178+
*/
179+
int* endLocations;
180+
181+
/**
182+
* Array of zero-based positions in target where optimal alignment paths start,
183+
* they correspond to endLocations.
184+
* If gap before query is penalized, gap counts as part of query (NW), otherwise not.
185+
* Set to NULL if not calculated or if edit distance is larger than k.
186+
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
187+
*/
188+
int* startLocations;
189+
190+
/**
191+
* Number of end (and start) locations.
192+
*/
193+
int numLocations;
194+
195+
/**
196+
* Alignment is found for first pair of start and end locations.
197+
* Set to NULL if not calculated.
198+
* Alignment is sequence of numbers: 0, 1, 2, 3.
199+
* 0 stands for match.
200+
* 1 stands for insertion to target.
201+
* 2 stands for insertion to query.
202+
* 3 stands for mismatch.
203+
* Alignment aligns query to target from beginning of query till end of query.
204+
* If gaps are not penalized, they are not in alignment.
205+
* If you do not free whole result object using edlibFreeAlignResult(), do not forget to use free().
206+
*/
207+
unsigned char* alignment;
208+
209+
/**
210+
* Length of alignment.
211+
*/
212+
int alignmentLength;
213+
214+
/**
215+
* Number of different characters in query and target together.
216+
*/
217+
int alphabetLength;
218+
} EdlibAlignResult;
219+
220+
/**
221+
* Frees memory in EdlibAlignResult that was allocated by edlib.
222+
* If you do not use it, make sure to free needed members manually using free().
223+
*/
224+
EDLIB_API void edlibFreeAlignResult(EdlibAlignResult result);
225+
226+
227+
/**
228+
* Aligns two sequences (query and target) using edit distance (levenshtein distance).
229+
* Through config parameter, this function supports different alignment methods (global, prefix, infix),
230+
* as well as different modes of search (tasks).
231+
* It always returns edit distance and end locations of optimal alignment in target.
232+
* It optionally returns start locations of optimal alignment in target and alignment path,
233+
* if you choose appropriate tasks.
234+
* @param [in] query First sequence.
235+
* @param [in] queryLength Number of characters in first sequence.
236+
* @param [in] target Second sequence.
237+
* @param [in] targetLength Number of characters in second sequence.
238+
* @param [in] config Additional alignment parameters, like alignment method and wanted results.
239+
* @return Result of alignment, which can contain edit distance, start and end locations and alignment path.
240+
* Make sure to clean up the object using edlibFreeAlignResult() or by manually freeing needed members.
241+
*/
242+
EDLIB_API EdlibAlignResult edlibAlign(
243+
const char* query, int queryLength,
244+
const char* target, int targetLength,
245+
const EdlibAlignConfig config
246+
);
247+
248+
249+
/**
250+
* Builds cigar string from given alignment sequence.
251+
* @param [in] alignment Alignment sequence.
252+
* 0 stands for match.
253+
* 1 stands for insertion to target.
254+
* 2 stands for insertion to query.
255+
* 3 stands for mismatch.
256+
* @param [in] alignmentLength
257+
* @param [in] cigarFormat Cigar will be returned in specified format.
258+
* @return Cigar string.
259+
* I stands for insertion.
260+
* D stands for deletion.
261+
* X stands for mismatch. (used only in extended format)
262+
* = stands for match. (used only in extended format)
263+
* M stands for (mis)match. (used only in standard format)
264+
* String is null terminated.
265+
* Needed memory is allocated and given pointer is set to it.
266+
* Do not forget to free it later using free()!
267+
*/
268+
EDLIB_API char* edlibAlignmentToCigar(
269+
const unsigned char* alignment, int alignmentLength,
270+
EdlibCigarFormat cigarFormat
271+
);
272+
273+
#ifdef __cplusplus
274+
}
275+
#endif
276+
277+
#endif // EDLIB_H

0 commit comments

Comments
 (0)