Skip to content

Commit fa2872b

Browse files
authored
feat: output the HN tag and optionally add the MD to the XA tag (#25)
This requires us to patch bwa, which we store in the patches sub-directory. They are applied before we build, and reverted after the build completes (regardless of success). They can be removed when the bwa PRs are merged and submodule updated. See: lh3/bwa#438 See: lh3/bwa#439
1 parent 4019e84 commit fa2872b

File tree

7 files changed

+375
-43
lines changed

7 files changed

+375
-43
lines changed

build.py

Lines changed: 75 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,45 @@
11
import multiprocessing
2+
import os
23
import platform
4+
import subprocess
5+
from contextlib import contextmanager
36
from pathlib import Path
47
from typing import List
58

69
from Cython.Build import cythonize
710
from Cython.Distutils.build_ext import new_build_ext as cython_build_ext
811
from setuptools import Extension, Distribution
912

13+
@contextmanager
def changedir(path):
    """Temporarily make ``path`` the current working directory.

    The working directory that was active on entry is restored when the
    ``with`` block exits, whether it finishes normally or raises.

    Args:
        path: the directory to switch into for the duration of the block.
    """
    original_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Always return to where we started, even if the body raised.
        os.chdir(original_dir)
21+
22+
@contextmanager
def with_patches():
    """Apply the bwa patches for the duration of the block, then revert them.

    Every ``*.patch`` file directly under ``patches`` is applied, in sorted
    path order, with ``git apply`` run from inside the ``bwa`` directory.
    When the block exits (normally or with an exception), the submodules are
    reset with ``git submodule deinit -f .`` followed by
    ``git submodule update --init``.

    Raises:
        RuntimeError: if a patch fails to apply, or if a submodule reset
            command exits with a non-zero status.
    """
    # Resolve to absolute paths up front: the patches are applied after
    # changing into ``bwa``, where the relative paths would no longer resolve.
    patches = sorted(
        os.path.abspath(patch)
        for patch in Path("patches").iterdir()
        if patch.is_file() and patch.suffix == ".patch"
    )
    try:
        # Applying the patches happens inside the ``try`` so that a failure
        # part-way through still reverts the patches that were already applied.
        with changedir("bwa"):
            for patch in patches:
                # Pass an argument list (no shell) so that a path containing
                # spaces or shell metacharacters reaches git unchanged.
                retcode = subprocess.call(["git", "apply", patch])
                if retcode != 0:
                    raise RuntimeError(f"Failed to apply patch {patch}")
        yield
    finally:
        commands = [
            ["git", "submodule", "deinit", "-f", "."],
            ["git", "submodule", "update", "--init"],
        ]
        for command in commands:
            retcode = subprocess.call(command)
            if retcode != 0:
                raise RuntimeError(f"Failed to reset submodules: {' '.join(command)}")
42+
1043
SOURCE_DIR = Path("pybwa")
1144
BUILD_DIR = Path("cython_build")
1245
compile_args = []
@@ -110,46 +143,48 @@ def cythonize_helper(extension_modules: List[Extension]) -> List[Extension]:
110143

111144

112145
def build():
113-
# Collect and cythonize all files
114-
extension_modules = cythonize_helper([
115-
libbwaindex_module,
116-
libbwaaln_module,
117-
libbwamem_module
118-
])
119-
120-
# Use Setuptools to collect files
121-
distribution = Distribution({
122-
"name": "pybwa",
123-
'version': '0.0.1',
124-
'description': 'Python bindings for BWA',
125-
'long_description': __doc__,
126-
'long_description_content_type': 'text/x-rst',
127-
'author': 'Nils Homer',
128-
'author_email': 'nils@fulcrumgenomics.com',
129-
'license': 'MIT',
130-
'platforms': ['POSIX', 'UNIX', 'MacOS'],
131-
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
132-
'url': 'https://github.com/fulcrumgenomics/pybwa',
133-
'packages': ['pybwa', 'pybwa.include.bwa'],
134-
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
135-
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
136-
"ext_modules": extension_modules,
137-
"cmdclass": {
138-
"build_ext": cython_build_ext,
139-
},
140-
})
141-
142-
# Grab the build_ext command and copy all files back to source dir.
143-
# Done so Poetry grabs the files during the next step in its build.
144-
build_ext_cmd = distribution.get_command_obj("build_ext")
145-
build_ext_cmd.ensure_finalized()
146-
# Set the value to 1 for "inplace", with the goal to build extensions
147-
# in build directory, and then copy all files back to the source dir
148-
# (under the hood, "copy_extensions_to_source" will be called after
149-
# building the extensions). This is done so Poetry grabs the files
150-
# during the next step in its build.
151-
build_ext_cmd.inplace = 1
152-
build_ext_cmd.run()
146+
# apply patches to bwa, then revert them after
147+
with with_patches():
148+
# Collect and cythonize all files
149+
extension_modules = cythonize_helper([
150+
libbwaindex_module,
151+
libbwaaln_module,
152+
libbwamem_module
153+
])
154+
155+
# Use Setuptools to collect files
156+
distribution = Distribution({
157+
"name": "pybwa",
158+
'version': '0.0.1',
159+
'description': 'Python bindings for BWA',
160+
'long_description': __doc__,
161+
'long_description_content_type': 'text/x-rst',
162+
'author': 'Nils Homer',
163+
'author_email': 'nils@fulcrumgenomics.com',
164+
'license': 'MIT',
165+
'platforms': ['POSIX', 'UNIX', 'MacOS'],
166+
'classifiers': [_f for _f in CLASSIFIERS.split('\n') if _f],
167+
'url': 'https://github.com/fulcrumgenomics/pybwa',
168+
'packages': ['pybwa', 'pybwa.include.bwa'],
169+
'package_dir': {'pybwa': 'pybwa', 'pybwa.include.bwa': 'bwa'},
170+
'package_data': {'': ['*.pxd', '*.h', '*.c', 'py.typed', '*.pyi'], },
171+
"ext_modules": extension_modules,
172+
"cmdclass": {
173+
"build_ext": cython_build_ext,
174+
},
175+
})
176+
177+
# Grab the build_ext command and copy all files back to source dir.
178+
# Done so Poetry grabs the files during the next step in its build.
179+
build_ext_cmd = distribution.get_command_obj("build_ext")
180+
build_ext_cmd.ensure_finalized()
181+
# Set the value to 1 for "inplace", with the goal to build extensions
182+
# in build directory, and then copy all files back to the source dir
183+
# (under the hood, "copy_extensions_to_source" will be called after
184+
# building the extensions). This is done so Poetry grabs the files
185+
# during the next step in its build.
186+
build_ext_cmd.inplace = 1
187+
build_ext_cmd.run()
153188

154189

155190
if __name__ == "__main__":

docs/api.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,14 @@ As a result, the mapping quality may also differ slightly.
9999
This is due to an implementation detail in which the order of random numbers used is different between this wrapper
100100
and command-line.
101101

102+
Finally, the following additions have been made to :code:`bwa aln/samse`:
103+
104+
#. The standard SAM tag :code:`HN` is added. This is useful when too many hits are found
105+
(:attr:`~pybwa.BwaAlnOptions.max_hits`) and therefore no hits are reported in the :code:`XA` tag, because we can still
106+
know how many were found.
107+
#. The :py:attr:`~pybwa.BwaAlnOptions.with_md` option will add the standard SAM tag :code:`MD` to the :code:`XA` tag;
108+
otherwise, :code:`.` will be used. This provides additional information on the quality of alternative alignments.
109+
102110
===
103111
API
104112
===
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
From 7d36ae830fc7391cfa69ffcdf239755f01317c1c Mon Sep 17 00:00:00 2001
2+
From: Nils Homer <nh13@users.noreply.github.com>
3+
Date: Thu, 16 Jan 2025 22:35:13 -0800
4+
Subject: [PATCH 1/2] feat: add HN tag to bwa aln
5+
6+
---
7+
bwase.c | 17 +++++++++--------
8+
bwtaln.h | 1 +
9+
2 files changed, 10 insertions(+), 8 deletions(-)
10+
11+
diff --git a/bwase.c b/bwase.c
12+
index 18e8671..eb43c02 100644
13+
--- a/bwase.c
14+
+++ b/bwase.c
15+
@@ -21,7 +21,7 @@ int g_log_n[256];
16+
17+
void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi)
18+
{
19+
- int i, cnt, best;
20+
+ int i, k, cnt, best;
21+
if (n_aln == 0) {
22+
s->type = BWA_TYPE_NO_MATCH;
23+
s->c1 = s->c2 = 0;
24+
@@ -47,14 +47,14 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
25+
s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE;
26+
}
27+
28+
+ for (k = s->n_occ = 0; k < n_aln; ++k) {
29+
+ const bwt_aln1_t *q = aln + k;
30+
+ s->n_occ += q->l - q->k + 1;
31+
+ }
32+
if (n_multi) {
33+
- int k, rest, n_occ, z = 0;
34+
- for (k = n_occ = 0; k < n_aln; ++k) {
35+
- const bwt_aln1_t *q = aln + k;
36+
- n_occ += q->l - q->k + 1;
37+
- }
38+
+ int rest, z = 0;
39+
if (s->multi) free(s->multi);
40+
- if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them
41+
+ if (s->n_occ > n_multi + 1) { // if there are too many hits, generate none of them
42+
s->multi = 0; s->n_multi = 0;
43+
return;
44+
}
45+
@@ -62,7 +62,7 @@ void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_ma
46+
* here. In principle, due to the requirement above, we can
47+
* simply output all hits, but the following samples "rest"
48+
* number of random hits. */
49+
- rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa
50+
+ rest = s->n_occ > n_multi + 1? n_multi + 1 : s->n_occ; // find one additional for ->sa
51+
s->multi = calloc(rest, sizeof(bwt_multi1_t));
52+
for (k = 0; k < n_aln; ++k) {
53+
const bwt_aln1_t *q = aln + k;
54+
@@ -477,6 +477,7 @@ void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, in
55+
}
56+
}
57+
}
58+
+ err_printf("\tHN:i:%d", p->n_occ);
59+
err_putchar('\n');
60+
} else { // this read has no match
61+
//ubyte_t *s = p->strand? p->rseq : p->seq;
62+
diff --git a/bwtaln.h b/bwtaln.h
63+
index 4616ff5..71ea627 100644
64+
--- a/bwtaln.h
65+
+++ b/bwtaln.h
66+
@@ -76,6 +76,7 @@ typedef struct {
67+
// multiple hits
68+
int n_multi;
69+
bwt_multi1_t *multi;
70+
+ int n_occ; // total # of hits found, not just those reported in XA, output in HN
71+
// alignment information
72+
bwtint_t sa, pos;
73+
uint64_t c1:28, c2:28, seQ:8; // number of top1 and top2 hits; single-end mapQ
74+
--
75+
2.39.5 (Apple Git-154)
76+

0 commit comments

Comments
 (0)