Skip to content

Commit f73fd28

Browse files
committed
- Relaxed database parsing for V.para.
- Fixed typing for V.para. - Added nomenclature section to docs.
1 parent 6d8a348 commit f73fd28

File tree

6 files changed

+42
-17
lines changed

6 files changed

+42
-17
lines changed

docs/source/Databases.rst

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,28 @@ Example piece of input Genbank file::
5959
CDS 1..897
6060
/gene="galF"
6161

62+
Nomenclature
63+
^^^^^^^^^^^^^^^
64+
In constructing the databases included with Kaptive, we have used the following nomenclature rules:
65+
66+
* Loci are named after their respective antigen (**K**, **O**, or **OC**) followed by the letter **L** (which
67+
stands for **Locus**), which separates the label for the genotype from the phenotype (e.g. KL1 -> K1). These
68+
letters should be in upper case.
69+
* Loci are numbered, first, by their corresponding antigen, and second, in the order in which they were discovered.
70+
For example, *Klebsiella* K-loci 1-79 correspond to K-types 1-79. K-loci 101 and greater correspond to K-loci with
71+
unknown antigens in the order in which they were discovered. We intentionally started at 101 to leave room to assign
72+
phenotype-genotype pairs.
73+
* Locus genes are named in three parts delimited by an **underscore** (**_**):
74+
75+
#. The locus the gene belongs to, e.g. ``KL1_`` for a gene in the ``KL1`` locus.
76+
#. The position of the gene in the locus, e.g. ``KL1_01`` for the first gene in the ``KL1`` locus.
77+
#. The name of the gene as a three-letter italicized symbol written in lower case letters and usually suffixed with
78+
an italicized capital letter, e.g. ``KL1_01_galF`` for the *galF* gene in the ``KL1`` locus.
79+
If the gene name is unknown, this part will be blank and the gene instead would be called ``KL1_01``.
80+
81+
.. note::
82+
Databases **must** follow this nomenclature system for distribution within Kaptive.
83+
6284
.. _Phenotype-logic:
6385

6486
Phenotype logic

docs/source/Outputs.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ Extra genes, details Gene names for the extra genes found in
3636
.. note::
3737
Numbers beside gene names indicate the percent identity and percent coverage of the gene in the assembly.
3838

39+
.. note::
40+
You may sometimes see two copies of the same gene in the ``Expected genes in locus, details`` column.
41+
These represent (likely) parts of the same gene which have usually been split over contigs.
42+
In Kaptive v3.0.0 onwards, we adopted this behaviour to allow users to see where locus splitting has occurred,
43+
and determine the total percent identity of a gene that has been split.
44+
3945
The default is to print this table to **stdout**.
4046
You can use UNIX redirection operators (``>`` or ``>>``) or the ``-o``/``--out`` flag to write to a file.
4147

kaptive/assembly.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,7 @@ def typing_pipeline(
213213
alignments += (alns := list(alns)) # Add all alignments to the list
214214
# Use the best alignment for each gene for scoring, if the coverage is above the minimum
215215
if ((best := max(alns, key=lambda x: x.mlen)).blen / best.q_len) * 100 >= min_cov:
216-
scores[idx[q.split("_", 1)[0]]] += [best.tags['AS'], best.mlen, best.blen, best.q_len, 1, 0]
216+
scores[idx[db.genes[q].locus.name]] += [best.tags['AS'], best.mlen, best.blen, best.q_len, 1, 0]
217217
# For each gene, add: AS, mlen, blen, q_len, genes_found (1), genes_expected (0 but will update later)
218218

219219
if scores.max() == 0: # If no gene alignments were found, return None so pipeline can continue
@@ -295,13 +295,13 @@ def typing_pipeline(
295295

296296
# FINALISE RESULT -------------------------------------------------------------------------------------------------
297297
# Sort the pieces by the sum of the expected gene order to get the expected order of the pieces
298-
result.pieces.sort(key=lambda x: min(int(i.gene.name.split("_")[1]) for i in x.expected_genes))
299-
[l.sort(key=lambda x: int(x.gene.name.split("_")[1])) for l in (
298+
result.pieces.sort(key=lambda x: min(i.gene.position_in_locus for i in x.expected_genes))
299+
[l.sort(key=lambda x: gene.position_in_locus) for l in (
300300
result.expected_genes_inside_locus, result.expected_genes_outside_locus, result.unexpected_genes_inside_locus,
301301
result.unexpected_genes_outside_locus)]
302-
result.missing_genes = sorted(list(set(best_match.genes) - {
303-
i.gene.name for i in chain(result.expected_genes_inside_locus, result.expected_genes_outside_locus)}),
304-
key=lambda x: int(x.split("_")[1]))
302+
result.missing_genes = list(set(best_match.genes) - {
303+
i.gene.name for i in chain(result.expected_genes_inside_locus, result.expected_genes_outside_locus)
304+
})
305305
result.get_confidence(allow_below_threshold, max_other_genes, percent_expected_genes)
306306
log(f"Finished typing {result}", verbose=verbose)
307307
return result

kaptive/database.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,14 +153,13 @@ def from_seqrecord(cls, record: SeqRecord, locus_name: str, type_name: str, load
153153
n = 1
154154
for feature in record.features: # type: SeqFeature
155155
if feature.type == 'CDS':
156-
gene = Gene.from_feature(record, feature, position_in_locus=n)
156+
gene = Gene.from_feature(record, feature, position_in_locus=n, locus=self)
157157
if gene.name in self.genes:
158158
raise LocusError(f'Gene {gene} already exists in locus {self}')
159159
if gene.locus and gene.locus != self:
160160
raise LocusError(f'Gene {gene} is from a different locus than locus {self}')
161161
if extract_translations: # Force translation of the gene
162162
gene.extract_translation()
163-
gene.locus = self
164163
self.genes[gene.name] = gene
165164
n += 1
166165
self.type_label = type_name if not self.extra() else None # Extra genes don't have a type
@@ -250,10 +249,8 @@ def from_feature(cls, record: SeqRecord, feature: SeqFeature, **kwargs):
250249
self = cls(
251250
start=feature.location.start, end=feature.location.end, strand='+' if feature.location.strand == 1 else '-',
252251
dna_seq=feature.extract(record.seq), product=feature.qualifiers.get('product', [''])[0], **kwargs)
253-
if not (locus_tag := feature.qualifiers.get('locus_tag', [None])[0]):
254-
raise GeneError(f'{feature} does not have a locus tag.')
255-
self.name = locus_tag
256-
self.gene_name = feature.qualifiers.get('gene', [self.name])[0] # Use locus tag if gene name is not present
252+
self.name = f"{self.locus.name}_{str(self.position_in_locus).zfill(2)}" + (f"_{x}" if (x := feature.qualifiers.get('gene', [''])[0]) else '')
253+
self.gene_name = x
257254
assert len(self.dna_seq) % 3 == 0, quit_with_error(f'DNA sequence of {self} is not a multiple of 3')
258255
return self
259256

kaptive/misc.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,14 @@ def check_programs(progs: list[str], verbose: bool = False):
4747
quit_with_error(f'{program} not found')
4848

4949

50-
def check_file(path: str | Path) -> Path | None:
50+
def check_file(path: str | Path) -> Path:
5151
path = Path(path) if isinstance(path, str) else path
5252
if not path.exists():
53-
return warning(f'{path} does not exist')
53+
quit_with_error(f'{path} does not exist')
5454
if not path.is_file():
55-
return warning(f'{path} is not a file')
55+
quit_with_error(f'{path} is not a file')
5656
elif path.stat().st_size == 0:
57-
return warning(f'{path} is empty')
57+
quit_with_error(f'{path} is empty')
5858
else:
5959
return path.absolute()
6060

kaptive/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@
1414
If not, see <https://www.gnu.org/licenses/>.
1515
"""
1616

17-
__version__ = '3.0.0b4'
17+
__version__ = '3.0.0b5'

0 commit comments

Comments
 (0)