- Relaxed database parsing for V.para.

tomdstanton · tomdstanton · commit f73fd28f1138 · 2024-06-25T13:45:56.000+10:00
- Fixed typing for V.para.
- Added nomenclature section to docs.
diff --git a/docs/source/Databases.rst b/docs/source/Databases.rst
@@ -59,6 +59,28 @@ Example piece of input Genbank file::
     CDS             1..897
                     /gene="galF"
 
+Nomenclature
+^^^^^^^^^^^^^^^
+In constructing the databases included with Kaptive, we have used the following nomenclature rules:
+
+* Loci are named after their respective antigen (**K**, **O**, or **OC**) followed by the letter **L** (which
+  stands for **Locus**), which separates the label for the genotype from the phenotype (e.g. KL1 -> K1). These
+  letters should be in upper case.
+* Loci are numbered, first, by their corresponding antigen, and second, in the order in which they were discovered.
+  For example, *Klebsiella* K-loci 1-79 correspond to K-types 1-79. K-loci 101 and greater correspond to K-loci with
+  unknown antigens in the order in which they were discovered. We intentionally started at 101 to leave room to assign
+  phenotype-genotype pairs.
+* Locus genes are named in three parts delimited by an **underscore** (**_**):
+
+  #. The locus the gene belongs to, e.g. ``KL1_`` for a gene in the ``KL1`` locus.
+  #. The position of the gene in the locus, e.g. ``KL1_01`` for the first gene in the ``KL1`` locus.
+  #. The name of the gene as a three-letter italicized symbol written in lower case letters and usually suffixed with
+     an italicized capital letter, e.g. ``KL1_01_galF`` for the *galF* gene in the ``KL1`` locus.
+     If the gene name is unknown, this part will be blank and the gene instead would be called ``KL1_01``.
+
+.. note::
+ Databases **must** follow this nomenclature system for distribution within Kaptive.
+
 .. _Phenotype-logic:
 
 Phenotype logic
diff --git a/docs/source/Outputs.rst b/docs/source/Outputs.rst
@@ -36,6 +36,12 @@ Extra genes, details                     Gene names for the extra genes found in
 .. note::
  Numbers beside gene names indicate the percent identity and percent coverage of the gene in the assembly.
 
+.. note::
+ You may sometimes see two copies of the same gene in the ``Expected genes in locus, details`` column.
+ These represent (likely) parts of the same gene which have usually been split over contigs.
+ In Kaptive v3.0.0 onwards, we adopted this behaviour to allow users to see where locus splitting has occurred,
+ and determine the total percent identity of a gene that has been split.
+
 The default is to print this table to **stdout**.
 You can use UNIX redirection operators (``>`` or ``>>``) or the ``-o``/``--out`` flag to write to a file.
 
diff --git a/kaptive/assembly.py b/kaptive/assembly.py
@@ -213,7 +213,7 @@ def typing_pipeline(
             alignments += (alns := list(alns))  # Add all alignments to the list
             # Use the best alignment for each gene for scoring, if the coverage is above the minimum
             if ((best := max(alns, key=lambda x: x.mlen)).blen / best.q_len) * 100 >= min_cov:
-                scores[idx[q.split("_", 1)[0]]] += [best.tags['AS'], best.mlen, best.blen, best.q_len, 1, 0]
+                scores[idx[db.genes[q].locus.name]] += [best.tags['AS'], best.mlen, best.blen, best.q_len, 1, 0]
             # For each gene, add: AS, mlen, blen, q_len, genes_found (1), genes_expected (0 but will update later)
 
     if scores.max() == 0:  # If no gene alignments were found, return None so pipeline can continue
@@ -295,13 +295,13 @@ def typing_pipeline(
 
     # FINALISE RESULT -------------------------------------------------------------------------------------------------
     # Sort the pieces by the sum of the expected gene order to get the expected order of the pieces
-    result.pieces.sort(key=lambda x: min(int(i.gene.name.split("_")[1]) for i in x.expected_genes))
-    [l.sort(key=lambda x: int(x.gene.name.split("_")[1])) for l in (
+    result.pieces.sort(key=lambda x: min(i.gene.position_in_locus for i in x.expected_genes))
+    [l.sort(key=lambda x: x.gene.position_in_locus) for l in (
         result.expected_genes_inside_locus, result.expected_genes_outside_locus, result.unexpected_genes_inside_locus,
         result.unexpected_genes_outside_locus)]
-    result.missing_genes = sorted(list(set(best_match.genes) - {
-        i.gene.name for i in chain(result.expected_genes_inside_locus, result.expected_genes_outside_locus)}),
-                                  key=lambda x: int(x.split("_")[1]))
+    result.missing_genes = list(set(best_match.genes) - {
+        i.gene.name for i in chain(result.expected_genes_inside_locus, result.expected_genes_outside_locus)
+    })
     result.get_confidence(allow_below_threshold, max_other_genes, percent_expected_genes)
     log(f"Finished typing {result}", verbose=verbose)
     return result
diff --git a/kaptive/database.py b/kaptive/database.py
@@ -153,14 +153,13 @@ def from_seqrecord(cls, record: SeqRecord, locus_name: str, type_name: str, load
         n = 1
         for feature in record.features:  # type: SeqFeature
             if feature.type == 'CDS':
-                gene = Gene.from_feature(record, feature, position_in_locus=n)
+                gene = Gene.from_feature(record, feature, position_in_locus=n, locus=self)
                 if gene.name in self.genes:
                     raise LocusError(f'Gene {gene} already exists in locus {self}')
                 if gene.locus and gene.locus != self:
                     raise LocusError(f'Gene {gene} is from a different locus than locus {self}')
                 if extract_translations:  # Force translation of the gene
                     gene.extract_translation()
-                gene.locus = self
                 self.genes[gene.name] = gene
                 n += 1
         self.type_label = type_name if not self.extra() else None  # Extra genes don't have a type
@@ -250,10 +249,8 @@ def from_feature(cls, record: SeqRecord, feature: SeqFeature, **kwargs):
         self = cls(
             start=feature.location.start, end=feature.location.end, strand='+' if feature.location.strand == 1 else '-',
             dna_seq=feature.extract(record.seq), product=feature.qualifiers.get('product', [''])[0], **kwargs)
-        if not (locus_tag := feature.qualifiers.get('locus_tag', [None])[0]):
-            raise GeneError(f'{feature} does not have a locus tag.')
-        self.name = locus_tag
-        self.gene_name = feature.qualifiers.get('gene', [self.name])[0]  # Use locus tag if gene name is not present
+        self.name = f"{self.locus.name}_{str(self.position_in_locus).zfill(2)}" + (f"_{x}" if (x := feature.qualifiers.get('gene', [''])[0]) else '')
+        self.gene_name = x
         assert len(self.dna_seq) % 3 == 0, quit_with_error(f'DNA sequence of {self} is not a multiple of 3')
         return self
 
diff --git a/kaptive/misc.py b/kaptive/misc.py
@@ -47,14 +47,14 @@ def check_programs(progs: list[str], verbose: bool = False):
             quit_with_error(f'{program} not found')
 
 
-def check_file(path: str | Path) -> Path | None:
+def check_file(path: str | Path) -> Path:
     path = Path(path) if isinstance(path, str) else path
     if not path.exists():
-        return warning(f'{path} does not exist')
+        quit_with_error(f'{path} does not exist')
     if not path.is_file():
-        return warning(f'{path} is not a file')
+        quit_with_error(f'{path} is not a file')
     elif path.stat().st_size == 0:
-        return warning(f'{path} is empty')
+        quit_with_error(f'{path} is empty')
     else:
         return path.absolute()
 
diff --git a/kaptive/version.py b/kaptive/version.py
@@ -14,4 +14,4 @@
 If not, see <https://www.gnu.org/licenses/>.
 """
 
-__version__ = '3.0.0b4'
+__version__ = '3.0.0b5'