@@ -1,15 +1,6 @@
 import dataclasses
-import json
-import pathlib
-import collections.abc
-import csv
 
 import tskit
-import numba
-import pyfaidx
-import numpy as np
-
-from . import jit
 
 __version__ = "undefined"
 try:
@@ -26,6 +17,13 @@
 REFERENCE_GENBANK = "MN908947"
 REFERENCE_SEQUENCE_LENGTH = 29904
 
+# We omit N here as it's mapped to -1. Make "-" the 5th allele,
+# as this is a valid allele for us.
+# NOTE!! This string is also used in the jit module, where it's
+# hard-coded into a numba function, so if this ever changes it
+# also needs to be updated there.
+IUPAC_ALLELES = "ACGT-RYSWKMBDHV."
+
 NODE_IS_MUTATION_OVERLAP = 1 << 21
 NODE_IS_REVERSION_PUSH = 1 << 22
 NODE_IS_RECOMBINANT = 1 << 23
@@ -97,163 +95,3 @@ def decode_flags(f):
 
 def flags_summary(f):
     return "".join([v.short if (v.value & f) > 0 else "_" for v in flag_values])
-
-
-class FastaReader(collections.abc.Mapping):
-    def __init__(self, path, add_zero_base=True):
-        self.reader = pyfaidx.Fasta(str(path))
-        self._keys = list(self.reader.keys())
-        self.add_zero_base = add_zero_base
-
-    def __getitem__(self, key):
-        x = self.reader[key]
-        h = np.array(x).astype(str)
-        h = np.char.upper(h)
-        if self.add_zero_base:
-            return np.append(["X"], h)
-        return h
-
-    def __iter__(self):
-        return iter(self._keys)
-
-    def __len__(self):
-        return len(self._keys)
-
-
-data_path = pathlib.Path(__file__).parent / "data"
-
-
-def get_problematic_regions():
-    """
-    These regions have been reported to have highly recurrent or unusual
-    patterns of deletions.
-
-    https://github.com/jeromekelleher/sc2ts/issues/231#issuecomment-2401405355
-
-    Region: NTD domain
-    Coords: [21602-22472)
-    Multiple highly recurrent deleted regions in NTD domain in Spike
-    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7971772/
-
-    Region: ORF8
-    https://virological.org/t/repeated-loss-of-orf8-expression-in-circulating-sars-cov-2-lineages/931/1
-
-    The 1-based (half-open) coordinates were taken from the UCSC Genome Browser.
-    """
-    orf8 = get_gene_coordinates()["ORF8"]
-    return np.concatenate(
-        [
-            np.arange(21602, 22472, dtype=np.int64),  # NTD domain in S
-            np.arange(*orf8, dtype=np.int64),
-        ]
-    )
-
-
-def get_flank_coordinates():
-    """
-    Return the coordinates at either end of the genome for masking out.
-    """
-    genes = get_gene_coordinates()
-    start = genes["ORF1ab"][0]
-    end = genes["ORF10"][1]
-    return np.concatenate(
-        (np.arange(1, start), np.arange(end, REFERENCE_SEQUENCE_LENGTH))
-    )
-
-
-def get_masked_sites(ts):
-    """
-    Return the set of sites not used in the sequence.
-    """
-    unused = np.ones(int(ts.sequence_length), dtype=bool)
-    unused[ts.sites_position.astype(int)] = False
-    unused[0] = False
-    return np.where(unused)[0]
-
-
-@dataclasses.dataclass
-class CovLineage:
-    name: str
-    earliest_date: str
-    latest_date: str
-    description: str
-
-
-def get_cov_lineages_data():
-    with open(data_path / "lineages.json") as f:
-        data = json.load(f)
-    ret = {}
-    for record in data:
-        lineage = CovLineage(
-            record["Lineage"],
-            record["Earliest date"],
-            record["Latest date"],
-            record["Description"],
-        )
-        assert lineage.name not in ret
-        ret[lineage.name] = lineage
-    return ret
-
-
-__cached_reference = None
-
-
-def get_reference_sequence(as_array=False):
-    global __cached_reference
-    if __cached_reference is None:
-        reader = pyfaidx.Fasta(str(data_path / "reference.fasta"))
-        __cached_reference = reader[REFERENCE_GENBANK]
-    if as_array:
-        h = np.array(__cached_reference).astype(str)
-        return np.append(["X"], h)
-    else:
-        return "X" + str(__cached_reference)
-
-
-__cached_genes = None
-
-
-def get_gene_coordinates():
-    """
-    Returns a map of gene name to interval, (start, stop). These are
-    half-open, left-inclusive, right-exclusive.
-    """
-    global __cached_genes
-    if __cached_genes is None:
-        d = {}
-        with open(data_path / "annotation.csv") as f:
-            reader = csv.DictReader(f, delimiter=",")
-            for row in reader:
-                d[row["gene"]] = (int(row["start"]), int(row["end"]))
-        __cached_genes = d
-    return __cached_genes
-
-
-# We omit N here as it's mapped to -1. Make "-" the 5th allele
-# as this is a valid allele for us.
-IUPAC_ALLELES = "ACGT-RYSWKMBDHV."
-
-
-# FIXME make cache optional
-@numba.njit(cache=True)
-def encode_alignment(h):
-    # Just so numba knows this is a constant string
-    alleles = "ACGT-RYSWKMBDHV."
-    n = h.shape[0]
-    a = np.full(n, -1, dtype=np.int8)
-    for j in range(n):
-        if h[j] == "N":
-            a[j] = -1
-        else:
-            for k, c in enumerate(alleles):
-                if c == h[j]:
-                    break
-            else:
-                raise ValueError(f"Allele {h[j]} not recognised")
-            a[j] = k
-    return a
-
-
-def decode_alignment(a):
-    alleles = np.array(tuple(IUPAC_ALLELES + "N"), dtype=str)
-    return alleles[a]
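
For reference, the allele-index convention behind IUPAC_ALLELES works as follows: each character indexes into "ACGT-RYSWKMBDHV." (so "-" is allele index 4, the 5th allele), "N" is encoded as -1, and decoding appends "N" so that index -1 round-trips back to "N". The sketch below is a minimal pure-Python illustration assuming only numpy; it is not the numba-compiled version that the note above says lives in the jit module, and the *_sketch function names are illustrative only.

import numpy as np

IUPAC_ALLELES = "ACGT-RYSWKMBDHV."

def encode_alignment_sketch(h):
    # h is a 1D numpy array of single-character strings.
    a = np.full(h.shape[0], -1, dtype=np.int8)
    for j, c in enumerate(h):
        if c == "N":
            continue  # N stays encoded as -1
        a[j] = IUPAC_ALLELES.index(c)  # raises ValueError for unknown alleles
    return a

def decode_alignment_sketch(a):
    # Appending "N" means index -1 selects the last element, i.e. "N".
    alleles = np.array(tuple(IUPAC_ALLELES + "N"), dtype=str)
    return alleles[a]

# Example: "-" encodes to 4, and N round-trips via -1.
h = np.array(list("ACGTN-"))
a = encode_alignment_sketch(h)  # [0, 1, 2, 3, -1, 4]
assert "".join(decode_alignment_sketch(a)) == "ACGTN-"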