DessimozLab
diff --git a/‎.github/workflows/python-publish.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/python-publish.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/test_run.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/test_run.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 9 additions & 0 deletions b/‎README.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎omamer/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎omamer/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎omamer/_clock.py‎
Lines changed: 11 additions & 4 deletions b/‎omamer/_clock.py‎
Lines changed: 11 additions & 4 deletions
diff --git a/‎omamer/_runners.py‎
Lines changed: 15 additions & 5 deletions b/‎omamer/_runners.py‎
Lines changed: 15 additions & 5 deletions
diff --git a/‎omamer/_utils.py‎
Lines changed: 1 addition & 1 deletion b/‎omamer/_utils.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎omamer/alphabets.py‎
Lines changed: 14 additions & 1 deletion b/‎omamer/alphabets.py‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎omamer/database.py‎
Lines changed: 34 additions & 19 deletions b/‎omamer/database.py‎
Lines changed: 34 additions & 19 deletions
diff --git a/‎omamer/hierarchy.py‎
Lines changed: 1 addition & 1 deletion b/‎omamer/hierarchy.py‎
Lines changed: 1 addition & 1 deletion
@@ -20,9 +20,9 @@ jobs:
       id-token: write
 
     steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v5
     - name: Set up Python
-      uses: actions/setup-python@v5
+      uses: actions/setup-python@v6
       with:
         python-version: '3.x'
     - name: Install dependencies
@@ -32,4 +32,4 @@ jobs:
     - name: Build package
       run: python -m build
     - name: Publish package
-      uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3
+      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e
@@ -11,9 +11,9 @@ jobs:
         python-version: ["3.8", "3.9", "3.10", "3.11"]
 
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies
 
@@ -130,6 +130,15 @@ Required arguments: ``--db``, ``--oma_path``
 
 # Change log
 
+#### Version 2.1.1
+
+- Performance improvements to the mkdb command with orthoxml input
+- Added a check for non-unique protein IDs in the input fasta files; duplicates are now reported with an informative error message
+- fixes issue #49
+
+#### Version 2.1.0
+- significant improvements to classification speed
+
 #### Version 2.0.4
 - fixes issue #34 (numpy2 incompatibility)
 - experimental support to build omamer databases from orthoxml/fasta files
 
@@ -1,7 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
-    (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -24,7 +24,7 @@
 from datetime import date
 
 __packagename__ = "omamer"
-__version__ = "2.1.0"
+__version__ = "2.1.2"
 __copyright__ = "(C) 2019-{:d} Victor Rossier <victor.rossier@unil.ch> and Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk> and Nikolai Romashchenko <nikolai.romashchenko@unil.ch>".format(
     date.today().year
 )
@@ -1,7 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
-    (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -23,10 +23,17 @@
 """
 
 import ctypes
+import sys
+
+
+if sys.version_info[1] < 13:
+    clock = ctypes.pythonapi._PyTime_GetSystemClock
+    as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble
+else:
+    # Python 3.13+ made the PyTime C API public. NOTE(review): PyTime_TimeRaw(PyTime_t *result) returns a status int -- confirm the argtypes/restype set below.
+    clock = ctypes.pythonapi.PyTime_TimeRaw
+    as_seconds = ctypes.pythonapi.PyTime_AsSecondsDouble
 
-# Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi
-clock = ctypes.pythonapi._PyTime_GetSystemClock
-as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble
 
 # Set the argument types and return types of the functions
 clock.argtypes = []
 
@@ -1,7 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
-    (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -22,8 +22,8 @@
     along with OMAmer. If not, see <http://www.gnu.org/licenses/>.
 """
 import numpy as np
+import os
 from omamer.index import update_with_elias_fano
-
 from ._utils import LOG, check_file_exists
 
 
@@ -131,7 +131,7 @@ def mkdb_oma(args):
 
 def search(args):
     from alive_progress import alive_bar
-    from ._utils import print_message, print_line
+    from ._utils import print_message
     import sys
 
     if args.out is None:
@@ -163,6 +163,7 @@ def search(args):
         bar.text(" [DONE]")
 
     print_run_data(args)
+    check_args(args)
 
     t0 = time()
 
@@ -247,7 +248,7 @@ def search(args):
                     # write the top header
                     print("!omamer-version: {}".format(__version__), file=args.out)
                     print(
-                        "!query-md5: {}".format(compute_file_md5(args.query.name)),
+                        "!query-md5: {}".format(compute_file_md5(args.query)),
                         file=args.out,
                     )
                     print(
@@ -415,7 +416,7 @@ def print_run_data(args):
     print_line(80)
     print_message("\nRunning OMAmer on {}, using:".format(platform.node()))
     print_message(" - database: {}".format(args.db))
-    print_message(" - query: {}".format(args.query.name))
+    print_message(" - query: {}".format(args.query))
     print_message(" - version: {}".format(__version__))
     print_message("")
     print_line(80)
@@ -453,3 +454,12 @@ def goodbye(args, time_taken, search_rate):
     )
     print_message("")
     print_line(80)
+
+
+def check_args(args):
+    # Enforce query existence check before loading DB
+    with open(args.query, "r") as _:
+        pass
+
+    if os.path.getsize(args.query) == 0:
+        raise RuntimeError(f"Input file {args.query} is empty")
@@ -1,7 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
-    (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
 
@@ -1,6 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -30,8 +31,16 @@ def __init__(self, n=21):
         self.setup()
 
     def setup(self):
+        self.sanitize_translate = None
+
         if self.n == 21:
             chars = set("ACDEFGHIKLMNPQRSTVWXY")
+            # 256-byte sanitizer translation table: maps each allowed character to
+            # itself and every other byte value to X
+            self.sanitize_translate = bytes(
+                bytearray((b if chr(b) in chars else ord("X") for b in range(256)))
+            )
+
             digits = np.frombuffer(b"ACDEFGHIKLMNPQRSTVWXY", dtype=np.uint8)
             lookup = np.zeros(np.max(digits) + 1, dtype=np.uint8)
             lookup[digits] = np.arange(len(digits))
@@ -93,7 +102,11 @@ def translate(self, x):
         else:
             return self.trans[x.view(np.uint8)].view("|S1")
 
-    def sanitise_seq(self, seq):
+    def sanitise_seq(self, seq: str) -> str:
+        if self.sanitize_translate:
+            b = seq.encode("ascii", "ignore").upper()
+            return b.translate(self.sanitize_translate).decode("ascii")
+
         return "".join([x if x in self.chars else "X" for x in seq])
 
     @property
 
@@ -1,7 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
-    (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -499,13 +499,13 @@ def _get_child_prots(hogs, hog2protoffs, child_prots_off):
             # TODO: check what else would break. this could be used if someone wanted to build a
             # database for flat OGs.
             LOG.warning("No nesting structure in HOGs defined in OrthoXML.")
-        else:
-            self.db.create_carray(
-                "/",
-                "ChildrenHOG",
-                obj=np.array(child_hogs, dtype=np.uint32),
-                filters=self._compr,
-            )
+            child_hogs = [0]  # adding sentinel in case no nested HOGs are defined.
+        self.db.create_carray(
+            "/",
+            "ChildrenHOG",
+            obj=np.array(child_hogs, dtype=np.uint32),
+            filters=self._compr,
+        )
         self.db.create_carray(
             "/",
             "ChildrenProt",
@@ -1063,14 +1063,14 @@ def __init__(
         self.include_younger_fams = include_younger_fams
 
     def setup_hogparser(self, oxml_path):
-        from .HOGParser import HOGParser
+        from .orthoxml_parser import OrthoxmlParser
 
         LOG.debug("loading species table from orthoxml")
         self.oxml_path = oxml_path
-        self.hog_parser = HOGParser(oxml_path)
+        self.hog_parser = OrthoxmlParser(oxml_path)
 
     def parse_oxml(self, nspecies_below):
-        from .HOGParser import is_orthologGroup_node
+        from .orthoxml_parser import is_orthologGroup_node
 
         def generate_hog_tab(hog):
             # this is more memory intensive (we cache the results)
@@ -1142,7 +1142,7 @@ def generate_hogs_for_prot(hog, gene_data):
         hogs = []
         entries = []
         LOG.debug("loading hog structure and membership from orthoxml")
-        for hog in tqdm(self.hog_parser.HOGs(auto_clean=True), desc="Parsing HOGs"):
+        for hog in tqdm(self.hog_parser.iter_hogs(auto_clean=True), desc="Parsing HOGs"):
             hog_data = list(filter(lambda x: x is not None, generate_hog_tab(hog)))
             gene_data = list(generate_prot_tab(hog))
             hog_data += list(generate_hogs_for_prot(hog, gene_data))
@@ -1169,7 +1169,7 @@ def generate_hogs_for_prot(hog, gene_data):
         # set index as the protein id so that we can look up the hog id.
         ent_tab = ent_tab[ent_tab["hogid"].isin(set(hog_tab["ID"]))].set_index("protId")
 
-        return (hog_tab.to_records(), ent_tab)
+        return hog_tab.to_records(), ent_tab
 
     ### main function ###
     def build_database(self, oxml_path, sequence_files, structure_files, stree_path):
@@ -1183,7 +1183,7 @@ def build_database(self, oxml_path, sequence_files, structure_files, stree_path)
         self.add_taxid_col()
 
         LOG.debug("build hog_table from orthoxml")
-        (hog_tab, ent_tab) = self.parse_oxml(nspecies_below)
+        hog_tab, ent_tab = self.parse_oxml(nspecies_below)
 
         LOG.debug("select and strip OMA HOGs")
         (
@@ -1271,13 +1271,26 @@ def select_and_filter_OMA_proteins(
 
         ent_tab = ent_tab[ent_tab["hogid"].map(lambda x: x in oma_hog2hog)]
 
-        LOG.debug(" - loading proteins from FASTA sequences, for selected HOGs")
+        # ent_tab is converted below into a dict {protId -> {column -> value}} (the "hogid" and
+        # "species" columns are read later), as dict lookups are faster than pd.DataFrame.loc
+
+        duplicates = ent_tab[ent_tab.index.duplicated(keep=False)]
+
+        # if duplicated keys (i.e. protId) are found, raise an error
+        if len(duplicates) > 0:
+            nonunique_keys = duplicates.index.unique()
+            error_message = "Found duplicated protein IDs:\n" + "\n\t".join(s for s in nonunique_keys)
+            LOG.error(error_message)
+            raise RuntimeError(error_message)
+
+        ent_tab = ent_tab[~ent_tab.index.duplicated(keep="first")].to_dict("index")
 
         prot_off = 0  # pointer to protein in protein table
 
         # store rows for species and protein tables and sequence buffer
         seq_buffs = []
 
+        LOG.debug(" - loading proteins from FASTA sequences for selected HOGs")
         prot_tab = self.db.create_table(
             "/",
             "Protein",
@@ -1299,20 +1312,20 @@ def select_and_filter_OMA_proteins(
                     SeqIO.parse(fp, "fasta"),
                     desc="Parsing sequences ({})".format(os.path.basename(fasta_fn)),
                 ):
-                    if rec.description in ent_tab.index:
+                    if rec.description in ent_tab:
                         # this seems to be most common, full header is used in standalone orthoXML
                         prot_id = rec.description
-                    elif rec.id in ent_tab.index:
+                    elif rec.id in ent_tab:
                         # otherwise we might only see the first part of the id.
                         prot_id = rec.id
                     else:
                         # otherwise we can't do anything...
                         continue
-
                     # get hog id, skip if we have filtered it out
-                    r = ent_tab.loc[prot_id]
+                    r = ent_tab[prot_id]
                     hog_id = r["hogid"]
                     sp = r["species"]
+
                     # (hog_id, sp) = entry_mapping[prot_id]
                     if hog_id not in oma_hog2hog:
                         continue
@@ -1345,6 +1358,7 @@ def select_and_filter_OMA_proteins(
                         # update offset of protein row in table
                         prot_off += 1
 
+
         # store species info
         sp_rows = [()] * len(species)  # keep sorted
         for sp in sp2sp_off:
@@ -1419,6 +1433,7 @@ def select_and_filter_OMA_proteins(
         structure_buff = np.concatenate(ss_buffs)
         return fam2hogs, hog2protoffs, hog2tax, hog2oma_hog, seq_buff, structure_buff
 
+
     def add_taxid_col(self):
         """
         Add the NCBI taxon id from orthoxml
 
@@ -1,7 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
-    (C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
+    (C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
     (C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
     (C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
                   Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>