Skip to content

Commit e73a614

Browse files
committed
Merge branch 'orthoxml' into structure
2 parents 290fd4f + dec9baa commit e73a614

20 files changed

+262
-104
lines changed

.github/workflows/python-publish.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ jobs:
2020
id-token: write
2121

2222
steps:
23-
- uses: actions/checkout@v4
23+
- uses: actions/checkout@v5
2424
- name: Set up Python
25-
uses: actions/setup-python@v5
25+
uses: actions/setup-python@v6
2626
with:
2727
python-version: '3.x'
2828
- name: Install dependencies
@@ -32,4 +32,4 @@ jobs:
3232
- name: Build package
3333
run: python -m build
3434
- name: Publish package
35-
uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3
35+
uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e

.github/workflows/test_run.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ jobs:
1111
python-version: ["3.8", "3.9", "3.10", "3.11"]
1212

1313
steps:
14-
- uses: actions/checkout@v4
14+
- uses: actions/checkout@v5
1515
- name: Set up Python ${{ matrix.python-version }}
16-
uses: actions/setup-python@v5
16+
uses: actions/setup-python@v6
1717
with:
1818
python-version: ${{ matrix.python-version }}
1919
- name: Install dependencies

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,15 @@ Required arguments: ``--db``, ``--oma_path``
130130

131131
# Change log
132132

133+
#### Version 2.1.1
134+
135+
- Performance improvements to the mkdb command with orthoxml input
136+
- Added a check for non-unique protein IDs in the input FASTA files; duplicated IDs now produce a more informative error message
137+
- Fixed issue #49
138+
139+
#### Version 2.1.0
140+
- significant improvements to classification speed
141+
133142
#### Version 2.0.4
134143
- fixes issue #34 (numpy2 incompatibility)
135144
- experimental support to build omamer databases from orthoxml/fasta files

omamer/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4-
(C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -24,7 +24,7 @@
2424
from datetime import date
2525

2626
__packagename__ = "omamer"
27-
__version__ = "2.1.0"
27+
__version__ = "2.1.2"
2828
__copyright__ = "(C) 2019-{:d} Victor Rossier <victor.rossier@unil.ch> and Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk> and Nikolai Romashchenko <nikolai.romashchenko@unil.ch>".format(
2929
date.today().year
3030
)

omamer/_clock.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4-
(C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -23,10 +23,17 @@
2323
"""
2424

2525
import ctypes
26+
import sys
27+
28+
29+
if sys.version_info[1] < 13:
30+
clock = ctypes.pythonapi._PyTime_GetSystemClock
31+
as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble
32+
else:
33+
# in python 3.13 and later, PyTime C API was made public
34+
clock = ctypes.pythonapi.PyTime_TimeRaw
35+
as_seconds = ctypes.pythonapi.PyTime_AsSecondsDouble
2636

27-
# Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi
28-
clock = ctypes.pythonapi._PyTime_GetSystemClock
29-
as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble
3037

3138
# Set the argument types and return types of the functions
3239
clock.argtypes = []

omamer/_runners.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4-
(C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -22,8 +22,8 @@
2222
along with OMAmer. If not, see <http://www.gnu.org/licenses/>.
2323
"""
2424
import numpy as np
25+
import os
2526
from omamer.index import update_with_elias_fano
26-
2727
from ._utils import LOG, check_file_exists
2828

2929

@@ -131,7 +131,7 @@ def mkdb_oma(args):
131131

132132
def search(args):
133133
from alive_progress import alive_bar
134-
from ._utils import print_message, print_line
134+
from ._utils import print_message
135135
import sys
136136

137137
if args.out is None:
@@ -163,6 +163,7 @@ def search(args):
163163
bar.text(" [DONE]")
164164

165165
print_run_data(args)
166+
check_args(args)
166167

167168
t0 = time()
168169

@@ -247,7 +248,7 @@ def search(args):
247248
# write the top header
248249
print("!omamer-version: {}".format(__version__), file=args.out)
249250
print(
250-
"!query-md5: {}".format(compute_file_md5(args.query.name)),
251+
"!query-md5: {}".format(compute_file_md5(args.query)),
251252
file=args.out,
252253
)
253254
print(
@@ -415,7 +416,7 @@ def print_run_data(args):
415416
print_line(80)
416417
print_message("\nRunning OMAmer on {}, using:".format(platform.node()))
417418
print_message(" - database: {}".format(args.db))
418-
print_message(" - query: {}".format(args.query.name))
419+
print_message(" - query: {}".format(args.query))
419420
print_message(" - version: {}".format(__version__))
420421
print_message("")
421422
print_line(80)
@@ -453,3 +454,12 @@ def goodbye(args, time_taken, search_rate):
453454
)
454455
print_message("")
455456
print_line(80)
457+
458+
459+
def check_args(args):
460+
# Enforce query existence check before loading DB
461+
with open(args.query, "r") as _:
462+
pass
463+
464+
if os.path.getsize(args.query) == 0:
465+
raise RuntimeError(f"Input file {args.query} is empty")

omamer/_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4-
(C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>

omamer/alphabets.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
45
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
56
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
67
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -30,8 +31,16 @@ def __init__(self, n=21):
3031
self.setup()
3132

3233
def setup(self):
34+
self.sanitize_translate = None
35+
3336
if self.n == 21:
3437
chars = set("ACDEFGHIKLMNPQRSTVWXY")
38+
# 256-byte sanitizer translation map: maps allowed characters into
39+
# themselves, everything else into X
40+
self.sanitize_translate = bytes(
41+
bytearray((b if chr(b) in chars else ord("X") for b in range(256)))
42+
)
43+
3544
digits = np.frombuffer(b"ACDEFGHIKLMNPQRSTVWXY", dtype=np.uint8)
3645
lookup = np.zeros(np.max(digits) + 1, dtype=np.uint8)
3746
lookup[digits] = np.arange(len(digits))
@@ -93,7 +102,11 @@ def translate(self, x):
93102
else:
94103
return self.trans[x.view(np.uint8)].view("|S1")
95104

96-
def sanitise_seq(self, seq):
105+
def sanitise_seq(self, seq: str) -> str:
106+
if self.sanitize_translate:
107+
b = seq.encode("ascii", "ignore").upper()
108+
return b.translate(self.sanitize_translate).decode("ascii")
109+
97110
return "".join([x if x in self.chars else "X" for x in seq])
98111

99112
@property

omamer/database.py

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4-
(C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>
@@ -499,13 +499,13 @@ def _get_child_prots(hogs, hog2protoffs, child_prots_off):
499499
# TODO: check what else would break. this could be used if someone wanted to build a
500500
# database for flat OGs.
501501
LOG.warning("No nesting structure in HOGs defined in OrthoXML.")
502-
else:
503-
self.db.create_carray(
504-
"/",
505-
"ChildrenHOG",
506-
obj=np.array(child_hogs, dtype=np.uint32),
507-
filters=self._compr,
508-
)
502+
child_hogs = [0] # adding sentinel in case no nested HOGs are defined.
503+
self.db.create_carray(
504+
"/",
505+
"ChildrenHOG",
506+
obj=np.array(child_hogs, dtype=np.uint32),
507+
filters=self._compr,
508+
)
509509
self.db.create_carray(
510510
"/",
511511
"ChildrenProt",
@@ -1063,14 +1063,14 @@ def __init__(
10631063
self.include_younger_fams = include_younger_fams
10641064

10651065
def setup_hogparser(self, oxml_path):
1066-
from .HOGParser import HOGParser
1066+
from .orthoxml_parser import OrthoxmlParser
10671067

10681068
LOG.debug("loading species table from orthoxml")
10691069
self.oxml_path = oxml_path
1070-
self.hog_parser = HOGParser(oxml_path)
1070+
self.hog_parser = OrthoxmlParser(oxml_path)
10711071

10721072
def parse_oxml(self, nspecies_below):
1073-
from .HOGParser import is_orthologGroup_node
1073+
from .orthoxml_parser import is_orthologGroup_node
10741074

10751075
def generate_hog_tab(hog):
10761076
# this is more memory intensive (we cache the results)
@@ -1142,7 +1142,7 @@ def generate_hogs_for_prot(hog, gene_data):
11421142
hogs = []
11431143
entries = []
11441144
LOG.debug("loading hog structure and membership from orthoxml")
1145-
for hog in tqdm(self.hog_parser.HOGs(auto_clean=True), desc="Parsing HOGs"):
1145+
for hog in tqdm(self.hog_parser.iter_hogs(auto_clean=True), desc="Parsing HOGs"):
11461146
hog_data = list(filter(lambda x: x is not None, generate_hog_tab(hog)))
11471147
gene_data = list(generate_prot_tab(hog))
11481148
hog_data += list(generate_hogs_for_prot(hog, gene_data))
@@ -1169,7 +1169,7 @@ def generate_hogs_for_prot(hog, gene_data):
11691169
# set index as the protein id so that we can look up the hog id.
11701170
ent_tab = ent_tab[ent_tab["hogid"].isin(set(hog_tab["ID"]))].set_index("protId")
11711171

1172-
return (hog_tab.to_records(), ent_tab)
1172+
return hog_tab.to_records(), ent_tab
11731173

11741174
### main function ###
11751175
def build_database(self, oxml_path, sequence_files, structure_files, stree_path):
@@ -1183,7 +1183,7 @@ def build_database(self, oxml_path, sequence_files, structure_files, stree_path)
11831183
self.add_taxid_col()
11841184

11851185
LOG.debug("build hog_table from orthoxml")
1186-
(hog_tab, ent_tab) = self.parse_oxml(nspecies_below)
1186+
hog_tab, ent_tab = self.parse_oxml(nspecies_below)
11871187

11881188
LOG.debug("select and strip OMA HOGs")
11891189
(
@@ -1271,13 +1271,26 @@ def select_and_filter_OMA_proteins(
12711271

12721272
ent_tab = ent_tab[ent_tab["hogid"].map(lambda x: x in oma_hog2hog)]
12731273

1274-
LOG.debug(" - loading proteins from FASTA sequences, for selected HOGs")
1274+
# Make ent_tab into a dict {protID -> (hogid, species)} as it's
1275+
# more efficient for querying individual protID than pd.DataFrame.loc
1276+
1277+
duplicates = ent_tab[ent_tab.index.duplicated(keep=False)]
1278+
1279+
# if duplicated keys (i.e. protId) are found, raise an error
1280+
if len(duplicates) > 0:
1281+
nonunique_keys = duplicates.index.unique()
1282+
error_message = "Found duplicated protein IDs:\n" + "\n\t".join(s for s in nonunique_keys)
1283+
LOG.error(error_message)
1284+
raise RuntimeError(error_message)
1285+
1286+
ent_tab = ent_tab[~ent_tab.index.duplicated(keep="first")].to_dict("index")
12751287

12761288
prot_off = 0 # pointer to protein in protein table
12771289

12781290
# store rows for species and protein tables and sequence buffer
12791291
seq_buffs = []
12801292

1293+
LOG.debug(" - loading proteins from FASTA sequences for selected HOGs")
12811294
prot_tab = self.db.create_table(
12821295
"/",
12831296
"Protein",
@@ -1299,20 +1312,20 @@ def select_and_filter_OMA_proteins(
12991312
SeqIO.parse(fp, "fasta"),
13001313
desc="Parsing sequences ({})".format(os.path.basename(fasta_fn)),
13011314
):
1302-
if rec.description in ent_tab.index:
1315+
if rec.description in ent_tab:
13031316
# this seems to be most common, full header is used in standalone orthoXML
13041317
prot_id = rec.description
1305-
elif rec.id in ent_tab.index:
1318+
elif rec.id in ent_tab:
13061319
# otherwise we might only see the first part of the id.
13071320
prot_id = rec.id
13081321
else:
13091322
# otherwise we can't do anything...
13101323
continue
1311-
13121324
# get hog id, skip if we have filtered it out
1313-
r = ent_tab.loc[prot_id]
1325+
r = ent_tab[prot_id]
13141326
hog_id = r["hogid"]
13151327
sp = r["species"]
1328+
13161329
# (hog_id, sp) = entry_mapping[prot_id]
13171330
if hog_id not in oma_hog2hog:
13181331
continue
@@ -1345,6 +1358,7 @@ def select_and_filter_OMA_proteins(
13451358
# update offset of protein row in table
13461359
prot_off += 1
13471360

1361+
13481362
# store species info
13491363
sp_rows = [()] * len(species) # keep sorted
13501364
for sp in sp2sp_off:
@@ -1419,6 +1433,7 @@ def select_and_filter_OMA_proteins(
14191433
structure_buff = np.concatenate(ss_buffs)
14201434
return fam2hogs, hog2protoffs, hog2tax, hog2oma_hog, seq_buff, structure_buff
14211435

1436+
14221437
def add_taxid_col(self):
14231438
"""
14241439
Add the NCBI taxon id from orthoxml

omamer/hierarchy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
OMAmer - tree-driven and alignment-free protein assignment to sub-families
33
4-
(C) 2024 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
4+
(C) 2024-2025 Nikolai Romashchenko <nikolai.romashchenko@unil.ch>
55
(C) 2022-2023 Alex Warwick Vesztrocy <alex.warwickvesztrocy@unil.ch>
66
(C) 2019-2021 Victor Rossier <victor.rossier@unil.ch> and
77
Alex Warwick Vesztrocy <alex@warwickvesztrocy.co.uk>

0 commit comments

Comments
 (0)