
Commit 52fd5b0

Merge pull request #300 from KwanLab/release-2.2.0 (Release 2.2.0)

2 parents 350ff10 + 1f5773f, commit 52fd5b0

35 files changed (+2498 / -616 lines)

.github/workflows/pytest_codecov.yml

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest]
-        python-version: [3.7]
+        python-version: [3.8]
     env:
       OS: ${{ matrix.os }}
       PYTHON: ${{ matrix.python-version }}

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.1.0
+2.2.0

autometa-env.yml

Lines changed: 3 additions & 1 deletion
@@ -12,6 +12,7 @@ dependencies:
   - gdown
   - hdbscan
   - hmmer
+  - joblib==1.1.0 # See https://stackoverflow.com/a/73830525/12671809
   - numba>=0.47
   - numpy>=1.13
   - pandas>=1.1
@@ -23,8 +24,9 @@ dependencies:
   - rsync
   - samtools>=1.11
   - scikit-bio
-  - scipy==1.8 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
+  - scipy==1.8.1 #force scipy 1.8 until scikit-bio updates to 1.9, https://github.com/KwanLab/Autometa/issues/285
   - scikit-learn==0.24 # prevent error from joblib in multiprocessing distance calculations
+  - seqkit
   - tqdm
   - trimap
   - tsne

autometa/binning/large_data_mode.py

Lines changed: 23 additions & 12 deletions
@@ -39,7 +39,7 @@
 from autometa.common import kmers

 from autometa.common.exceptions import TableFormatError, BinningError
-from autometa.taxonomy.ncbi import NCBI
+from autometa.taxonomy.database import TaxonomyDatabase
 from autometa.binning.recursive_dbscan import get_clusters
 from autometa.binning.utilities import (
     write_results,
@@ -306,17 +306,26 @@ def cluster_by_taxon_partitioning(
     Raises
     -------
     TableFormatError
-        No marker information is availble for contigs to be binned.
+        No marker information is available for contigs to be binned.
     FileNotFoundError
         Provided `binning_checkpoints_fpath` does not exist
+    TableFormatError
+        No marker information is availble for contigs to be binned.
     """
+    if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath):
+        raise FileNotFoundError(binning_checkpoints_fpath)
+
     if reverse_ranks:
         # species, genus, family, order, class, phylum, superkingdom
-        canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
+        canonical_ranks = [
+            rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"
+        ]
     else:
         # superkingdom, phylum, class, order, family, genus, species
         canonical_ranks = [
-            rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"
+            rank
+            for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
+            if rank != "root"
         ]
     # if stage is cached then we can first look to the cache before we begin subsetting main...
     clustered_contigs = set()
@@ -326,15 +335,17 @@ def cluster_by_taxon_partitioning(
     starting_rank_name_txt = None
     # Retrieve appropriate starting canonical rank and rank_name_txt from cached binning checkpoints if cache was provided
     if cache:
-        if binning_checkpoints_fpath and not os.path.exists(binning_checkpoints_fpath):
-            raise FileNotFoundError(binning_checkpoints_fpath)
+        if not os.path.exists(cache):
+            os.makedirs(os.path.realpath(cache), exist_ok=True)
+            logger.debug(f"Created cache dir: {cache}")
+        if not os.path.isdir(cache):
+            raise NotADirectoryError(cache)
         if not binning_checkpoints_fpath:
             binning_checkpoints_fpath = os.path.join(
                 cache, "binning_checkpoints.tsv.gz"
             )
-        if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(
-            binning_checkpoints_fpath
-        ):
+    if binning_checkpoints_fpath:
+        if os.path.exists(binning_checkpoints_fpath) and os.path.getsize(binning_checkpoints_fpath):
             checkpoint_info = get_checkpoint_info(binning_checkpoints_fpath)
             binning_checkpoints = checkpoint_info["binning_checkpoints"]
             starting_rank = checkpoint_info["starting_rank"]
@@ -392,9 +403,9 @@
             else None
         )
         # Create canonical rank cache outdir if it does not exist
-        rank_cache_outdir = os.path.join(cache, canonical_rank)
+        rank_cache_outdir = os.path.join(cache, canonical_rank) if cache else None
         if embedding_cache_fpath and not os.path.isdir(rank_cache_outdir):
-            os.makedirs(rank_cache_outdir)
+            os.makedirs(rank_cache_outdir, exist_ok=True)
         rank_embedding = get_kmer_embedding(
             rank_counts,
             cache_fpath=embedding_cache_fpath,
@@ -546,7 +557,7 @@ def cluster_by_taxon_partitioning(
             num_clusters += clustered.cluster.nunique()
             clusters.append(clustered)
             # Cache binning at rank_name_txt stage (rank-name-txt checkpointing)
-            if cache:
+            if binning_checkpoints_fpath:
                 binning_checkpoints = checkpoint(
                     checkpoints_df=binning_checkpoints,
                     clustered=clustered,
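Taken together, these hunks decouple checkpointing from the cache directory: the `binning_checkpoints_fpath` existence check moves to the top of the function, the cache directory is created on demand, and checkpoint writes are gated on `binning_checkpoints_fpath` rather than `cache`. A minimal sketch of the resulting setup logic, using a hypothetical helper name that is not part of this commit:

import os

def resolve_binning_checkpoints(cache: str = None, checkpoints_fpath: str = None) -> str:
    # Hypothetical helper distilled from the hunks above (illustrative only).
    # Fail fast when an explicit checkpoints file is given but missing.
    if checkpoints_fpath and not os.path.exists(checkpoints_fpath):
        raise FileNotFoundError(checkpoints_fpath)
    if cache:
        # Create the cache dir on demand instead of requiring it to pre-exist.
        os.makedirs(os.path.realpath(cache), exist_ok=True)
        if not os.path.isdir(cache):
            raise NotADirectoryError(cache)
        # Default the checkpoints file into the cache dir.
        checkpoints_fpath = checkpoints_fpath or os.path.join(
            cache, "binning_checkpoints.tsv.gz"
        )
    # Downstream, checkpoint reads and writes happen only if this is truthy.
    return checkpoints_fpath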

autometa/binning/recursive_dbscan.py

Lines changed: 7 additions & 3 deletions
@@ -24,7 +24,7 @@
 from autometa.common.markers import load as load_markers

 from autometa.common.exceptions import TableFormatError, BinningError
-from autometa.taxonomy.ncbi import NCBI
+from autometa.taxonomy.database import TaxonomyDatabase
 from autometa.binning.utilities import (
     write_results,
     read_annotations,
@@ -628,10 +628,14 @@ def taxon_guided_binning(
     logger.info(f"Using {method} clustering method")
     if reverse_ranks:
         # species, genus, family, order, class, phylum, superkingdom
-        ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
+        ranks = [rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"]
     else:
         # superkingdom, phylum, class, order, family, genus, species
-        ranks = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"]
+        ranks = [
+            rank
+            for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
+            if rank != "root"
+        ]
     starting_rank_index = ranks.index(starting_rank)
     ranks = ranks[starting_rank_index:]
     logger.debug(f"Using ranks: {', '.join(ranks)}")

autometa/binning/summary.py

Lines changed: 28 additions & 12 deletions
@@ -17,7 +17,9 @@

 from Bio import SeqIO

+from autometa.taxonomy.database import TaxonomyDatabase
 from autometa.taxonomy.ncbi import NCBI
+from autometa.taxonomy.gtdb import GTDB
 from autometa.taxonomy import majority_vote
 from autometa.common.markers import load as load_markers

@@ -226,16 +228,16 @@ def get_metabin_stats(


 def get_metabin_taxonomies(
-    bin_df: pd.DataFrame, ncbi: NCBI, cluster_col: str = "cluster"
+    bin_df: pd.DataFrame, taxa_db: TaxonomyDatabase, cluster_col: str = "cluster"
 ) -> pd.DataFrame:
     """Retrieve taxonomies of all clusters recovered from Autometa binning.

     Parameters
     ----------
     bin_df : pd.DataFrame
         Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks]
-    ncbi : autometa.taxonomy.ncbi.NCBI instance
-        Autometa NCBI class instance
+    taxa_db : autometa.taxonomy.ncbi.TaxonomyDatabase instance
+        Autometa NCBI or GTDB class instance
     cluster_col : str, optional
         Clustering column by which to group metabins
@@ -246,7 +248,9 @@ def get_metabin_taxonomies(
         Indexed by cluster
     """
     logger.info(f"Retrieving metabin taxonomies for {cluster_col}")
-    canonical_ranks = [rank for rank in NCBI.CANONICAL_RANKS if rank != "root"]
+    canonical_ranks = [
+        rank for rank in TaxonomyDatabase.CANONICAL_RANKS if rank != "root"
+    ]
     is_clustered = bin_df[cluster_col].notnull()
     bin_df = bin_df[is_clustered]
     outcols = [cluster_col, "length", "taxid", *canonical_ranks]
@@ -277,11 +281,13 @@
                 taxonomies[cluster][canonical_rank].update({taxid: length})
             else:
                 taxonomies[cluster][canonical_rank][taxid] += length
-    cluster_taxonomies = majority_vote.rank_taxids(taxonomies, ncbi)
+    cluster_taxonomies = majority_vote.rank_taxids(taxonomies, taxa_db=taxa_db)
     # With our cluster taxonomies, let's place these into a dataframe for easy data accession
     cluster_taxa_df = pd.Series(data=cluster_taxonomies, name="taxid").to_frame()
     # With the list of taxids, we'll retrieve their complete canonical-rank information
-    lineage_df = ncbi.get_lineage_dataframe(cluster_taxa_df.taxid.tolist(), fillna=True)
+    lineage_df = taxa_db.get_lineage_dataframe(
+        cluster_taxa_df.taxid.tolist(), fillna=True
+    )
     # Now put it all together
     cluster_taxa_df = pd.merge(
         cluster_taxa_df, lineage_df, how="left", left_on="taxid", right_index=True
@@ -323,11 +329,18 @@ def main():
         required=True,
     )
     parser.add_argument(
-        "--ncbi",
-        help="Path to user NCBI databases directory (Required for retrieving metabin taxonomies)",
+        "--dbdir",
+        help="Path to user taxonomy database directory (Required for retrieving metabin taxonomies)",
         metavar="dirpath",
         required=False,
     )
+    parser.add_argument(
+        "--dbtype",
+        help="Taxonomy database type to use (NOTE: must correspond to the same database type used during contig taxon assignment.)",
+        choices=["ncbi", "gtdb"],
+        required=False,
+        default="ncbi",
+    )
     parser.add_argument(
         "--binning-column",
         help="Binning column to use for grouping metabins",
@@ -377,14 +390,17 @@
     logger.info(f"Wrote metabin stats to {args.output_stats}")
     # Finally if taxonomy information is available then write out each metabin's taxonomy by modified majority voting method.
     if "taxid" in bin_df.columns:
-        if not args.ncbi:
+        if not args.dbdir:
             logger.warn(
-                "taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..."
+                "taxid found in dataframe. --dbdir argument is required to retrieve metabin taxonomies. Skipping..."
             )
         else:
-            ncbi = NCBI(dirpath=args.ncbi)
+            if args.dbtype == "ncbi":
+                taxa_db = NCBI(dbdir=args.dbdir)
+            elif args.dbtype == "gtdb":
+                taxa_db = GTDB(dbdir=args.dbdir)
             taxa_df = get_metabin_taxonomies(
-                bin_df=bin_df, ncbi=ncbi, cluster_col=args.binning_column
+                bin_df=bin_df, taxa_db=taxa_db, cluster_col=args.binning_column
             )
             taxa_df.to_csv(args.output_taxonomy, sep="\t", index=True, header=True)

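The `--ncbi` flag becomes a backend-agnostic `--dbdir`/`--dbtype` pair, and the selected class is passed downstream as a single `taxa_db` argument since both backends implement the `TaxonomyDatabase` interface. A sketch of the dispatch, wrapped in a hypothetical helper for illustration; only the NCBI/GTDB constructors and their `dbdir` keyword are taken from the diff:

from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.gtdb import GTDB

def load_taxa_db(dbtype: str, dbdir: str):
    # dbtype must match the database used during contig taxon assignment.
    if dbtype == "ncbi":
        return NCBI(dbdir=dbdir)
    if dbtype == "gtdb":
        return GTDB(dbdir=dbdir)
    raise ValueError(f"Unknown dbtype: {dbtype}")

# e.g. taxa_db = load_taxa_db(args.dbtype, args.dbdir)
#      taxa_df = get_metabin_taxonomies(
#          bin_df=bin_df, taxa_db=taxa_db, cluster_col=args.binning_column
#      )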

autometa/binning/utilities.py

Lines changed: 7 additions & 3 deletions
@@ -33,7 +33,7 @@

 from typing import Iterable, Tuple

-from autometa.taxonomy.ncbi import NCBI
+from autometa.taxonomy.database import TaxonomyDatabase


 logger = logging.getLogger(__name__)
@@ -98,7 +98,7 @@ def filter_taxonomy(df: pd.DataFrame, rank: str, name: str) -> pd.DataFrame:
         Provided `name` not found in `rank` column.
     """
     # First clean the assigned taxa by broadcasting lowercase and replacing any whitespace with underscores
-    for canonical_rank in NCBI.CANONICAL_RANKS:
+    for canonical_rank in TaxonomyDatabase.CANONICAL_RANKS:
         if canonical_rank not in df.columns:
             continue
         df[canonical_rank] = df[canonical_rank].map(
@@ -395,7 +395,11 @@ def write_results(
     outcols.extend(annotation_cols)
     # Add in taxonomy columns if taxa are present
     # superkingdom, phylum, class, order, family, genus, species
-    taxa_cols = [rank for rank in reversed(NCBI.CANONICAL_RANKS) if rank != "root"]
+    taxa_cols = [
+        rank
+        for rank in reversed(TaxonomyDatabase.CANONICAL_RANKS)
+        if rank != "root"
+    ]
     taxa_cols.append("taxid")
     # superkingdom, phylum, class, order, family, genus, species, taxid
     for taxa_col in taxa_cols:

autometa/common/external/bedtools.py

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ def parse(bed: str, out: str = None, force: bool = False) -> pd.DataFrame:
     df = pd.read_csv(bed, sep="\t", names=names, index_col="contig")
     df = df[df.index != "genome"]
     df = df.assign(depth_product=lambda x: x.depth * x.bases)
-    dff = df.groupby("contig")["depth_product", "bases"].sum()
+    dff = df.groupby("contig")[["depth_product", "bases"]].sum()
     dff = dff.assign(coverage=lambda x: x.depth_product / x.bases)
     if out and (not os.path.exists(out) or (os.path.exists(out) and force)):
         dff.to_csv(out, sep="\t", index=True, header=True)
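This one-line fix tracks a pandas API change: selecting multiple columns from a GroupBy with a bare tuple key, as in `df.groupby("contig")["depth_product", "bases"]`, was deprecated and is rejected by newer pandas releases; passing a list is the supported form. A toy reproduction with made-up coverage numbers:

import pandas as pd

# Toy data; values are illustrative only.
df = pd.DataFrame(
    {
        "contig": ["c1", "c1", "c2"],
        "depth_product": [10.0, 20.0, 5.0],
        "bases": [100, 200, 50],
    }
).set_index("contig")

# Old: df.groupby("contig")["depth_product", "bases"].sum()  -> tuple key, rejected
# New: the inner brackets pass a list of labels, selecting both columns.
dff = df.groupby("contig")[["depth_product", "bases"]].sum()
dff = dff.assign(coverage=lambda x: x.depth_product / x.bases)
print(dff)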

autometa/common/utilities.py

Lines changed: 19 additions & 0 deletions
@@ -27,6 +27,25 @@
 logger = logging.getLogger(__name__)


+def is_gz_file(filepath) -> bool:
+    """
+    Check if the given file is gzipped compressed or not.
+
+    Parameters
+    ----------
+    filepath : str
+        Filepath to check
+
+    Returns
+    -------
+    bool
+        True if file is gzipped else False
+    """
+    # https://stackoverflow.com/a/47080739
+    with open(filepath, "rb") as test_f:
+        return test_f.read(2) == b"\x1f\x8b"
+
+
 def unpickle(fpath: str) -> Any:
     """Load a serialized `fpath` from :func:`make_pickle`.
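The new `is_gz_file` helper sniffs the two-byte gzip magic number (`\x1f\x8b`) rather than trusting file extensions. A quick usage sketch, assuming autometa is importable; the file names are illustrative:

import gzip

from autometa.common.utilities import is_gz_file

# Write one gzipped and one plain file, then sniff both by magic bytes.
with gzip.open("example.tsv.gz", "wt") as fh:
    fh.write("contig\tcoverage\n")
with open("example.tsv", "w") as fh:
    fh.write("contig\tcoverage\n")

assert is_gz_file("example.tsv.gz")
assert not is_gz_file("example.tsv")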
