Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pairtools/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,4 +202,5 @@ def wrapper(*args, **kwargs):
filterbycov,
header,
scaling,
coverage,
)
181 changes: 181 additions & 0 deletions pairtools/cli/coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Command-line interface for coverage analysis.

This module provides a Click-based CLI that wraps the coverage_api module.
"""

import click
import bioframe

from ..lib.coverage import calculate_coverage, read_pairs, save_coverage
from . import cli, common_io_options


UTIL_NAME = "pairtools_coverage"


@cli.command()
@click.argument(
"pairs_path",
type=str,
required=False,
)
@click.option(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having two separate parameters "side" and "end", maybe it's better to allow the user to submit the list of columns to use for the coverage calculation? It's just a suggestion, might be not the smartest one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting idea. How would you specify it in case you want to have the coverage by midpoint or whole length, as you suggest below and I also wanted to implement anyway?

"--side",
"-s",
type=click.Choice(["0", "1", "2"]),
default="1",
show_default=True,
help="0: both sides, 1: first side, 2: second side",
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be a good idea to allow using (1) the midpoints of the reads for calculation and (2) whole length of the read (each nucleotide of the read adds +1 to the coverage)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, will add it! At least the midpoint I was planning anyway for sure.

@click.option(
"--end",
type=click.Choice(["5", "3", "0"]),
default="0",
show_default=True,
help="5', 3' end, or 0 (default) - whatever is reported as pos1 and pos2 in pairs",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. What is default "0" here?
  2. How does this parameter interact with the "side" parameter?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. Default is whatever is reported in pos1 or pos2 (so depends on how the pairs where generated).
  2. Well... It calculates coverage of 5', 3' or default fragment ends in the specified side (i.e. --side 1 --end 5 will use pos51, --side 2 --end 3 will use pos32)

)
@click.option(
"--shift",
type=int,
default=0,
show_default=True,
help="Shift value for strand-specific adjustment",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This description feels a bit incomplete. Maybe add a note on what is the zero point relative to which the shift wil be performed here? Also, what will be the direction of the shift?

)
@click.option(
"--window-size",
type=int,
default=10,
show_default=True,
help="Window size for binning",
)
@click.option(
"--chunksize",
type=int,
default=1000000,
show_default=True,
help="Number of pairs to process at once",
)
@click.option(
"--threads",
"-t",
type=int,
default=1,
show_default=True,
help="Number of threads",
)
@click.option(
"--chromsizes",
type=str,
default=None,
help="Chromosome order used to filter pairs: "
"path to a chromosomes file (e.g. UCSC chrom.sizes or similar) whose "
"first column lists scaffold names. If not provided, will be extracted "
"from the header of the input pairs file.",
)
@click.option(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to disable the output bgzip-/lz4c-file and have bigwig only? If bgzip-/lz4c- output is required, it shall have an enormous burden on the IO if you need bigwig only...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think in pairtools we always have a text output, no? How bioframe.to_bigwig works is it saves to text and then converts anyway, so the burden is only double... But I guess we can at least default to saving to stdout and then if you don't want it you dump it to /dev/null?

"-o",
"--output",
type=str,
default="",
help="output file."
" If the path ends with .gz or .lz4, the output is bgzip-/lz4c-compressed."
" By default, the output is printed into stdout.",
)
@click.option(
"--output-bigwig",
type=str,
default=None,
help="Output BigWig file (optional)",
)
@common_io_options
def coverage(
pairs_path,
side,
end,
shift,
window_size,
chunksize,
threads,
chromsizes,
output,
output_bigwig,
**kwargs,
):
"""Generate coverage from pairs file.

PAIRS_PATH : input .pairs/.pairsam file. If the path ends with .gz or .lz4, the
input is decompressed by bgzip/lz4c.
By default, the input is read from stdin.
"""
coverage_py(
pairs_path=pairs_path,
output=output,
side=side,
end=end,
shift=shift,
window_size=window_size,
chunksize=chunksize,
threads=threads,
chromsizes=chromsizes,
output_bigwig=output_bigwig,
**kwargs,
)


if __name__ == "__main__":
coverage()


def coverage_py(
pairs_path,
output,
side,
end,
shift,
window_size,
chunksize,
threads,
chromsizes,
output_bigwig,
**kwargs,
):
"""Generate coverage from pairs file.

PAIRS_PATH : input .pairs/.pairsam file. If the path ends with .gz or .lz4, the
input is decompressed by bgzip/lz4c.
By default, the input is read from stdin.
"""

# Handle chromsizes - either from file or extract from pairs header
# Read pairs and get chromsizes from header
pairs_chunks, chromsizes_dict = read_pairs(
pairs_path if pairs_path else "-", chunksize, threads
)
if chromsizes is not None:
chromsizes_dict = bioframe.read_chromsizes(
chromsizes, filter_chroms=False, natsort=False
)

# Calculate coverage using the API
coverage_df = calculate_coverage(
pairs_chunks=pairs_chunks,
chromsizes=chromsizes_dict,
side=side,
end=end,
shift=shift,
window_size=window_size,
threads=threads,
)

# Save the results
coverage_df.to_csv(output, sep="\t", index=False, header=False)

if output_bigwig is not None:
bioframe.to_bigwig(
coverage_df[["chrom", "start", "end", "count"]],
chromsizes_dict,
output_bigwig,
)
1 change: 1 addition & 0 deletions pairtools/lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@
from . import restrict
from . import stats
from . import select
from . import coverage
Loading