Skip to content

Commit 83d9ada

Browse files
committed
MSA documentation
Added the CLI information to the documentation in its own page. Cleaned up the CLI docstrings and added information about required docstrings. Added the preprocessing/msa functions to the API documentation. Thanks to Hope for helping test and take notes!
1 parent 9b21130 commit 83d9ada

File tree

10 files changed

+130
-43
lines changed

10 files changed

+130
-43
lines changed

docs/conf.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
sys.path.insert(0, os.path.abspath("../src"))
1313

14+
1415
import atomworks
1516

1617
project = "atomworks"
@@ -44,11 +45,17 @@
4445
"sphinx.ext.viewcode", # Add source code links
4546
"sphinx.ext.napoleon", # Google/NumPy style docstrings
4647
"sphinx_gallery.gen_gallery", # Generates auto_examples/ from examples/
48+
#"sphinx_click",
49+
"sphinxcontrib.typer"
4750
]
4851

4952
templates_path = ["_templates"]
50-
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "examples/GALLERY_HEADER.rst", "ml/preprocessing.rst"]
53+
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "examples/GALLERY_HEADER.rst"]#, "ml/preprocessing.rst"]
5154

55+
#autodoc_mock_imports = [
56+
# "zstandard",
57+
# "torch",
58+
#]
5259
# -- Options for HTML output -------------------------------------------------
5360
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
5461

docs/docs_requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ sphinx-autodoc-typehints>=1.20.0,<2
55
nbsphinx>=0.8.9,<1
66
sphinx-gallery>=0.8.1,<1
77
ghp-import>=2.0.0,<3
8-
pandoc>=2.0.0,<3
8+
pandoc>=2.0.0,<3
9+
sphinxcontrib-typer>=0.7.2

docs/index.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@ Welcome to **atomworks** — a toolkit for converting, parsing, and manipulating
2323
api_reference
2424
auto_examples/index
2525
contributor_guide
26-
mirrors
26+
mirrors
27+
msa

docs/ml.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ Core Modules
1111
ml/samplers
1212

1313
Data Processing Modules
14-
----------------------
14+
-----------------------
1515

1616
.. toctree::
1717
:maxdepth: 2
@@ -23,4 +23,5 @@ Data Processing Modules
2323
ml/transforms/dna
2424
ml/transforms/feature_aggregation
2525
ml/transforms/msa
26-
ml/utils
26+
ml/utils
27+
ml/preprocessing

docs/ml/preprocessing.rst

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,45 @@ Utilities
2323
---------
2424

2525
.. automodule:: atomworks.ml.preprocessing.utils
26+
:members:
27+
:undoc-members:
28+
:show-inheritance:
29+
30+
31+
MSA
32+
---
33+
34+
Note that the following functions can be called via the command line. See :doc:`../msa`
35+
for more details.
36+
37+
Finding
38+
^^^^^^^
39+
40+
.. automodule:: atomworks.ml.preprocessing.msa.finding
41+
:members:
42+
:undoc-members:
43+
:show-inheritance:
44+
45+
Filtering
46+
^^^^^^^^^
47+
48+
.. automodule:: atomworks.ml.preprocessing.msa.filtering
49+
:members:
50+
:undoc-members:
51+
:show-inheritance:
52+
53+
Generating
54+
^^^^^^^^^^
55+
56+
.. automodule:: atomworks.ml.preprocessing.msa.generating
57+
:members:
58+
:undoc-members:
59+
:show-inheritance:
60+
61+
Organizing
62+
^^^^^^^^^^
63+
64+
.. automodule:: atomworks.ml.preprocessing.msa.organizing
2665
:members:
2766
:undoc-members:
2867
:show-inheritance:

docs/msa.rst

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
Multiple Sequence Alignment in AtomWorks
2+
========================================
3+
4+
AtomWorks provides several command-line tools for Multiple Sequence Alignment (MSA) operations.
5+
6+
--------------
7+
8+
Find
9+
----
10+
11+
.. typer:: atomworks_cli.find:app
12+
:prog: atomworks msa find
13+
:show-nested:
14+
15+
Filter
16+
------
17+
.. typer:: atomworks_cli.filter:app
18+
:prog: atomworks msa filter
19+
:show-nested:
20+
21+
Generate
22+
--------
23+
.. typer:: atomworks_cli.generate:app
24+
:prog: atomworks msa generate
25+
:show-nested:
26+
27+
Organize
28+
--------
29+
.. typer:: atomworks_cli.organize:app
30+
:prog: atomworks msa organize
31+
:show-nested:

src/atomworks_cli/filter.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
def filter(
2222
input_dir: str = typer.Argument(
2323
...,
24-
help="Source directory containing MSA files to filter (supports glob patterns like '0*')",
24+
help="Source directory containing MSA files to filter (supports glob patterns like '0*').",
2525
),
2626
output_dir: Path = typer.Argument(
2727
None,
@@ -30,49 +30,49 @@ def filter(
3030
dir_okay=True,
3131
writable=True,
3232
resolve_path=True,
33-
help="Destination directory for filtered files. If not provided uses the input paths with the specified output extension.",
33+
help="Destination directory for filtered files. If not provided, this uses the input paths with the specified output extension.",
3434
),
3535
input_extension: str = typer.Option(
3636
MSAFileExtension.A3M_GZ.value,
3737
"--input-extension",
3838
"-i",
39-
help="File extension for input MSA files (e.g., .a3m, .a3m.gz, .a3m.zst, .afa, .afa.gz, .afa.zst)",
39+
help="File extension for input MSA files (e.g., .a3m, .a3m.gz, .a3m.zst, .afa, .afa.gz, .afa.zst).",
4040
),
4141
output_extension: str = typer.Option(
4242
MSAFileExtension.A3M_GZ.value,
4343
"--output-extension",
4444
"-o",
45-
help="File extension for output MSA files (e.g., .a3m, .a3m.gz, .a3m.zst, .afa, .afa.gz, .afa.zst)",
45+
help="File extension for output MSA files (e.g., .a3m, .a3m.gz, .a3m.zst, .afa, .afa.gz, .afa.zst).",
4646
),
4747
max_sequences: int = typer.Option(
4848
10_000,
4949
"--max-sequences",
5050
"--maxseq",
51-
help="Maximum number of sequences to keep in each MSA",
51+
help="Maximum number of sequences to keep in each MSA.",
5252
),
5353
max_identity: float = typer.Option(
5454
90.0,
5555
"--max-identity",
5656
"--id",
57-
help="Maximum pairwise sequence identity (%)",
57+
help="Maximum pairwise sequence identity (%).",
5858
),
5959
min_coverage: float = typer.Option(
6060
50.0,
6161
"--min-coverage",
6262
"--cov",
63-
help="Minimum coverage with query (%)",
63+
help="Minimum coverage with query (%).",
6464
),
6565
num_workers: int | None = typer.Option(
6666
None,
6767
"--num-workers",
6868
"-j",
69-
help="Number of parallel workers (defaults to min(CPU_COUNT, 16))",
69+
help="Number of parallel workers (defaults to min(CPU_COUNT, 16)).",
7070
),
7171
verbose: bool = typer.Option(
7272
False,
7373
"--verbose",
7474
"-v",
75-
help="Enable verbose logging",
75+
help="Enable verbose logging.",
7676
),
7777
) -> None:
7878
"""Filter MSA files using HHfilter to reduce sequence count and redundancy.
@@ -85,6 +85,8 @@ def filter(
8585
Can be applied to organized MSA files or any directory of MSA files.
8686
Automatic compression/decompression is applied based on the input and output file extensions.
8787
88+
Before using this command, users must have HHfilter installed and its path set. HHfilter is part of the HH-suite package.
89+
8890
Examples:
8991
# Filter files in a separate output directory
9092
atomworks msa filter ./msas ./filtered_msas --max-sequences 1000

src/atomworks_cli/find.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,41 +25,44 @@ def find(
2525
dir_okay=False,
2626
readable=True,
2727
resolve_path=True,
28-
help="CSV file containing protein sequences",
28+
help="Path and file name for a CSV file containing protein sequences.",
2929
),
3030
sequence_column: str | None = typer.Option(
3131
None,
3232
"--sequence-column",
3333
"-c",
34-
help="Name of column containing sequences (required if CSV has multiple columns)",
34+
help="Name of column containing sequences (required if CSV has multiple columns).",
3535
),
3636
existing_msa_dirs: str | None = typer.Option(
3737
None,
3838
"--existing-msa-dirs",
39-
help="Comma-separated MSA directories to find (uses LOCAL_MSA_DIRS env var if not specified)",
39+
help="Comma-separated list of directories containing MSA information. (Uses LOCAL_MSA_DIRS env var if not specified.)",
4040
),
4141
missing_output: Path | None = typer.Option(
4242
None,
4343
"--missing-output",
44-
help="Optional path to save CSV with missing sequences",
44+
help="Optional path and file name to save CSV with missing sequences.",
4545
),
4646
found_output: Path | None = typer.Option(
4747
None,
4848
"--found-output",
49-
help="Optional path to save CSV with found sequences and their MSA paths",
49+
help="Optional path and file name to save CSV with found sequences and their MSA paths.",
5050
),
5151
verbose: bool = typer.Option(
5252
False,
5353
"--verbose",
5454
"-v",
55-
help="Enable verbose logging",
55+
help="Enable verbose logging.",
5656
),
5757
) -> None:
5858
"""Find MSA files for sequences in a CSV file.
5959
6060
Analyzes a CSV file to find existing MSA files for sequences and
6161
optionally saves missing and found sequences to separate CSV files.
6262
63+
Unless --existing-msa-dirs is passed, set the LOCAL_MSA_DIRS environment variable
64+
to the directory where your MSA files are stored.
65+
6366
Examples:
6467
# Find MSAs for single-column CSV
6568
atomworks msa find sequences.csv

src/atomworks_cli/generate.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def generate(
3030
dir_okay=False,
3131
readable=True,
3232
resolve_path=True,
33-
help="CSV file containing protein sequences",
33+
help="Path to and file name of the CSV file containing protein sequences.",
3434
),
3535
output_dir: Path = typer.Argument(
3636
...,
@@ -39,78 +39,80 @@ def generate(
3939
dir_okay=True,
4040
writable=True,
4141
resolve_path=True,
42-
help="Output directory for generated MSA files",
42+
help="Output directory for generated MSA files.",
4343
),
4444
sequence_column: str | None = typer.Option(
4545
None,
4646
"--sequence-column",
4747
"-c",
48-
help="Name of column containing sequences (required if CSV has multiple columns)",
48+
help="Name of column containing sequences (required if CSV has multiple columns).",
4949
),
5050
# MSAGenerationConfig parameters
5151
sharding_pattern: str = typer.Option(
5252
"/0:2/",
5353
"--sharding-pattern",
5454
"-s",
55-
help="Directory sharding pattern (e.g., '/0:2/')",
55+
help="Directory sharding pattern (e.g., '/0:2/').",
5656
),
5757
output_extension: str = typer.Option(
5858
MSAFileExtension.A3M_GZ.value,
5959
"--output-extension",
6060
"-o",
61-
help="Output file extension (.a3m, .a3m.gz, .a3m.zst, .afa, .afa.gz, .afa.zst)",
61+
help="Output file extension (.a3m, .a3m.gz, .a3m.zst, .afa, .afa.gz, .afa.zst).",
6262
),
6363
gpu: bool | None = typer.Option(
6464
None,
6565
"--gpu/--no-gpu",
66-
help="Use GPU acceleration (auto-detects if not specified)",
66+
help="Use GPU acceleration (auto-detects if not specified).",
6767
),
6868
num_iterations: int = typer.Option(
6969
3,
7070
"--num-iterations",
7171
"-n",
72-
help="Number of MMseqs2 search iterations",
72+
help="Number of MMseqs2 search iterations.",
7373
),
7474
max_final_sequences: int = typer.Option(
7575
10_000,
7676
"--max-final-sequences",
77-
help="Maximum number of sequences in final MSAs",
77+
help="Maximum number of sequences in final MSAs.",
7878
),
7979
use_env: bool = typer.Option(
8080
True,
8181
"--use-env/--no-env",
82-
help="Include environmental (metagenomic) database",
82+
help="Include environmental (metagenomic) database.",
8383
),
8484
num_workers: int = typer.Option(
8585
32,
8686
"--num-workers",
8787
"-j",
88-
help="Number of CPU threads",
88+
help="Number of CPU threads.",
8989
),
9090
sensitivity: float | None = typer.Option(
9191
8.0,
9292
"--sensitivity",
93-
help="MMseqs2 sensitivity (lower = faster, sparser MSAs)",
93+
help="MMseqs2 sensitivity (lower = faster, sparser MSAs).",
9494
),
9595
verbose: bool = typer.Option(
9696
False,
9797
"--verbose",
9898
"-v",
99-
help="Enable verbose logging",
99+
help="Enable verbose logging.",
100100
),
101101
check_existing: bool = typer.Option(
102102
False,
103103
"--check-existing/--no-check-existing",
104-
help="Check for existing MSAs before generation",
104+
help="Check for existing MSAs before generation.",
105105
),
106106
existing_msa_dirs: str | None = typer.Option(
107107
None,
108108
"--existing-msa-dirs",
109-
help="Comma-separated MSA directories to check (uses LOCAL_MSA_DIRS env var if not specified)",
109+
help="Comma-separated list of MSA directories to check (uses LOCAL_MSA_DIRS env var if not specified).",
110110
),
111111
) -> None:
112112
"""Generate MSAs from sequences in a CSV file using MMseqs2.
113113
114+
Before using this command, users must first install MMseqs2.
115+
114116
Examples:
115117
# Single-column CSV
116118
atomworks msa generate sequences.csv output_msas/

0 commit comments

Comments
 (0)