Skip to content

Commit c789d59

Browse files
authored
Merge pull request #104 from Genentech/more-docs
2 parents 21227de + 5ca38f5 commit c789d59

37 files changed

+158
-45
lines changed

src/grelu/data/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""
2+
`grelu.data` contains modules related to processing and QC of genomic data, and loading
3+
and augmenting genomic data for training, validation and testing sequence-to-function models.
4+
"""

src/grelu/data/augment.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
"""
2-
Functions to augment data. All functions assume that the input is a numpy array containing an integer
3-
encoded DNA sequence of shape (L,) or a numpy array containing a label of shape (T, L).
4-
The augmented output will be in the same format.
2+
`grelu.data.augment` contains functions to augment genomic sequences or functional genomic data.
3+
4+
All functions assume that the input is either:
5+
6+
(1) a 1-D numpy array containing an integer encoded DNA sequence of shape (length,) or;
7+
(2) a 2-D numpy array containing a label of shape (tasks, length).
8+
9+
The augmented output must be returned in the same format. All augmentation functions also
10+
require an index (idx) which is an integer or boolean value.
11+
12+
This module also contains the `Augmenter` class which is responsible for applying multiple
13+
augmentations to a given DNA sequence or (sequence, label) pair.
514
"""
615

716
import warnings
@@ -12,7 +21,7 @@
1221
from grelu.sequence.mutate import random_mutate
1322
from grelu.sequence.utils import reverse_complement
1423

15-
# This is the number of output sequences expected from each type of augmentation
24+
# This is the number of output sequences expected from a single input sequence using each type of augmentation
1625
AUGMENTATION_MULTIPLIER_FUNCS = {
1726
"rc": lambda x: 2**x,
1827
"max_seq_shift": lambda x: (2 * x) + 1,
@@ -197,7 +206,7 @@ def __call__(
197206
else:
198207
raise NotImplementedError
199208

200-
# Augment the sequence
209+
# Apply all sequence augmentation functions here
201210

202211
# Shift sequence
203212
if self.shift_seq:
@@ -224,7 +233,7 @@ def __call__(
224233
return seq
225234

226235
else:
227-
# Augment the label too
236+
# Apply all label augmentation functions here
228237
if self.shift_label:
229238
# Shift label
230239
label = shift(label, seq_len=self.label_len, idx=pair_shift_idx)

src/grelu/data/dataset.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
"""
2-
Pytorch dataset classes to load sequence data
2+
`grelu.data.dataset` contains specialized PyTorch Dataset classes to load genomic data.
3+
All dataset classes must inherit from `torch.utils.data.Dataset`.
34
4-
All dataset classes produce either one-hot encoded sequences of shape (4, L)
5-
or sequence-label pairs of shape (4, L) and (T, L).
5+
Dataset classes intended for inference produce 2-D tensors of shape (4, length),
6+
containing one-hot encoded sequences.
7+
8+
Dataset classes intended for training and validation produce (sequence, label)
9+
pairs, wherein the sequence is a 2-D tensor of shape (4, length) containing a one-hot
10+
encoded sequence, and the label is a 2-D tensor of shape (tasks, length).
611
"""
712

813
import os

src/grelu/data/preprocess.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
"""
2-
Functions to preprocess genomic datasets.
2+
`grelu.data.preprocess` contains functions to preprocess genomic datasets in standard
3+
formats, in order to produce data suitable for deep learning. This includes filtering
4+
and checking data, splitting data into sets for training and validation, and converting
5+
between data formats.
36
"""
47

58
import os

src/grelu/data/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Dataset-related utility functions.
2+
`grelu.data.utils` contains Dataset-related utility functions.
33
"""
44

55
from typing import List, Union

src/grelu/design.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
"""
2+
`grelu.design` contains methods to design novel DNA sequences
3+
using trained sequence-to-function deep learning models.
4+
"""
5+
16
from typing import Callable, List, Optional, Union
27

38
import numpy as np

src/grelu/interpret/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""
2+
`grelu.interpret` contains modules related to sequence interpretation using trained models.
3+
"""

src/grelu/interpret/modisco.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
"""
2+
`grelu.interpret.modisco` contains functions that enable the user to run TF-MoDISco
3+
(Shrikumar et al. 2018) on trained models. Many of the functions here are based on
4+
https://github.com/jmschrei/tfmodisco-lite.
5+
"""
6+
17
import os
28
from typing import Callable, List, Optional, Union
39

src/grelu/interpret/motifs.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
"""
2-
Functions related to manipulating sequence motifs and scanning DNA sequences with motifs.
2+
`grelu.interpret.motifs` contains functions related to manipulating sequence motifs
3+
and scanning DNA sequences with motifs. Note that the aim here is not to provide
4+
a comprehensive suite of functions related to motif analysis, but only the
5+
functionality necessary for interpreting sequence-to-function deep learning models
6+
using these motifs.
37
"""
48

59
from typing import Callable, Dict, Generator, List, Optional, Tuple, Union

src/grelu/interpret/score.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
"""
2-
Functions related to scoring the importance of individual DNA bases.
2+
`grelu.interpret.score` contains functions related to scoring the importance of
3+
individual DNA bases or regions using a trained model.
4+
5+
gReLU uses Captum for several attribution methods, including InputXGradient,
6+
IntegratedGradients, and Saliency.
37
"""
48

59
import warnings

0 commit comments

Comments
 (0)