Skip to content

Commit 4396eab

Browse files
feat: add xy embedding to alignment
1 parent 4c2e66a commit 4396eab

File tree

6 files changed

+146
-65
lines changed

6 files changed

+146
-65
lines changed

README.md

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

2-
# Aligner - PyTorch
2+
# Aligner - PyTorch
33

4-
Sequence alignement methods with helpers for PyTorch.
4+
Sequence alignment methods with helpers for PyTorch.
55

66
## Install
77

@@ -16,12 +16,12 @@ pip install aligner-pytorch
1616

1717
### MAS
1818

19-
MAS (Monotonic Alignment Search) from GlowTTS. This can be used to get the alignment of any (similarity) matrix. Implementation in optimized Cython.
19+
MAS (Monotonic Alignment Search) from GlowTTS. This can be used to get the alignment of any (similarity) matrix. Implementation in optimized Cython.
2020

2121
```py
22-
from aligner_pytorch import mas
22+
from aligner_pytorch import mas
2323

24-
sim = torch.rand(1, 4, 6) # [batch_size, m_rows, n_cols]
24+
sim = torch.rand(1, 4, 6) # [batch_size, x_length, y_length]
2525
alignment = mas(sim)
2626

2727
"""
@@ -41,14 +41,60 @@ alignment = tensor([[
4141
"""
4242
```
4343

44+
### XY Embedding to Alignment
45+
Used during training to get the alignment of an `x_embedding` with a `y_embedding`; it computes the log probability from a normal distribution and the alignment with MAS.
46+
```py
47+
from aligner_pytorch import get_alignment_from_embeddings
48+
49+
x_embedding = torch.randn(1, 4, 10)
50+
y_embedding = torch.randn(1, 6, 10)
51+
52+
alignment = get_alignment_from_embeddings(
53+
x_embedding=torch.randn(1, 4, 10), # [batch_size, x_length, features]
54+
y_embedding=torch.randn(1, 6, 10), # [batch_size, y_length, features]
55+
) # [batch_size, x_length, y_length]
56+
57+
"""
58+
alignment = tensor([[
59+
[1, 0, 0, 0, 0, 0],
60+
[0, 1, 0, 0, 0, 0],
61+
[0, 0, 1, 0, 0, 0],
62+
[0, 0, 0, 1, 1, 1]
63+
]], dtype=torch.int32)
64+
"""
65+
```
66+
67+
### Duration Embedding to Alignment
68+
Used during inference to compute the alignment from a trained duration embedding.
69+
```py
70+
from aligner_pytorch import get_alignment_from_duration_embedding
71+
72+
alignment = get_alignment_from_duration_embedding(
73+
embedding=torch.randn(1, 5), # Embedding: [batch_size, x_length]
74+
scale=1.0, # Duration scale
75+
y_length=10 # (Optional) fixes maximum output y_length
76+
) # Output alignment [batch_size, x_length, y_length]
77+
78+
"""
79+
alignment = tensor([[
80+
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
81+
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
82+
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
83+
[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
84+
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
85+
]])
86+
"""
87+
```
88+
89+
4490
## Citations
4591

46-
Monotonic Alignment Search
92+
Monotonic Alignment Search
4793
```bibtex
4894
@misc{2005.11129,
4995
Author = {Jaehyeon Kim and Sungwon Kim and Jungil Kong and Sungroh Yoon},
5096
Title = {Glow-TTS: A Generative Flow for Text-to-Speech via Monotonic Alignment Search},
5197
Year = {2020},
5298
Eprint = {arXiv:2005.11129},
5399
}
54-
```
100+
```

aligner_pytorch/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
from aligner_pytorch.mas import mas
2+
from .aligner import * # noqa

aligner_pytorch/aligner.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import torch
2+
import math
3+
from torch.nn import functional as F
4+
from torch import Tensor
5+
from typing import Optional
6+
from einops import rearrange, reduce, repeat
7+
from .utils import exists
8+
from .mas import mas
9+
10+
11+
@torch.no_grad()
def get_alignment_from_embeddings(
    x_embedding: Tensor,
    y_embedding: Tensor,
    x_mask: Optional[Tensor] = None,
) -> Tensor:
    """Compute a hard MAS alignment between two embedding sequences.

    The pairwise similarity is the log PDF of an isotropic unit-variance
    Gaussian, log N(x | mu=y, sigma=I), evaluated for every (x, y) pair and
    passed through monotonic alignment search.

    Args:
        x_embedding: [batch_size, x_length, features] embeddings.
        y_embedding: [batch_size, y_length, features] embeddings.
        x_mask: optional [batch_size, x_length] boolean mask over x positions.

    Returns:
        [batch_size, x_length, y_length] alignment matrix from MAS.
    """
    _, ty, d = y_embedding.shape
    # log N(x|mu=y, I) = const - 0.5*||y||^2 + <x, y> - 0.5*||x||^2, expanded
    # termwise so the [b, tx, ty] matrix is built without materializing x - y.
    const = -0.5 * math.log(2 * math.pi) * d
    # Scalar -0.5 multiplies replace the original full-size tensor of -0.5s
    # ("factor") and its extra einsum: algebraically identical, allocation-free.
    y_sq = -0.5 * reduce(y_embedding**2, "b ty d -> b 1 ty", "sum")
    x_sq = -0.5 * reduce(x_embedding**2, "b tx d -> b tx 1", "sum")
    xy = torch.einsum("b x d, b y d -> b x y", x_embedding, y_embedding)
    log_prior = y_sq + xy + x_sq + const
    # Broadcast the x mask across all y positions if provided
    a_mask = repeat(x_mask, "b tx -> b tx ty", ty=ty) if exists(x_mask) else None
    # Compute MAS alignment
    alignment = mas(log_prior, mask=a_mask)
    return alignment
30+
31+
32+
def get_sequential_masks(lengths: Tensor, length_max: Optional[int] = None) -> Tensor:
    """Build boolean sequence masks from per-item lengths.

    Args:
        lengths: [b] tensor of sequence lengths.
        length_max: optional number of columns; defaults to ``lengths.max()``.

    Returns:
        [b, length_max] boolean tensor, True where the position index is
        smaller than that item's length.
    """
    if length_max is None:
        length_max = int(lengths.max().item())
    length_max = int(length_max)
    # Compare a row of positions [1, n] against a column of lengths [b, 1]
    positions = torch.arange(length_max).to(lengths).unsqueeze(0)
    bounds = lengths.unsqueeze(-1)
    return positions < bounds
39+
40+
41+
def get_alignment_from_duration(
    duration: Tensor,
    mask: Tensor,
) -> Tensor:
    """Expand per-token durations into a hard monotonic alignment matrix.

    Args:
        duration: [b, tx] duration assigned to each x position.
        mask: [b, tx, ty] mask limiting the valid alignment cells.

    Returns:
        [b, tx, ty] long tensor where row i is 1 exactly over the span of y
        frames assigned to token i by the cumulative durations.
    """
    b, tx, ty = mask.shape
    duration_cum = torch.cumsum(duration, dim=1)
    # Staircase mask built directly on [b, tx, ty]: cell (i, j) is True iff
    # j < cumsum(duration)[i]. This replaces the flatten -> helper -> unflatten
    # round trip with one broadcasted comparison, same values.
    steps = torch.arange(ty).to(duration_cum).view(1, ty)
    paths = (steps < duration_cum.view(-1, 1)).view(b, tx, ty)
    # Each row also covers its predecessors' span; shift the staircase down one
    # row, invert, and multiply to leave a single run of 1s per row.
    paths_mask = ~F.pad(paths, pad=(0, 0, 1, 0))[:, :-1, :]
    # Get single path and mask unused cells
    paths = paths * paths_mask * mask
    return paths.long()
57+
58+
59+
@torch.no_grad()
def get_alignment_from_duration_embedding(
    embedding: Tensor,  # [b, tx]
    scale: float = 1.0,
    mask: Optional[Tensor] = None,  # [b, tx]
    y_length: Optional[int] = None,
) -> Tensor:  # [b, tx, ty]
    """Turn a predicted log-duration embedding into a hard alignment matrix.

    Args:
        embedding: [b, tx] log-domain duration predictions.
        scale: multiplier applied to the (ceiled) durations.
        mask: optional [b, tx] boolean mask over valid x positions.
        y_length: optional fixed number of output columns ty.

    Returns:
        [b, tx, ty] long alignment matrix.
    """
    b, tx = embedding.shape
    device = embedding.device
    # Every x position is valid unless a mask says otherwise
    x_mask = mask if mask is not None else torch.ones((b, tx), device=device).bool()
    assert x_mask.shape == embedding.shape, "mask must have same shape as embedding"
    # Durations: exponentiate out of log domain, ceil, scale, zero masked slots.
    # NOTE(review): scale is applied after ceil, so durations are non-integer
    # whenever scale != 1.0 — confirm this ordering is intended.
    duration = torch.exp(embedding)
    duration = torch.ceil(duration) * scale
    duration = duration * x_mask
    # Per-item total length, clamped so every item spans at least one frame
    duration_total = torch.clamp_min(duration.sum(dim=1), 1).long()
    # Output width: caller-fixed, or the longest item in the batch
    duration_max = y_length if y_length is not None else int(duration_total.max())
    # Valid-cell mask over the [tx, ty] grid
    y_mask = get_sequential_masks(lengths=duration_total, length_max=duration_max)  # type: ignore # noqa
    a_mask = x_mask.unsqueeze(-1) * y_mask.unsqueeze(1)
    # Expand the masked durations into the final alignment paths
    return get_alignment_from_duration(duration=duration, mask=a_mask)

aligner_pytorch/mas.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,22 @@
22
import torch
33
from torch import Tensor
44
from typing import Optional
5+
from .utils import exists
56

67

78
def mas(x: Tensor, mask: Optional[Tensor] = None) -> Tensor:
8-
b, m, n, device = *x.shape, x.device
9+
device = x.device
910

1011
values = x.detach().clone().to(dtype=torch.float32, device="cpu").numpy()
1112
paths = torch.zeros_like(x, dtype=torch.int32, device="cpu").numpy()
1213

14+
mask = mask.clone() if exists(mask) else torch.ones_like(x)
15+
mask = mask.to(dtype=torch.int32, device="cpu").numpy()
16+
17+
# ms = reduce(mask, 'b m n -> b m', 'sum')[:, 0]
18+
# ns = reduce(mask, 'b m n -> b n', 'sum')[:, 0]
19+
20+
b, m, n = x.shape
1321
ms = torch.tensor([m], dtype=torch.int32).repeat(b).numpy()
1422
ns = torch.tensor([n], dtype=torch.int32).repeat(b).numpy()
1523

aligner_pytorch/utils.py

Lines changed: 0 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,8 @@
1-
import torch
2-
from torch.nn import functional as F
3-
from torch import Tensor
41
from typing import TypeVar, Optional
52
from typing_extensions import TypeGuard
6-
from einops import rearrange, reduce
73

84
T = TypeVar("T")
95

106

117
def exists(val: Optional[T]) -> TypeGuard[T]:
    """Return True iff *val* is not None, narrowing its type for type checkers."""
    return val is not None
13-
14-
15-
def get_sequential_masks(lengths: Tensor, length_max: Optional[int] = None) -> Tensor:
16-
if not exists(length_max):
17-
length_max = int(lengths.max().item())
18-
length_max = int(length_max)
19-
x = rearrange(torch.arange(length_max).to(lengths), "n -> 1 n")
20-
y = rearrange(lengths, "b -> b 1")
21-
return x < y
22-
23-
24-
def get_alignment_from_duration(
25-
duration: Tensor,
26-
mask: Tensor,
27-
) -> Tensor:
28-
b, tx, ty = mask.shape
29-
duration_cum = torch.cumsum(duration, dim=1)
30-
# Compute paths matrix filled with True on the lower diagonal
31-
paths = get_sequential_masks(
32-
lengths=rearrange(duration_cum, "b tx -> (b tx)"), length_max=ty
33-
)
34-
paths = rearrange(paths, "(b tx) ty -> b tx ty", b=b)
35-
# Get mask paths matrix to get only a single path by padding top and inverting
36-
paths_mask = ~F.pad(paths, pad=(0, 0, 1, 0))[:, :-1, :]
37-
# Get single path and mask unused
38-
paths = paths * paths_mask * mask
39-
return paths.long()
40-
41-
42-
def get_alignment_from_duration_embedding(
43-
embedding: Tensor, # [b, tx]
44-
scale: float = 1.0,
45-
mask: Optional[Tensor] = None, # [b, tx]
46-
max_length: Optional[int] = None,
47-
) -> Tensor: # [b, tx, ty]
48-
b, tx, device = *embedding.shape, embedding.device
49-
# Default mask to all xs if not provided
50-
x_mask = mask if exists(mask) else torch.ones((b, tx), device=device).bool()
51-
assert x_mask.shape == embedding.shape, "mask must have same shape as embedding"
52-
# Get int duration by exponentiating and ceiling, then scaling by duration scale
53-
duration = torch.exp(embedding)
54-
duration = torch.ceil(duration) * scale
55-
duration = duration * x_mask
56-
# Compute total duration per item (clamp if below 1)
57-
duration_total = torch.clamp_min(reduce(duration, "b tx -> b", "sum"), 1).long()
58-
# Get max duration over all items
59-
duration_max = max_length if exists(max_length) else int(duration_total.max())
60-
# Get ys mask and attn matrix mask
61-
y_mask = get_sequential_masks(lengths=duration_total, length_max=duration_max) # type: ignore # noqa
62-
a_mask = rearrange(x_mask, "b tx -> b tx 1") * rearrange(y_mask, "b ty -> b 1 ty")
63-
# Get masked attn paths from duration
64-
return get_alignment_from_duration(duration=duration, mask=a_mask)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
setup(
99
name="aligner-pytorch",
10-
version="0.0.17",
10+
version="0.0.18",
1111
packages=find_packages(),
1212
license="MIT",
1313
description="Aligner - PyTorch",

0 commit comments

Comments
 (0)