@@ -26,11 +26,15 @@ class Deps:
2626 """deps"""
2727
2828 phone_alignments : Union [FileArchiveBundle , FileArchive ]
29- phone_alignment_ms_per_frame : float
29+ phone_alignment_sec_per_frame : Decimal
3030 lexicon : Lexicon
31- labels_with_eoc_hdf : HDFDataset
3231 corpus : Dict [str , BlissItem ]
33- bpe_vocab : Vocabulary
32+ chunk_labels : HDFDataset
33+ eoc_idx : int
34+ chunk_bpe_vocab : Vocabulary
35+ chunk_left_padding : Decimal
36+ chunk_stride : Decimal
37+ chunk_size : Decimal
3438
3539
3640def uopen (path : str , * args , ** kwargs ):
@@ -310,11 +314,21 @@ def from_element(cls, e):
310314 return Lemma (orth , phon , synt , eval , special )
311315
312316
313- def get_sprint_word_ends (deps : Deps , segment_name : str ) -> List [int ]:
314- pass
def handle_segment(deps: Deps, segment_name: str) -> List[Decimal]:
    """
    Report the per-word emission latency for one segment.

    For every word of the segment's transcription, compares the word-end time
    from the phone alignment against the end time of the chunk that emitted
    the word, prints the pair, and returns the differences (chunk end minus
    alignment end) in seconds, one Decimal per word.
    """
    orth_words = deps.corpus[segment_name].orth.split()
    align_ends = get_phone_alignment_word_ends(deps, segment_name)
    chunk_ends = get_chunk_ends(deps, segment_name)
    # All three sequences must be word-aligned, one entry per word.
    assert len(align_ends) == len(chunk_ends) == len(orth_words)
    latencies = []
    for word, align_end, chunk_end in zip(orth_words, align_ends, chunk_ends):
        latency = chunk_end - align_end
        print(f"{word}: {align_end} vs {chunk_end}, latency: {latency} sec")
        latencies.append(latency)
    return latencies
315329
316330
317- def handle_segment (deps : Deps , segment_name : str ):
331+ def get_phone_alignment_word_ends (deps : Deps , segment_name : str ) -> List [ Decimal ] :
318332 """handle segment"""
319333 phone_alignment = deps .phone_alignments .read (segment_name , "align" )
320334 corpus_entry = deps .corpus [segment_name ]
@@ -323,6 +337,7 @@ def handle_segment(deps: Deps, segment_name: str):
323337 next_time_idx = 0
324338 word_idx = 0
325339 cur_word_phones = []
340+ res = []
326341 for time_idx , allophone_idx , state , weight in phone_alignment :
327342 assert next_time_idx == time_idx
328343 next_time_idx += 1
@@ -341,50 +356,119 @@ def handle_segment(deps: Deps, segment_name: str):
341356 if is_final :
342357 lemma = deps .lexicon .orth_to_lemma [words [word_idx ]]
343358 phones_s = " " .join (cur_word_phones )
344- print (f"end time { time_idx * deps .phone_alignment_ms_per_frame / 1000. } sec:" , lemma .orth [0 ], "/" , phones_s )
359+ print (f"end time { time_idx * deps .phone_alignment_sec_per_frame } sec:" , lemma .orth [0 ], "/" , phones_s )
345360 if phones_s not in lemma .phon :
346361 raise Exception (f"Phones { phones_s } not in lemma { lemma } ?" )
362+ res .append (time_idx * deps .phone_alignment_sec_per_frame )
347363
348364 cur_word_phones .clear ()
349365 word_idx += 1
350366 assert word_idx == len (words )
367+ return res
368+
369+
def get_chunk_ends(deps: Deps, segment_name: str) -> List[Decimal]:
    """
    Compute, for every word of the segment's transcription, the end time (in
    seconds) of the chunk in which that word finished being emitted.

    The chunk label sequence (from ``deps.chunk_labels``) is a BPE sequence
    interleaved with end-of-chunk symbols (``deps.eoc_idx``). Chunk k ends at
    ``chunk_left_padding + chunk_size + k * chunk_stride`` seconds, matching a
    RETURNN ``window`` layer with ``window_left=chunk_left_padding``,
    ``window_dim=chunk_size`` and ``stride=chunk_stride`` (all given in sec;
    e.g. a 150-frame window at 10 ms/frame with stride 120 gives
    chunk_size 1.5 sec and stride 1.2 sec).

    Raises AssertionError if the decoded BPE words do not reproduce the
    corpus transcription exactly.
    """
    corpus_entry = deps.corpus[segment_name]
    expected_words = corpus_entry.orth.split()
    bpe_labels = deps.chunk_labels.get_data_by_seq_tag(segment_name, "data")
    bpe_labels_s = deps.chunk_bpe_vocab.get_seq_labels(bpe_labels)
    print(bpe_labels)
    print(bpe_labels_s)
    chunk_count = 0
    # End time of the current (first) chunk.
    end_pos = deps.chunk_left_padding + deps.chunk_size
    pending = ""  # BPE pieces merged so far for the word in progress
    next_word = 0
    res = []
    for label_idx in bpe_labels:
        if label_idx == deps.eoc_idx:
            # End-of-chunk marker: move on to the next window position.
            chunk_count += 1
            end_pos += deps.chunk_stride
            continue
        assert next_word < len(expected_words), f"{bpe_labels_s!r} does not fit to {corpus_entry.orth!r}"
        piece = deps.chunk_bpe_vocab.id_to_label(label_idx)
        if piece.endswith("@@"):
            # Continuation piece: strip the BPE marker and keep merging.
            pending += piece[:-2]
            continue
        pending += piece
        assert (
            pending == expected_words[next_word]
        ), f"{pending!r} != {expected_words[next_word]!r} in {bpe_labels_s!r} != {corpus_entry.orth!r}"
        print(f"end time {end_pos} sec:", pending)
        res.append(end_pos)
        next_word += 1
        pending = ""
    assert next_word == len(expected_words) and not pending
    return res
351425
352426
353427def main ():
354428 """main"""
355429 arg_parser = argparse .ArgumentParser ()
356430 arg_parser .add_argument ("--phone-alignments" , required = True , help = "From RASR" )
357- arg_parser .add_argument ("--phone-alignment-ms -per-frame" , type = float , default = 10.0 )
431+ arg_parser .add_argument ("--phone-alignment-sec -per-frame" , type = Decimal , default = Decimal ( "0.01" ) )
358432 arg_parser .add_argument ("--allophone-file" , required = True , help = "From RASR" )
359433 arg_parser .add_argument ("--lexicon" , required = True , help = "XML" )
360434 arg_parser .add_argument ("--corpus" , required = True , help = "Bliss XML" )
361- arg_parser .add_argument ("--labels-with-eoc" , required = True , help = "HDF dataset" )
435+ arg_parser .add_argument ("--chunk-labels" , required = True , help = "HDF dataset" )
436+ arg_parser .add_argument ("--eoc-idx" , default = 0 , type = int , help = "End-of-chunk idx" )
437+ arg_parser .add_argument ("--chunk-bpe-vocab" , required = True , help = "BPE vocab dict" )
438+ arg_parser .add_argument (
439+ "--chunk-left-padding" , type = Decimal , required = True , help = "window_left in window layer, in sec"
440+ )
441+ arg_parser .add_argument ("--chunk-stride" , type = Decimal , required = True , help = "stride in window layer, in sec" )
442+ arg_parser .add_argument ("--chunk-size" , type = Decimal , required = True , help = "window_dim in window layer, in sec" )
362443 arg_parser .add_argument ("--segment" , nargs = "*" )
363- arg_parser .add_argument ("--bpe-vocab" , required = True , help = "BPE vocab dict" )
364444 args = arg_parser .parse_args ()
365445
366446 phone_alignments = open_file_archive (args .phone_alignments )
367447 phone_alignments .set_allophones (args .allophone_file )
368448
369449 lexicon = Lexicon (args .lexicon )
370450
371- dataset = HDFDataset ([args .labels_with_eoc ])
451+ dataset = HDFDataset ([args .chunk_labels ])
372452 dataset .initialize ()
373453 dataset .init_seq_order (epoch = 1 )
374454
375455 corpus = {}
376456 for item in iter_bliss (args .corpus ):
377457 corpus [item .segment_name ] = item
378458
379- bpe_vocab = Vocabulary (args .bpe_vocab , unknown_label = None )
459+ bpe_vocab = Vocabulary (args .chunk_bpe_vocab , unknown_label = None )
380460
381461 deps = Deps (
382462 phone_alignments = phone_alignments ,
383- phone_alignment_ms_per_frame = args .phone_alignment_ms_per_frame ,
463+ phone_alignment_sec_per_frame = args .phone_alignment_sec_per_frame ,
384464 lexicon = lexicon ,
385- labels_with_eoc_hdf = dataset ,
386465 corpus = corpus ,
387- bpe_vocab = bpe_vocab ,
466+ chunk_labels = dataset ,
467+ eoc_idx = args .eoc_idx ,
468+ chunk_bpe_vocab = bpe_vocab ,
469+ chunk_left_padding = args .chunk_left_padding ,
470+ chunk_stride = args .chunk_stride ,
471+ chunk_size = args .chunk_size ,
388472 )
389473
390474 for segment_name in args .segment or corpus :
0 commit comments