iterative-pseudo-forced-alignment-ctc/search_on_speech.sh at main · ferugit/iterative-pseudo-forced-alignment-ctc · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
#
# Created by Fernando López Gavilánez (2023)
#
# This script generates sub-utterance alignments using the method proposed in https://arxiv.org/abs/2210.15226
#
# Requirements:
#   i) Audio files and a tsv file containing the following columns:
#       [Sample_ID, Sample_Path, Channel, Audio_Length, Start, End, Segment_Score, Transcription, Speaker_ID, Database] + others
#
#       *** The source temporal information is not used, so it can be filled with dummy values.
#
#       +---------------+-----+--------------------------------------------------------+
#       |     Name      | Use |                      Explanation                       |
#       +---------------+-----+--------------------------------------------------------+
#       | Sample_ID     | Yes | Unique sample identifier (e.g. AG-20210605_9.04_14.08) |
#       | Sample_Path   | Yes | Audio file path (e.g. /path/to/file/AG-20210605.wav)   |
#       | Channel       | No  | e.g. 1                                                 |
#       | Audio_Length  | No  | END_OF_SEGMENT - START_OF_SEGMENT                      |
#       | Start         | No  | START_OF_SEGMENT                                       |
#       | End           | No  | END_OF_SEGMENT                                         |
#       | Transcription | No  | Text to be aligned                                     |
#       | Speaker_ID    | No  | Speaker identifier                                     |
#       | Database      | No  | Database identifier                                    |
#       +---------------+-----+--------------------------------------------------------+
#
#   ii) An already trained ASR in the target language in the SpeechBrain framework (EncoderASR)
#
# Process:
#   i) Search wanted text in transcriptions
#   ii) Perform word-level alignment
#
# Results:
#   i) A tsv with the filtered data: data that contains the word in text reference
#   ii) A tsv with the wanted words aligned
#


#########################################################
###################### DEFINITIONS ######################
#########################################################

# ---- user configuration ----

# Alignment name, used to build the WIP folder (data/wip_<alignment_name>).
# Comment this line out to use a timestamp instead.
alignment_name='benedetti_sos'
# Source tsv file with the metadata.
tsv_path='data/wip_benedetti/results/benedetti_aligned.tsv'
# Text that will be searched in all segments.
speech_to_search='conmigo'

# Alignment corrections, all expressed in seconds (better applied afterwards).
# NOTE(review): these four values are not forwarded to the python call made
# by this script.
collar='0.0'        # collar applied to the alignment
offset_time='0.0'   # alignment shift to the right
left_offset='0.0'   # start shift
right_offset='0.0'  # end shift

# Trained ASR: model identifier and local directory given to the python call.
asr_hub='Voyager1/asr-wav2vec2-commonvoice-es'
asr_savedir='data/asr/'


#########################################################
####################### ALIGNMENT #######################
#########################################################


# Generate the work-in-progress (WIP) directory tree.
#
# Reads:  alignment_name (optional). When it is unset or empty, the current
#         Unix timestamp is used instead, so the folder is never "data/wip_".
# Writes: wip_dir, results_dir and logs_dir; the last two are passed to the
#         python call further down.
# Exits with status 1 when the directories cannot be created, instead of
# carrying on without a destination folder.
if [[ -n "${alignment_name:-}" ]]; then
    wip_dir="data/wip_${alignment_name}"
    echo "Alignment name defined, WIP folder is: ${wip_dir}"
else
    wip_dir="data/wip_$(date +%s)"
    echo "Alignment name not defined, WIP folder is: ${wip_dir}"
fi

results_dir="${wip_dir}/results"
logs_dir="${wip_dir}/logs"

# Quoted so that names containing spaces or glob characters stay one path.
# -p creates wip_dir itself as the parent of both sub-folders.
if ! mkdir -p -- "$results_dir" "$logs_dir"; then
    echo "Could not create WIP directories under: ${wip_dir}" >&2
    exit 1
fi

# Perform the alignment by invoking src/search_on_speech.py.
#
# Reads: tsv_path, results_dir, asr_hub, asr_savedir, logs_dir and
#        speech_to_search, each forwarded as one command-line option.
# Every expansion is quoted so that a value containing spaces or glob
# characters reaches python as a single argument.
# As the last command of the script, python's exit status becomes the
# script's exit status.
echo "Starting word-level alignment..."
# -u: unbuffered python output.
python -u src/search_on_speech.py \
    --tsv "$tsv_path" \
    --dst_path "$results_dir" \
    --asr_hub "$asr_hub" \
    --asr_savedir "$asr_savedir" \
    --logs_path "$logs_dir" \
    --text="$speech_to_search"