-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsearch_on_speech.sh
More file actions
executable file
·85 lines (73 loc) · 3.46 KB
/
search_on_speech.sh
File metadata and controls
executable file
·85 lines (73 loc) · 3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/bin/bash
#
# Created by Fernando López Gavilánez (2023)
#
# This script generates sub-utterance alignments using the method proposed in https://arxiv.org/abs/2210.15226
#
# Requirements:
# i) Audio files and a tsv file containing the following columns:
# [Sample_ID, Sample_Path, Channel, Audio_Length, Start, End, Segment_Score, Transcription, Speaker_ID, Database] + others
#
# *** The source temporal information is not used, can be dummy temporal information.
#
# +---------------+-----+--------------------------------------------------------+
# | Name | Use | Explanation |
# +---------------+-----+--------------------------------------------------------+
# | Sample_ID | Yes | Unique sample identifier (e.g. AG-20210605_9.04_14.08) |
# | Sample_Path | Yes | Audio file path (e.g. /path/to/file/AG-20210605.wav) |
# | Channel | No | e.g. 1 |
# | Audio_Length | No | END_OF_SEGMENT - START_OF_SEGMENT |
# | Start | No | START_OF_SEGMENT |
# | End | No | END_OF_SEGMENT |
# | Transcription | No | Text to be aligned |
# | Speaker_ID | No | Speaker identifier |
# | Database | No | Database identifier |
# +---------------+-----+--------------------------------------------------------+
#
# ii) An already trained ASR in the target language in the SpeechBrain framework (EncoderASR)
#
# Process:
# i) Search wanted text in transcriptions
# ii) Perform word-level alignment
#
# Results:
# i) A tsv with the filtered data: data that contains the word in text reference
# ii) A tsv with the wanted words aligned
#
#########################################################
###################### DEFINITIONS ######################
#########################################################
# config zone
#
# User-editable settings. Everything below is a plain shell variable that the
# rest of the script reads.

# Alignment name; it becomes the suffix of the WIP folder ("data/wip_<name>").
# Comment this line out to use a timestamp instead.
alignment_name='benedetti_sos'

# Source file with metadata (passed as --tsv).
tsv_path='data/wip_benedetti/results/benedetti_aligned.tsv'

# Text that will be searched in all segments (passed as --text).
speech_to_search='conmigo'

# Alignment corrections: better apply this after.
# NOTE(review): these four values are not referenced by the visible remainder
# of this script -- confirm whether they are still meant to be used.
collar='0.0'        # collar to alignment in seconds
offset_time='0.0'   # alignment shift to the right in seconds
left_offset='0.0'   # start shift in seconds
right_offset='0.0'  # end shift in seconds

# Trained ASR: model identifier (passed as --asr_hub) and the directory
# passed as --asr_savedir.
asr_hub='Voyager1/asr-wav2vec2-commonvoice-es'
asr_savedir='data/asr/'
#########################################################
####################### ALIGNMENT #######################
#########################################################
# generate WIP directories
#
# Picks the work-in-progress folder and creates it together with its
# "results" and "logs" sub-folders.
#   - alignment_name set and non-empty -> data/wip_<alignment_name>
#   - alignment_name unset or empty    -> data/wip_<seconds since the epoch>
# Sets: wip_dir, results_dir, logs_dir.
#
# The ":-" form treats an empty name like an unset one, so an empty value
# cannot produce a bare "data/wip_" folder.
if [[ -n "${alignment_name:-}" ]]; then
  wip_dir="data/wip_${alignment_name}"
  echo "Alignment name defined, WIP folder is: $wip_dir"
else
  wip_dir="data/wip_$(date +%s)"
  echo "Alignment name not defined, WIP folder is: $wip_dir"
fi
results_dir="${wip_dir}/results"
logs_dir="${wip_dir}/logs"

# Quoted so that names containing spaces or glob characters stay one path;
# stop here if the folders cannot be created instead of failing later.
if ! mkdir -p -- "$wip_dir" "$results_dir" "$logs_dir"; then
  echo "Could not create WIP folders under: $wip_dir" >&2
  exit 1
fi
# perform alignment
#
# Runs src/search_on_speech.py (path relative to the current directory) with
# the settings and folders defined earlier in this script. "-u" keeps Python's
# stdout/stderr unbuffered so progress shows up immediately.
# Every value is quoted so that each one reaches the Python script as exactly
# one argument, even if it contains spaces or glob characters or is empty.
echo "Starting word-level alignment..."
python -u src/search_on_speech.py \
  --tsv "$tsv_path" \
  --dst_path "$results_dir" \
  --asr_hub "$asr_hub" \
  --asr_savedir "$asr_savedir" \
  --logs_path "$logs_dir" \
  --text="$speech_to_search"