|
5 | 5 | # 2020 Jesus Villalba |
6 | 6 | # |
7 | 7 | # Usage: make_voxceleb1.pl /export/voxceleb1 data/ |
8 | | -# Create trial lists for Voxceleb1 original, Entire (E) and hard (H), |
| 8 | +# Create trial lists for Voxceleb1 original, |
9 | 9 | # with cleaned and non-cleaned versions |
| 10 | +# Attention: |
| 11 | +# - This script is for the old version of the dataset without anonymized speaker-ids |
| 12 | +# - This script assumes that the voxceleb1 dataset has all speaker directories |
| 13 | +# dumped in the same wav directory, NOT split into separate dev and test directories |
| 14 | + |
10 | 15 |
|
11 | 16 | if (@ARGV != 2) { |
12 | 17 | print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n"; |
|
26 | 31 | my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt"); |
27 | 32 | my @trials = ("trials_o", "trials_o_clean"); |
28 | 33 |
|
29 | | -open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; |
| 34 | +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; |
| 35 | +my $meta_path = "$data_base/vox1_meta.csv"; |
| 36 | +if (! -e "$meta_path") { # no metadata next to the dataset: fetch a copy into the output dir |
| 37 | + $meta_path = "$out_dir/vox1_meta.csv"; |
| 38 | + system("wget", "-O", $meta_path, $meta_url) == 0 or die "Could not download $meta_url to $meta_path"; # list form: no shell, paths with spaces are safe; abort instead of parsing an empty file |
| 39 | +} |
| 40 | + |
| 41 | +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; |
30 | 42 | my %id2spkr = (); |
| 43 | +my %spkr2gender = (); |
| 44 | +my %spkr2nation = (); |
31 | 45 | while (<META_IN>) { |
32 | | - chomp; |
33 | | - my ($vox_id, $spkr_id, $gender, $nation, $set) = split; |
34 | | - $id2spkr{$vox_id} = $spkr_id; |
35 | | - |
| 46 | + chomp; |
| 47 | + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; |
| 48 | + $id2spkr{$vox_id} = $spkr_id; |
| 49 | + $spkr2gender{$spkr_id} = $gender; |
| 50 | + $nation =~ s@ @-@g; |
| 51 | + $spkr2nation{$spkr_id} = $nation; |
36 | 52 | } |
37 | 53 | close(META_IN) or die; |
38 | 54 |
|
| 55 | +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; |
| 56 | +my $lid_path = "$data_base/lang_vox1_final.csv"; |
| 57 | +if (! -e "$lid_path") { # no language-id table next to the dataset: fetch a copy into the output dir |
| 58 | + $lid_path = "$out_dir/lang_vox1_final.csv"; |
| 59 | + system("wget", "-O", $lid_path, $lid_url) == 0 or die "Could not download $lid_url to $lid_path"; # list form: no shell; abort instead of parsing an empty file |
| 60 | +} |
| 61 | +open(LID_IN, "<", "$lid_path") or die "Could not open the language id file $lid_path"; |
| 62 | +my %utt2lang = (); # maps "<speaker>-<video>-00<file>" utterance ids to the language column |
| 63 | +while (<LID_IN>) { |
| 64 | + chomp; |
| 65 | + my ($wav_path, $lang, $score) = split ','; # first column is a "<vox_id>/<video_id>/<file>.wav" path |
| 66 | + my ($vox_id, $vid_id, $file_id) = split '/', $wav_path; |
| 67 | + my $spkr_id = $id2spkr{$vox_id}; |
| 68 | + my $utt_id = "$spkr_id-$vid_id-00$file_id"; # "00" prefix pads the file number to the segment width used for wav.scp keys |
| 69 | + $utt_id =~ s@\.wav$@@; |
| 70 | + $utt2lang{$utt_id} = $lang; |
| 71 | +} |
| 72 | +close(LID_IN) or die; |
| 73 | + |
39 | 74 | #download trials from voxceleb web page |
40 | | -my %valid_utts = (); |
41 | 75 | for($i = 0; $i <= $#trials; $i++) { |
42 | 76 |
|
43 | 77 | my $file_i = "$out_dir/$trials_basename[$i]"; |
|
70 | 104 | $target = "target"; |
71 | 105 | } |
72 | 106 | print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; |
73 | | - $valid_utts{$utt_id1} = 1; |
74 | | - $valid_utts{$utt_id2} = 1; |
75 | 107 | } |
76 | 108 |
|
77 | 109 | close(TRIAL_IN) or die; |
|
84 | 116 | my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); |
85 | 117 | closedir $dh; |
86 | 118 |
|
87 | | -open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; |
88 | | -open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; |
| 119 | +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; |
| 120 | +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; |
| 121 | +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; |
| 122 | +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; |
| 123 | +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; |
89 | 124 |
|
90 | 125 | foreach (@spkr_dirs) { |
91 | 126 | my $spkr_id = $_; |
|
95 | 130 | if (exists $id2spkr{$spkr_id}) { |
96 | 131 | $new_spkr_id = $id2spkr{$spkr_id}; |
97 | 132 | } |
| 133 | + print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; |
| 134 | + print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; |
| 135 | + |
98 | 136 | opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; |
99 | 137 | my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); |
100 | 138 | closedir $dh; |
|
104 | 142 | my $segment = substr($filename, 12, 7); |
105 | 143 | my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; |
106 | 144 | my $utt_id = "$new_spkr_id-$rec_id-$segment"; |
107 | | - if (exists $valid_utts{$utt_id}) { |
108 | | - print WAV_TEST "$utt_id", " $wav", "\n"; |
109 | | - print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; |
| 145 | + print WAV "$utt_id", " $wav", "\n"; |
| 146 | + print SPKR "$utt_id", " $new_spkr_id", "\n"; |
| 147 | + if (exists $utt2lang{$utt_id}) { |
| 148 | + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; |
| 149 | + } |
| 150 | + else { |
| 151 | + print LANG "$utt_id N/A\n"; |
110 | 152 | } |
111 | 153 | } |
112 | 154 | } |
113 | 155 |
|
114 | | -close(SPKR_TEST) or die; |
115 | | -close(WAV_TEST) or die; |
| 156 | +close(SPKR) or die; |
| 157 | +close(WAV) or die; |
| 158 | +close(LANG) or die; |
| 159 | +close(GENDER) or die; |
| 160 | +close(NAT) or die; |
116 | 161 |
|
117 | 162 | if (system( |
118 | 163 | "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) { |
|
0 commit comments