-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathshowing.py
More file actions
157 lines (131 loc) · 4.91 KB
/
showing.py
File metadata and controls
157 lines (131 loc) · 4.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import Button
import pandas as pd
import webrtcvad
import torchaudio
class csample:
    """Load one audio file and run WebRTC voice-activity detection on it.

    The waveform is split into ``chunk_num`` equal frames (20 ms frames for
    a 1 s clip at 16 kHz with chunk_num=50) and each frame is classified as
    speech / non-speech.
    """

    def __init__(self, path, vad_mode, chunk_num, vad):
        # `vad` is accepted for interface compatibility, but a fresh detector
        # is created so every sample gets an independent VAD state.
        self.path = path
        self.vad = webrtcvad.Vad()
        self.vad.set_mode(vad_mode)
        self.chunk_num = chunk_num
        # NOTE(review): assumes `path` points at 1 s of 16 kHz audio — the
        # chunking below requires the length to divide evenly by chunk_num.
        self.sample, self.fs = torchaudio.load(self.path)

    def wave_f32toint16(self, sample):
        """Convert a float32 waveform in [-1, 1] to int16 PCM.

        Scales by 32767 (2**15 - 1), not 32768: multiplying a full-scale
        1.0 sample by 32768 wraps around to -32768 on the int16 cast.
        """
        return (sample * (2**15 - 1)).numpy().astype(np.int16)

    def reset_vad(self):
        # Placeholder kept for interface compatibility; no per-sample VAD
        # state is held that needs resetting.
        return

    def sample_vad(self):
        """Return a list of booleans, one per chunk: True where speech."""
        sample = self.wave_f32toint16(self.sample)
        mono = sample[:1, :].T  # keep only the first channel
        # Derive the frame length from the data instead of hard-coding 320,
        # so chunk_num other than 50 (or rates other than 16 kHz) work.
        frame_len = mono.shape[0] // self.chunk_num
        chunks = np.reshape(mono[: frame_len * self.chunk_num],
                            (self.chunk_num, frame_len, 1))
        # webrtcvad expects raw 16-bit mono PCM bytes plus the sample rate.
        return [self.vad.is_speech(chunk.tobytes(), self.fs)
                for chunk in chunks]

    def get_result(self):
        """Return (int16 waveform, per-chunk VAD booleans)."""
        return self.wave_f32toint16(self.sample), self.sample_vad()
class Waveform_Aranger:
    """Run chunked VAD over audio files and locate the speech region.

    Parameters
    ----------
    vad_mode : int
        webrtcvad aggressiveness, 0 (least) to 3 (most aggressive).
    sample_rate : int
        Expected sample rate of the input files, in Hz.
    chunk_num : int
        Number of equal chunks one second of audio is split into.
    """

    def __init__(self, vad_mode=2, sample_rate=16000, chunk_num=50):
        self.vad = webrtcvad.Vad()
        self.vad_mode = vad_mode
        self.vad.set_mode(vad_mode)
        self.sample_rate = sample_rate
        self.chunk_num = chunk_num
        self.chunk_size = sample_rate // chunk_num  # samples per chunk

    def wave_f32toint16(self, sample):
        """Convert a float32 waveform in [-1, 1] to int16 PCM.

        Scales by 32767 (2**15 - 1), not 32768, so a full-scale 1.0 sample
        does not wrap around to -32768 on the int16 cast.
        """
        return (sample * (2**15 - 1)).numpy().astype(np.int16)

    def get_result(self, path):
        """Load `path` and return (int16 waveform, per-chunk VAD booleans)."""
        return csample(path, self.vad_mode, self.chunk_num, self.vad).get_result()

    def find_startend_point(self, path):
        """Return (first, end) chunk indices bounding the detected speech.

        `first` is the first speech chunk; `end` is the first silent chunk
        after the speech (original convention, preserved). When no speech is
        detected, (0, chunk_num - 1) is returned. Also plots the waveform
        and marks speech chunks on the current matplotlib figure — a side
        effect kept from the original behaviour.
        """
        first = 0
        # Default `end` to the last chunk: the original left it at 0 when
        # speech ran all the way to the final chunk, returning end < first.
        end = self.chunk_num - 1
        found_start = False
        found_end = False

        sample, fs = torchaudio.load(path)
        sample = self.wave_f32toint16(sample)       # float32 -> int16 PCM
        mono = sample[:1, :].T                      # first channel only
        # Use the configured chunk size instead of the hard-coded 320.
        chunks = np.reshape(mono[: self.chunk_size * self.chunk_num],
                            (self.chunk_num, self.chunk_size, 1))
        # webrtcvad expects raw 16-bit mono PCM bytes plus the sample rate.
        vad_result = [self.vad.is_speech(chunk.tobytes(), self.sample_rate)
                      for chunk in chunks]

        plt.plot(range(mono.shape[0]), sample[0])   # was hard-coded 16000
        for idx, result in enumerate(vad_result):
            if result:
                if not found_start:
                    first = idx
                    found_start = True
                # Mark the speech chunk on the plot.
                plt.plot([idx * self.chunk_size, (idx + 1) * self.chunk_size],
                         [1, 1], color='red', linewidth=2)
            elif found_start and not found_end:
                end = idx
                found_end = True
        return first, end
# ---- interactive VAD review: load the first annotated clip and plot it ----
annotations_file = 'data_all.csv'
file_labels = pd.read_csv(annotations_file)

wa = Waveform_Aranger(vad_mode=2)

# The clip's file path lives in column 2 of the annotations table.
sample_ph = file_labels.iloc[0, 2]
sample, vad_result = wa.get_result(sample_ph)

fig, ax = plt.subplots()
ax.set_title(sample_ph)
fig.subplots_adjust(bottom=0.2)
l, = ax.plot(range(16000), sample[0], lw=2)

# One horizontal marker per 20 ms chunk: speech chunks get a visible line
# at y=1, silent chunks get a hidden line parked at y=-1.
k = [0] * 50
for idx, result in enumerate(vad_result):
    level = 1 if result else -1
    k[idx], = ax.plot([idx * 320, (idx + 1) * 320], [level, level],
                      color='red', linewidth=2)
    k[idx].set(antialiased=True, visible=bool(result))
class Index:
    """Button-callback state: steps through the rows of `file_labels` and
    redraws the waveform plot for the selected clip.

    Relies on the module-level `file_labels`, `wa`, `ax`, `l` and `k`
    created by the plotting setup above.
    """

    ind = 0  # current row index into file_labels

    def _show(self):
        """Redraw the plot for the clip at the current row index."""
        # Wrap around at both ends. The original raised an IndexError when
        # stepping past the last row ('prev' from 0 already wrapped via
        # pandas negative iloc; this makes both directions consistent).
        self.ind %= len(file_labels)
        path = file_labels.iloc[self.ind, 2]
        sample, vad_result = wa.get_result(path)
        ax.set_title(path)
        ydata = sample[0]
        l.set_ydata(ydata)
        for idx, result in enumerate(vad_result):
            level = 1 if result else -1
            k[idx].set_ydata([level, level])
            k[idx].set(antialiased=True, visible=bool(result))
        ax.set_ylim(min(ydata), max(ydata))
        plt.draw()

    def next(self, event):
        """'Next' button handler: advance to the following clip."""
        self.ind += 1
        self._show()

    def prev(self, event):
        """'Previous' button handler: go back to the preceding clip."""
        self.ind -= 1
        self._show()
# Wire the Previous/Next buttons (below the plot) to the Index callbacks.
callback = Index()

axprev = fig.add_axes([0.7, 0.05, 0.1, 0.075])
axnext = fig.add_axes([0.81, 0.05, 0.1, 0.075])

bprev = Button(axprev, 'Previous')
bprev.on_clicked(callback.prev)
bnext = Button(axnext, 'Next')
bnext.on_clicked(callback.next)

plt.show()