# Dialecto_CLSC_API/dataset.py (main branch of abnas7511/Dialecto_CLSC_API on GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import pandas as pd
from datasets import Dataset
import soundfile as sf
import os
import json

def prepare_dataset(csv_path, audio_folder, output_folder):
    """Cut labelled audio segments out of source files and build a dataset.

    Each CSV row must provide ``filename`` (audio file relative to
    *audio_folder*), ``start_time`` and ``end_time`` (offsets in seconds)
    and ``label`` (stored as the segment's text). Every segment is written
    as its own WAV file into *output_folder*, and the resulting
    ``audio``/``text`` pairs are also dumped to ``dataset.json`` there.

    Rows whose audio file is missing, or whose segment cannot be read or
    written, are reported on stdout and skipped.

    Args:
        csv_path: Path to a single CSV file, or to a directory whose
            ``*.csv`` files are concatenated (in sorted filename order).
        audio_folder: Directory containing the source audio files.
        output_folder: Directory for the segment files and ``dataset.json``;
            created if it does not exist.

    Returns:
        A ``datasets.Dataset`` with one row per extracted segment, holding
        the segment path under ``audio`` and the label under ``text``.

    Raises:
        ValueError: If *csv_path* is a directory containing no CSV files.
    """
    os.makedirs(output_folder, exist_ok=True)

    if os.path.isfile(csv_path):
        data = pd.read_csv(csv_path, encoding='utf-8')
    else:
        # Sort so the row order does not depend on the filesystem's
        # arbitrary directory listing order.
        csv_files = sorted(f for f in os.listdir(csv_path) if f.endswith('.csv'))
        if not csv_files:
            # pd.concat would fail with an opaque "No objects to
            # concatenate"; name the offending directory instead.
            raise ValueError(f"No CSV files found in {csv_path}")
        data = pd.concat(
            [pd.read_csv(os.path.join(csv_path, name), encoding='utf-8')
             for name in csv_files],
            ignore_index=True,
        )

    audio_data = []

    for _, row in data.iterrows():
        audio_path = os.path.join(audio_folder, row['filename'])

        if not os.path.exists(audio_path):
            print(f"Skipping missing audio file: {audio_path}")
            continue

        try:
            # sf.read takes start/stop in frames, so convert the offsets
            # with the file's real sample rate rather than assuming 16 kHz.
            source_rate = sf.info(audio_path).samplerate
            start_sample = int(row['start_time'] * source_rate)
            end_sample = int(row['end_time'] * source_rate)
            audio_segment, sample_rate = sf.read(
                audio_path, start=start_sample, stop=end_sample
            )

            segment_path = os.path.join(
                output_folder, f"{row['filename']}_{start_sample}_{end_sample}.wav"
            )
            # 'filename' may include sub-directories; make sure they exist.
            os.makedirs(os.path.dirname(segment_path), exist_ok=True)
            sf.write(segment_path, audio_segment, sample_rate)
        except Exception as e:
            # Best effort: one bad row must not abort the whole run.
            print(f"Error processing {audio_path}: {str(e)}")
            continue

        audio_data.append({
            'audio': segment_path,
            'text': row['label']
        })

    dataset = Dataset.from_pandas(pd.DataFrame(audio_data))

    # Save dataset as JSON. Iterating the Dataset (rather than audio_data)
    # yields plain Python values that json can serialise.
    json_output = [{
        'audio': item['audio'],
        'text': item['text']
    } for item in dataset]

    json_path = os.path.join(output_folder, 'dataset.json')
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_output, f, indent=4, ensure_ascii=False)

    return dataset

# Example usage
# NOTE(review): these statements run whenever the module is imported, not
# only when it is executed as a script; consider moving them under an
# `if __name__ == "__main__":` guard once no importer relies on the names.
csv_path = 'data'  # folder containing CSV files
audio_folder = 'input_folder'
output_folder = 'output_folder'

train_dataset = prepare_dataset(csv_path, audio_folder, output_folder)

print("Dataset statistics:")
print(train_dataset)

print("\nFirst few entries:")
for i, data in enumerate(train_dataset):
    if i >= 5:  # Only the first 5 entries are shown; stop instead of scanning the rest
        break
    print(data)