#!/bin/bash
# Synthio/run.sh (main branch of the Sreyan88/Synthio repository on GitHub)
# set -x

# Hugging Face settings. A token already exported by the caller is kept;
# 'None' is only the fallback placeholder.
export HF_TOKEN="${HF_TOKEN:-None}"
# NOTE(review): relative path - the script changes directory before launching
# some of its jobs, so confirm the cache ends up where you expect.
export HF_HOME=./cache

# Dataset name (also used to derive the output paths below) and the
# train/validation/test CSVs.
dataset_name="tut_urban"
input_train_file="./dataset_csvs/tut_train.csv"
valid_csv="./dataset_csvs/val.csv"
test_csv="./dataset_csvs/test.csv"
# passed to generate_captions_gpt.py as --domain
domain="environmental_sounds"


# Output locations; all three directories are created by this script if missing.
output_folder="./${dataset_name}_synthetic/"
output_folder_supcon="./${dataset_name}_synthetic_supcon/"
num_iters=2 # number of augs for each sample to be generated
num_samples=200 #number of samples in the low-resource split
# passed through to generate_augs_audio.sh
init_noise_level=80.0
initialize_audio=False
output_csv_path="./${dataset_name}/"
# passed to the filter scripts as --clap_threshold
clap_threshold="0.85"
# True enables the supervised-contrastive caption/audio/training path
supcon=False
multi_label=False
# captioning arguments
use_label=True
plain_caption=False
plain_wo_caption=False
# iterative arguments
iterative=False
# encoder params
# True trains AST via run_esc*.sh; anything else runs run_linear_probe.sh
use_ast=True
# NOTE(review): clap_full_ft is not referenced anywhere else in this script.
clap_full_ft=False
# filter params
clap_filter=True
# True fine-tunes CLAP first and filters with filter_audios_sonal.py
filter_w_finetune=False
full_finetune_clap="False"
clap_exp_name="aug_clap"
#run arguments
# True runs the augmentation pipeline; otherwise AST is trained on the
# stratified split only
augment=True
# True re-runs generation steps even when their output files already exist
force_steps=True
# True trains on <dataset>_merged_synthetic.csv instead of <dataset>_merged.csv
only_synthetic=True
# dpo parameters
dpo=True
use_dpo=True
dpo_ckpt_folder="./stable-audio-tools/" #path to save DPO checkpoint

#######################################
# Make sure a directory exists, creating it (and any missing parents) if needed.
# Arguments: $1 - directory path
# Outputs:   progress messages on stdout; the failure message on stderr
# Returns:   0 if the directory exists afterwards, 1 if mkdir failed
#######################################
ensure_dir() {
  local dir=$1
  if [ -d "$dir" ]; then
    echo "Directory $dir already exists."
    return 0
  fi
  echo "Directory $dir does not exist. Creating now..."
  if mkdir -p "$dir"; then
    echo "Directory $dir created successfully."
  else
    echo "Failed to create directory $dir." >&2
    return 1
  fi
}

# Directories for synthetic audios, output CSVs and supervised contrastive
# audios. As before, a failure is reported but does not stop the script.
ensure_dir "$output_folder"
ensure_dir "$output_csv_path"
ensure_dir "$output_folder_supcon"

# Count GPUs: the query prints the name of each GPU on its own line (no CSV
# header), so the line count is the number of devices. The rest of the script
# uses this count as the number of CSV shards and parallel jobs.
# NOTE(review): if nvidia-smi is missing or prints nothing, gpu_count is 0 and
# every per-GPU loop below silently runs zero times.
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

# Placeholder: replace with the path to your own conda installation.
source /your/conda/path/miniconda3/bin/activate

# Load conda's bash hook so that `conda activate` works in this script, then
# switch to the stable_audio environment used by the steps that follow.
eval "$(conda shell.bash hook)"
conda activate stable_audio

# Stratify the dataset: the low-resource split is written next to the full
# training CSV as <name>_<num_samples>.csv and reused when it already exists.
stratified_train_file="${input_train_file%.csv}_${num_samples}.csv"
if [ ! -f "$stratified_train_file" ]; then
    python stratify_dataset.py --input_csv "$input_train_file" --num_samples "$num_samples" --output_csv "$stratified_train_file" --dataset_name "$dataset_name" --multi_label "$multi_label"
fi

# store before changing
orig_input_train_file=$input_train_file
# assign input_train to new file
input_train_file=$stratified_train_file

if [ "$augment" = True ]; then

    # Split the low-resource CSV into one shard per GPU.
    python split_csvs.py --input_csv "$input_train_file" --num "$gpu_count"

    # Build both lookup arrays in one pass: files[i] is the path expected for
    # shard i (<input>_<i>.csv) and gpus[i] is the GPU index that processes it.
    files=()
    gpus=()
    for i in $(seq 0 $((gpu_count - 1))); do
        files+=("${input_train_file%.csv}_$i.csv")
        gpus+=("$i")
    done

    # Generate captions for every instance.
    # Runs only when neither label-only prompts nor caption-free prompts are
    # requested; each shard CSV is rewritten in place by its own GAMA job.
    if [ "$use_label" = False ] && [ "$plain_wo_caption" = False ]; then
        eval "$(conda shell.bash hook)"
        conda activate gama
        # Abort if the directory is missing: otherwise the `cd ../` below would
        # leave the script one level above the project root.
        cd ./GAMA/ || { echo "Cannot enter ./GAMA/; aborting." >&2; exit 1; }
        cp gama_csv_inf.py GAMA/
        # NOTE(review): the shard paths are relative to the project root but the
        # jobs start inside ./GAMA/ - confirm gama_csv_inf.py resolves them.
        for i in "${!files[@]}"; do
            CUDA_VISIBLE_DEVICES="${gpus[$i]}" python gama_csv_inf.py --input_csv "${files[$i]}" --output_csv "${files[$i]}" &
        done
        cd ../
        eval "$(conda shell.bash hook)"
        conda activate stable_audio
    fi

    # Barrier: let all background caption jobs finish.
    wait

    # finetune the model using DPO
    if [ "$dpo" = True ]; then
        # Step 1: generate the audio used for DPO training, one job per shard.
        if [ ! -f "$output_csv_path/${dataset_name}_dpo_merged.csv" ] || [ "$force_steps" = True ]; then
            # Abort on a missing directory; otherwise the `cd ../` below would
            # move the script above the project root.
            cd stable-audio-tools || { echo "Cannot enter stable-audio-tools; aborting." >&2; exit 1; }
            for i in "${!files[@]}"; do
                # hard coded to always use label (Sound of a X) for DPO training -- 4th arg is "True"
                CUDA_VISIBLE_DEVICES="${gpus[$i]}" sh generate_augs_audio.sh \
                    "${files[$i]}" 2 "$output_folder" "True" "$dataset_name" \
                    "$output_csv_path" "$i" "$init_noise_level" "False" "$dpo" \
                    "False" "None" "False" &
            done
            wait
            cd ../
        fi
        dpo=False #set dpo back to false for augmentation generation
        # Step 2: merge the per-shard DPO CSVs.
        python merge_csv.py --output_csv_path "$output_csv_path" --dataset_name "$dataset_name" --num "$gpu_count" --clap_filter "False" --dpo "True"
        # Step 3: fine-tune unless the checkpoint already exists.
        if [ ! -f "${dpo_ckpt_folder}${dataset_name}_${num_samples}.safetensors" ] || [ "$force_steps" = True ]; then
            cd stable-audio-tools || { echo "Cannot enter stable-audio-tools; aborting." >&2; exit 1; }
            # finetune.sh runs in the foreground, so no wait is needed after it.
            sh finetune.sh "$output_csv_path/${dataset_name}_dpo_merged.csv" "$dataset_name" "$num_samples"
            cd ../
        fi
    fi

    wait

    # Supervised-contrastive data: new GPT captions, then audio for them.
    if [ "$supcon" = True ]; then
        # first generate new captions using GPT for supervised contrastive
        for i in "${!files[@]}"; do
            echo "${files[$i]}"
            python generate_captions_gpt.py --input_csv "${files[$i]}" --plain_caption "$plain_caption" --domain "$domain" --supcon "True" &
        done
        # The caption jobs run in the background; they must finish before the
        # audio generation and merge below read the same shard CSVs.
        wait
        supcon_csv="$output_csv_path/${dataset_name}_supcon_merged.csv"
        if [ ! -f "$supcon_csv" ]; then
            # next generate audios for the new captions
            # Abort on a missing directory; otherwise the `cd ../` below would
            # move the script above the project root.
            cd stable-audio-tools || { echo "Cannot enter stable-audio-tools; aborting." >&2; exit 1; }
            for i in "${!files[@]}"; do
                # hard code $use_dpo to True and $dpo to False
                CUDA_VISIBLE_DEVICES="${gpus[$i]}" sh generate_augs_audio.sh \
                    "${files[$i]}" 4 "$output_folder_supcon" "False" "$dataset_name" \
                    "$output_csv_path" "$i" "$init_noise_level" "$initialize_audio" \
                    "False" "True" \
                    "${dpo_ckpt_folder}${dataset_name}_${num_samples}.safetensors" "True" &
            done
            wait
            cd ../
        fi
        python merge_csv.py --output_csv_path "$output_csv_path" --dataset_name "$dataset_name" --num "$gpu_count" --clap_filter "False" --dpo "False" --supcon "True"
    fi

    # generate new captions using GPT
    # One background job per shard; skipped when prompts are built from labels.
    if [ "$use_label" = False ]; then
        for i in "${!files[@]}"; do
            echo "Generating GPT Captions"
            echo "${files[$i]}"
            python generate_captions_gpt.py --input_csv "${files[$i]}" --plain_caption "$plain_caption" --domain "$domain" --supcon "False" --plain_wo_caption "$plain_wo_caption" --multi_label "$multi_label" &
        done
    fi

    # Barrier: all caption jobs must be done before audio generation starts.
    wait

    # generate final augmentations
    # Skipped when the merged CSV already exists, unless force_steps is True.
    if [ ! -f "$output_csv_path/${dataset_name}_merged.csv" ] || [ "$force_steps" = True ]; then
        # Abort on a missing directory; otherwise the `cd ../` below would move
        # the script above the project root.
        cd stable-audio-tools || { echo "Cannot enter stable-audio-tools; aborting." >&2; exit 1; }
        for i in "${!files[@]}"; do
            CUDA_VISIBLE_DEVICES="${gpus[$i]}" sh generate_augs_audio.sh \
                "${files[$i]}" "$num_iters" "$output_folder" "$use_label" "$dataset_name" \
                "$output_csv_path" "$i" "$init_noise_level" "$initialize_audio" \
                "False" "$use_dpo" \
                "${dpo_ckpt_folder}${dataset_name}_${num_samples}.safetensors" "False" &
        done
        wait
        cd ../
    fi

    # do clap filter
    if [ "$clap_filter" = True ]; then
        if [ "$filter_w_finetune" = True ]; then
            # Remember where we started so we can come back afterwards instead
            # of jumping to a machine-specific absolute path.
            clap_return_dir=$PWD
            # Root of the CLAP training logs/checkpoints. The default is the
            # original location; set clap_log_dir beforehand to override it.
            clap_log_dir="${clap_log_dir:-/fs/gamma-projects/audio/clap_logs}"
            eval "$(conda shell.bash hook)"
            # input your anaconda path
            source /your/conda/path/anaconda3/bin/activate
            conda activate clap
            cd ./CLAP/src/laion_clap/ || { echo "Cannot enter ./CLAP/src/laion_clap/; aborting." >&2; exit 1; }
            echo "Training Dataset: $input_train_file"
            echo "Validation Dataset: $valid_csv"
            # NOTE(review): the CSV paths are relative to the project root but
            # are used from inside ./CLAP/src/laion_clap/ - confirm the
            # scripts resolve them.
            # Fine-tune CLAP (foreground), then filter each shard with the
            # latest checkpoint.
            sh htsat-roberta-large-dataset-fusion.sh "$clap_exp_name" "$input_train_file" "$valid_csv" "$full_finetune_clap"
            for i in "${!files[@]}"; do
                echo "Filtering"
                CUDA_VISIBLE_DEVICES="${gpus[$i]}" python filter_audios_sonal.py --model_path "${clap_log_dir}/${clap_exp_name}/checkpoints/epoch_latest.pt" --input_csv_path "$output_csv_path" --clap_threshold "$clap_threshold" --dataset_name "$dataset_name" --iter "$i" --use_label "$use_label" &
            done
            cd "$clap_return_dir" || { echo "Cannot return to $clap_return_dir; aborting." >&2; exit 1; }
            eval "$(conda shell.bash hook)"
            # input your conda path (same installation as at the top of the script)
            source /your/conda/path/miniconda3/bin/activate
            conda activate stable_audio
            wait
        else
            # Filter with the msclap environment, one job per shard.
            eval "$(conda shell.bash hook)"
            conda activate msclap
            for i in "${!files[@]}"; do
                echo "Filtering"
                CUDA_VISIBLE_DEVICES="${gpus[$i]}" python filter_audios.py --input_csv_path "$output_csv_path" --clap_threshold "$clap_threshold" --dataset_name "$dataset_name" --iter "$i" --use_label "$use_label" &
            done
            eval "$(conda shell.bash hook)"
            conda activate stable_audio
            wait
        fi
    fi

    # merge the training CSVs
    python merge_csv.py --output_csv_path "$output_csv_path" --dataset_name "$dataset_name" --num "$gpu_count" --clap_filter "$clap_filter"

    # iterative refinement (optional)
    # Each round merges the "filteredout" CSVs (presumably the samples the
    # filter rejected), rewrites their captions, regenerates audio from the
    # resulting .iter.jsonl and filters again.
    if [ "$iterative" = True ]; then
        echo "Entering iterative refinement"
        # The counter was never initialised, so the loop test failed and the
        # loop never ran. Starting at 0 gives three rounds (0, 1, 2).
        # NOTE(review): confirm the intended number of rounds.
        counter=0
        while [ "$counter" -le 2 ]; do
            python merge_csv.py --output_csv_path "$output_csv_path" --dataset_name "$dataset_name" --num "$gpu_count" --clap_filter "False" --filteredout "True"
            # A leftover `exit 0` used to sit here; it ended the whole script
            # and made the remaining steps of the loop unreachable.
            python generate_captions_gpt.py --input_csv "$output_csv_path/${dataset_name}_filteredout_merged.csv" --plain_caption "False" --domain "$domain" --supcon "False" --plain_wo_caption "False" --multi_label "False" --iteration_stage "True"
            cd stable-audio-tools || { echo "Cannot enter stable-audio-tools; aborting." >&2; exit 1; }
            CUDA_VISIBLE_DEVICES=0 sh generate_augs_audio_extra.sh "$output_csv_path/${dataset_name}.iter.jsonl"
            cd ../
            # NOTE(review): $i is whatever value the last per-GPU loop left
            # behind, and no msclap environment is activated here as it is for
            # the regular filter step - confirm both are intended.
            CUDA_VISIBLE_DEVICES=0 python filter_audios.py --input_csv_path "$output_csv_path" --clap_threshold "$clap_threshold" --dataset_name "$dataset_name" --iter "$i" --use_label "$use_label"
            counter=$((counter + 1))
        done
    fi

    # generate the label map for AST training
    # Written below ./ast/egs/esc50/data/, the same relative location the
    # non-augmented branch uses, instead of a machine-specific absolute path.
    python generate_label_map.py --dataset_name "$dataset_name" --input_csv "$output_csv_path/${dataset_name}_merged.csv" --output_json "./ast/egs/esc50/data/${dataset_name}.json" --output_csv_path ./ast/egs/esc50/data/

    # Run the Python script and capture its output
    output=$(python get_mean_std_length.py --dataset_name "$dataset_name" --input_csv "$output_csv_path/${dataset_name}_merged.csv")

    # Parse the output to extract the values
    # NOTE(review): eval executes whatever the helper printed; it is expected
    # to print shell assignments for dataset_mean, dataset_std and
    # average_audio_length.
    eval "$output"

    # Use the extracted values in subsequent commands
    echo "Dataset Mean: $dataset_mean"
    echo "Dataset Std: $dataset_std"
    echo "Average Audio Length: $average_audio_length"

    eval "$(conda shell.bash hook)"
    conda activate ast

    # Remember the current directory: the linear-probe branch below has to
    # come back here. The previous `cd /../../../aaai_2025/` started with a
    # slash, so it was an absolute path that resolves to /aaai_2025/.
    ast_return_dir=$PWD
    cd ./ast/egs/esc50/

    # hard code both for now
    # (these override the values printed by get_mean_std_length.py)
    dataset_mean=-4.2677393
    dataset_std=4.5689974
    # audio_length=
    # NOTE(review): this path and the CSV paths are relative, yet they are
    # passed on after changing into ./ast/egs/esc50/ - confirm run_esc*.sh
    # resolves them.
    label_map="./ast/egs/esc50/data/${dataset_name}.json"

    # Train on synthetic samples only, or on the full merged CSV.
    if [ "$only_synthetic" = True ]; then
        train_csv="$output_csv_path/${dataset_name}_merged_synthetic.csv"
    else
        train_csv="$output_csv_path/${dataset_name}_merged.csv"
    fi

    fold_wise_eval="False"

    # Arguments are quoted so that an empty value (for example an unset
    # average_audio_length) cannot shift the positional parameters.
    if [ "$use_ast" = True ]; then
        # finally train your AST
        if [ "$supcon" = False ]; then
            sh run_esc.sh "$dataset_name" "$dataset_mean" "$dataset_std" "$average_audio_length" "$label_map" "$train_csv" "$valid_csv" "$test_csv" "$fold_wise_eval"
        else
            sh run_esc_supcon.sh "$dataset_name" "$dataset_mean" "$dataset_std" "$average_audio_length" "$label_map" "$train_csv" "$valid_csv" "$test_csv" "$fold_wise_eval" "$supcon_csv"
        fi
    else
        conda activate clap
        cd "$ast_return_dir"
        sh run_linear_probe.sh "$train_csv" "$valid_csv" "$test_csv"
        #Sonal enter code
    fi

else
    # No augmentation: train AST directly on the stratified low-resource split.
    train_csv=$input_train_file

    # generate the label map for AST training
    python generate_label_map.py --dataset_name "$dataset_name" --input_csv "$train_csv" --output_json "./ast/egs/esc50/data/${dataset_name}.json" --output_csv_path ./ast/egs/esc50/data/

    output=$(python get_mean_std_length.py --dataset_name "$dataset_name" --input_csv "$train_csv")

    # Parse the output to extract the values
    # NOTE(review): eval executes whatever the helper printed; it is expected
    # to print shell assignments for dataset_mean, dataset_std and
    # average_audio_length.
    eval "$output"

    # Use the extracted values in subsequent commands
    echo "Dataset Mean: $dataset_mean"
    echo "Dataset Std: $dataset_std"
    echo "Average Audio Length: $average_audio_length"

    eval "$(conda shell.bash hook)"
    conda activate ast

    cd ./ast/egs/esc50/

    # Hard-coded statistics; these override the values parsed above.
    dataset_mean=-4.2677393
    dataset_std=4.5689974
    # audio_length=
    # NOTE(review): this path and the CSV paths are relative, yet they are
    # passed on after changing into ./ast/egs/esc50/ - confirm run_esc.sh
    # resolves them.
    label_map="./ast/egs/esc50/data/${dataset_name}.json"

    fold_wise_eval="False"

    # Arguments are quoted so that an empty value (for example an unset
    # average_audio_length) cannot shift the positional parameters.
    sh run_esc.sh "$dataset_name" "$dataset_mean" "$dataset_std" "$average_audio_length" "$label_map" "$train_csv" "$valid_csv" "$test_csv" "$fold_wise_eval"
fi