Skip to content

Commit de81a44

Browse files
committed
Removing duplicate data
1 parent 4b39cda commit de81a44

File tree

5 files changed

+16
-10
lines changed

5 files changed

+16
-10
lines changed

augmented_data/dataset.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -63,8 +63,11 @@ def read_dataset_mms():
6363
return dataset
6464

6565

66-
def write_dataset_text(dataset, main_folder):
66+
def write_dataset_text(original_dataset, dataset, main_folder):
6767
for folder_name, file_contents in dataset.items():
68+
original_file_contents = original_dataset[folder_name]
69+
if original_file_contents == file_contents:
70+
continue
6871
file_folder = os.path.join(main_folder, folder_name)
6972
os.makedirs(file_folder, exist_ok = True)
7073
file_path = os.path.join(file_folder, text_file_name)
@@ -74,11 +77,14 @@ def write_dataset_text(dataset, main_folder):
7477
f.write(text_line + '\n')
7578

7679

77-
def write_dataset_mms(dataset, main_folder):
80+
def write_dataset_mms(original_dataset, dataset, main_folder):
7881
fieldnames = ['maingloss', 'framestart', 'frameend', 'duration', 'transition', 'domgloss', 'ndomgloss', 'domreloc', 'ndomreloc', 'headpos', 'headmov', 'cheecks', 'nose', 'mouthgest', 'mouthing', 'eyegaze', 'eyeaperture', 'eyebrows', 'neck', 'shoulders', 'torso', 'domhandrelocx', 'domhandrelocy', 'domhandrelocz', 'domhandrelocax', 'domhandrelocay', 'domhandrelocaz', 'domhandrelocsx', 'domhandrelocsy', 'domhandrelocsz', 'domhandrotx', 'domhandroty', 'domhandrotz', 'ndomhandrelocx', 'ndomhandrelocy', 'ndomhandrelocz', 'ndomhandrelocax', 'ndomhandrelocay', 'ndomhandrelocaz', 'ndomhandrelocsx', 'ndomhandrelocsy', 'ndomhandrelocsz', 'ndomhandrotx', 'ndomhandroty', 'ndomhandrotz']
7982
os.makedirs(main_folder, exist_ok = True)
8083

8184
for file_name, file_contents in dataset.items():
85+
original_file_contents = original_dataset[file_name]
86+
if original_file_contents == file_contents:
87+
continue
8288
file_path = os.path.join(main_folder, file_name + '.mms')
8389
with open(file_path, 'w', encoding='utf-8', newline='') as f:
8490
writer = csv.DictWriter(f, fieldnames=fieldnames)

augmented_data/location.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -104,5 +104,5 @@ def replace_location_entities(dataset_text, dataset_mms):
104104
dataset_mms = read_dataset_mms()
105105

106106
result_text, result_mms = replace_location_entities(dataset_text, dataset_mms)
107-
write_dataset_text(result_text, main_folder = 'modified/location/text')
108-
write_dataset_mms(result_mms, main_folder = 'modified/location/mms')
107+
write_dataset_text(dataset_text, result_text, main_folder = 'modified/location/text')
108+
write_dataset_mms(dataset_mms, result_mms, main_folder = 'modified/location/mms')

augmented_data/platform.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -97,5 +97,5 @@ class State(Enum):
9797
dataset_mms = read_dataset_mms()
9898

9999
result_text, result_mms = replace_platform_entities(dataset_text, dataset_mms)
100-
write_dataset_text(result_text, main_folder = 'modified/platform/text')
101-
write_dataset_mms(result_mms, main_folder = 'modified/platform/mms')
100+
write_dataset_text(dataset_text, result_text, main_folder = 'modified/platform/text')
101+
write_dataset_mms(dataset_mms, result_mms, main_folder = 'modified/platform/mms')

augmented_data/time.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -338,5 +338,5 @@ def process_time(state, time_data, time_positions):
338338
dataset_mms = read_dataset_mms()
339339

340340
result_text, result_mms = replace_time_entities(dataset_text, dataset_mms)
341-
write_dataset_text(result_text, main_folder = 'modified/time/text')
342-
write_dataset_mms(result_mms, main_folder = 'modified/time/mms')
341+
write_dataset_text(dataset_text, result_text, main_folder = 'modified/time/text')
342+
write_dataset_mms(dataset_mms, result_mms, main_folder = 'modified/time/mms')

augmented_data/train_name.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -356,5 +356,5 @@ def process_train(state, train_data, train_positions):
356356
dataset_mms = read_dataset_mms()
357357

358358
result_text, result_mms = replace_train_entities(dataset_text, dataset_mms)
359-
write_dataset_text(result_text, main_folder = 'modified/train_name/text')
360-
write_dataset_mms(result_mms, main_folder = 'modified/train_name/mms')
359+
write_dataset_text(dataset_text, result_text, main_folder = 'modified/train_name/text')
360+
write_dataset_mms(dataset_mms, result_mms, main_folder = 'modified/train_name/mms')

0 commit comments

Comments
 (0)