DFKI-SignLanguage
diff --git a/augmented_data/const.py b/augmented_data/const.py
Lines changed: 9 additions & 0 deletions
diff --git a/augmented_data/dataset.py b/augmented_data/dataset.py
Lines changed: 93 additions & 0 deletions
diff --git a/augmented_data/location.py b/augmented_data/location.py
Lines changed: 108 additions & 0 deletions
diff --git a/augmented_data/platform.py b/augmented_data/platform.py
Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,9 @@
+import os
+
+
+# Root folder whose immediate sub-folders each hold one text annotation file.
+main_folder_text = "annotations_full/annotations"
+# Folder containing the MMS files.
+main_folder_mms = "mms-subset91"
+
+# Sorted paths of all immediate sub-directories of the text folder.
+# NOTE: this is evaluated at import time, so importing this module requires
+# main_folder_text to exist relative to the current working directory.
+sub_folders_text = sorted(
+    entry.path for entry in os.scandir(main_folder_text) if entry.is_dir()
+)
+
+# Name of the annotation file looked up inside each text sub-folder.
+text_file_name = "gebaerdler.Text_Deutsch.annotation~"
@@ -0,0 +1,93 @@
+import os
+import csv
+
+from .const import *
+
+
+def read_dataset_text():
+    """Read every text annotation file into a dict keyed by folder name.
+
+    Walks ``sub_folders_text`` and parses ``text_file_name`` inside each
+    folder. Folders that do not contain that file, and the folder ``0099``,
+    are skipped.
+
+    Returns:
+        dict mapping the folder name to a list of
+        ``(start_time, end_time, sentence, number)`` tuples of strings, one
+        tuple per line of the file, in file order.
+
+    Raises:
+        ValueError: if a line does not consist of exactly four
+            ``;``-separated fields.
+        AssertionError: if the last field of a line is not ``"1"``.
+    """
+    # Some files are encoded with ISO 8859-1, some are UTF-8.
+    # Trying UTF-8 first and falling back to ISO 8859-1 won't work because
+    # some files are valid UTF-8 even though they were encoded with
+    # ISO 8859-1. To work around that, hardcode which files contain UTF-8.
+    utf8_folders = {"0090", "0101", "0102"}
+    # 0099 is just completely broken.
+    broken_folders = {"0099"}
+
+    dataset = {}
+
+    for folder in sub_folders_text:
+        file_path = os.path.join(folder, text_file_name)
+        if not os.path.exists(file_path):
+            continue
+
+        folder_name = os.path.basename(os.path.dirname(file_path))
+
+        # Exact set membership on purpose: ``in ("0099")`` (no trailing
+        # comma) tests against a plain string and would therefore also match
+        # any substring of "0099", such as "00" or "9".
+        if folder_name in broken_folders:
+            continue
+        encoding = 'utf-8' if folder_name in utf8_folders else 'iso-8859-1'
+
+        with open(file_path, 'r', encoding=encoding) as f:
+            dataset[folder_name] = [_parse_text_line(line) for line in f]
+
+    return dataset
+
+
+def _parse_text_line(line):
+    """Split one ``start;end;sentence;number`` line into a 4-tuple."""
+    start_time, end_time, sentence, number = line.strip().split(";")
+    assert number == "1", f"number is {number}"
+    return (start_time, end_time, sentence, number)
+
+
+def read_dataset_mms():
+    """Load every file in ``main_folder_mms`` as a list of CSV row dicts.
+
+    Returns:
+        dict mapping each file name, cut at its last ``.``, to the list of
+        row dicts that ``csv.DictReader`` yields for that file. Files are
+        visited in sorted name order.
+    """
+    mms_by_number = {}
+
+    for entry_name in sorted(os.listdir(main_folder_mms)):
+        entry_path = os.path.join(main_folder_mms, entry_name)
+        number = entry_name.rsplit('.', maxsplit=1)[0]
+
+        # newline='' leaves line-ending handling to the csv module.
+        with open(entry_path, 'r', encoding='utf-8', newline='') as handle:
+            rows = list(csv.DictReader(handle))
+
+        mms_by_number[number] = rows
+
+    return mms_by_number
+
+
+def write_dataset_text(original_dataset, dataset, main_folder):
+    """Write the modified text files of ``dataset`` below ``main_folder``.
+
+    A file is written only when its contents differ from the entry with the
+    same key in ``original_dataset``. Each file goes to
+    ``<main_folder>/<folder name>/<text_file_name>`` as UTF-8, one
+    ``;``-joined row per line.
+
+    Args:
+        original_dataset: dict of folder name -> rows, used for comparison.
+        dataset: dict of folder name -> rows (sequences of strings) to write.
+        main_folder: output root; sub-folders are created as needed.
+
+    Raises:
+        KeyError: if a key of ``dataset`` is missing from ``original_dataset``.
+    """
+    for name, rows in dataset.items():
+        if original_dataset[name] == rows:
+            # Nothing was changed in this file, so there is nothing to emit.
+            continue
+
+        target_dir = os.path.join(main_folder, name)
+        os.makedirs(target_dir, exist_ok=True)
+
+        target_path = os.path.join(target_dir, text_file_name)
+        with open(target_path, 'w', encoding='utf-8') as out:
+            out.writelines(";".join(row) + '\n' for row in rows)
+
+
+def write_dataset_mms(original_dataset, dataset, main_folder):
+    """Write the modified MMS files of ``dataset`` into ``main_folder``.
+
+    A file is written only when its rows differ from the entry with the same
+    key in ``original_dataset``. Output files are named ``<key>.mms`` and
+    contain a CSV header followed by one line per row dict.
+
+    Args:
+        original_dataset: dict of file name -> list of row dicts, used for
+            comparison.
+        dataset: dict of file name -> list of row dicts to write.
+        main_folder: output folder; created if it does not exist.
+
+    Raises:
+        KeyError: if a key of ``dataset`` is missing from ``original_dataset``.
+    """
+    # Column order of the written CSV files.
+    fieldnames = [
+        'maingloss', 'framestart', 'frameend', 'duration', 'transition',
+        'domgloss', 'ndomgloss', 'domreloc', 'ndomreloc',
+        'headpos', 'headmov', 'cheecks', 'nose', 'mouthgest', 'mouthing',
+        'eyegaze', 'eyeaperture', 'eyebrows', 'neck', 'shoulders', 'torso',
+        'domhandrelocx', 'domhandrelocy', 'domhandrelocz',
+        'domhandrelocax', 'domhandrelocay', 'domhandrelocaz',
+        'domhandrelocsx', 'domhandrelocsy', 'domhandrelocsz',
+        'domhandrotx', 'domhandroty', 'domhandrotz',
+        'ndomhandrelocx', 'ndomhandrelocy', 'ndomhandrelocz',
+        'ndomhandrelocax', 'ndomhandrelocay', 'ndomhandrelocaz',
+        'ndomhandrelocsx', 'ndomhandrelocsy', 'ndomhandrelocsz',
+        'ndomhandrotx', 'ndomhandroty', 'ndomhandrotz',
+    ]
+    os.makedirs(main_folder, exist_ok=True)
+
+    for name, rows in dataset.items():
+        if original_dataset[name] == rows:
+            # Unchanged files are not written again.
+            continue
+
+        target_path = os.path.join(main_folder, name + '.mms')
+        # newline='' leaves line-ending handling to the csv module.
+        with open(target_path, 'w', encoding='utf-8', newline='') as out:
+            writer = csv.DictWriter(out, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(rows)
@@ -0,0 +1,108 @@
+# Run with: python -m augmented_data.location
+
+import nltk
+import spacy
+import random
+from .utils import replace_multiple, normalize_text_to_mms
+
+
+# nltk.download('punkt')
+# nltk.download('averaged_perceptron_tagger')
+# nltk.download('maxent_ne_chunker')
+# nltk.download('words')
+
+# spaCy pipeline used for the named-entity recognition in
+# replace_location_entities. Loaded once at import time, so importing this
+# module requires the 'de_core_news_sm' model to be installed.
+nlp = spacy.load('de_core_news_sm')
+
+
+def _strip_hauptbahnhof(location):
+    """Return ``location`` with the word "Hauptbahnhof" removed, if present."""
+    if 'Hauptbahnhof' in location:
+        # Remove "Hauptbahnhof" and strip the spaces it leaves behind.
+        return location.replace('Hauptbahnhof', '').strip()
+    return location
+
+
+def replace_location_entities(dataset_text, dataset_mms):
+    """Swap location names in the text and MMS datasets for random other ones.
+
+    First pass: run spaCy NER over every sentence and collect all entities
+    labelled ``LOC`` (minus a hand-maintained exclude list), both globally and
+    counted per file. Second pass: for every file, map each of its locations
+    to a random *different* location from the global pool, apply that mapping
+    to the sentences via ``replace_multiple`` and to the ``maingloss`` column
+    of the matching MMS rows.
+
+    Args:
+        dataset_text: dict of folder name -> list of
+            ``(start_time, end_time, sentence, number)`` tuples.
+        dataset_mms: dict of folder name -> list of row dicts having a
+            ``'maingloss'`` key. Must contain every key of ``dataset_text``.
+
+    Returns:
+        ``(result_text, result_mms)``: two dicts with the same keys as
+        ``dataset_text``, holding the rewritten text rows and MMS rows.
+        The inputs are not modified (MMS rows are copied before editing).
+
+    Raises:
+        AssertionError: if a file contains a location but fewer than two
+            distinct locations exist in the whole dataset.
+        KeyError: if a folder of ``dataset_text`` is missing in ``dataset_mms``.
+    """
+    # Strings spaCy labels as LOC in this dataset that are not locations.
+    excludes = {'Zuges', 'Alternativen', 'D', 'RE 77', 'A.', 'D.', 'Umsteigen', 'Weiteres',
+                'Notbremse', 'Reservierungen', 'IC 2313', 'Sonderzug', 'Rhein'}
+    # German quotation marks are removed before running NER.
+    quote_table = str.maketrans('', '', "„“")
+
+    # Pass 1: find all locations, globally and counted per file.
+    location_names = set()
+    counts_per_file = {}
+    for folder_name, file_content in dataset_text.items():
+        location_counts = {}
+        for (_start_time, _end_time, sentence, _number) in file_content:
+            doc = nlp(sentence.strip().translate(quote_table))
+            for ent in doc.ents:
+                if ent.label_ != 'LOC' or ent.text in excludes:
+                    continue
+                location = _strip_hauptbahnhof(ent.text)
+                if not location:
+                    # The entity was only "Hauptbahnhof". An empty name must
+                    # neither be replaced nor be offered as a replacement.
+                    continue
+                # Count how often the same location appears in this file.
+                location_counts[location] = location_counts.get(location, 0) + 1
+        location_names.update(location_counts)
+        counts_per_file[folder_name] = location_counts
+
+    # Built once; the pool no longer changes after pass 1.
+    candidates = tuple(location_names)
+
+    result_text = {}
+    result_mms = {}
+
+    # Pass 2: build a per-file mapping and apply it to text and MMS.
+    for folder_name, file_content in dataset_text.items():
+        location_counts = counts_per_file[folder_name]
+
+        mapping = {}
+        for location in location_counts:
+            assert len(location_names) > 1, 'ERROR: only one location found'
+            # Rejection sampling: draw until the new name differs.
+            while True:
+                new_location = random.choice(candidates)
+                if new_location != location:
+                    break
+            mapping[location] = new_location
+
+        new_text_data = []
+        for (start_time, end_time, sentence, number) in file_content:
+            sentence, _ = replace_multiple(sentence, mapping)
+            new_text_data.append((start_time, end_time, sentence, number))
+        result_text[folder_name] = new_text_data
+
+        # The same mapping with keys and values passed through
+        # normalize_text_to_mms, so it can be compared against 'maingloss'.
+        # It does not depend on the row, so build it once per file.
+        mapping_mms = {
+            normalize_text_to_mms(old): normalize_text_to_mms(new)
+            for old, new in mapping.items()
+        }
+
+        replaced_counts = {}
+        new_mms_data = []
+        for row in dataset_mms[folder_name]:
+            new_row = row.copy()
+            word = row['maingloss']
+            if word in mapping_mms:
+                new_row['maingloss'] = mapping_mms[word]
+                replaced_counts[word] = replaced_counts.get(word, 0) + 1
+            new_mms_data.append(new_row)
+        result_mms[folder_name] = new_mms_data
+
+        # Sanity check: every text occurrence should have one MMS replacement.
+        for location, count in location_counts.items():
+            location_mms = normalize_text_to_mms(location)
+            replaced_count = replaced_counts.get(location_mms, 0)
+            if replaced_count != count:
+                print(f'WARNING: replaced_count in file {folder_name} should be {count} but was {replaced_count}, trying to replace {location_mms}')
+
+    return (result_text, result_mms)
+
+
+
+if __name__ == "__main__":
+    # Import only what this script uses instead of a wildcard import, so the
+    # origin of every name below is visible.
+    from .dataset import (
+        read_dataset_mms,
+        read_dataset_text,
+        write_dataset_mms,
+        write_dataset_text,
+    )
+
+    dataset_text = read_dataset_text()
+    dataset_mms = read_dataset_mms()
+
+    result_text, result_mms = replace_location_entities(dataset_text, dataset_mms)
+    # Only files that differ from the originals are written.
+    write_dataset_text(dataset_text, result_text, main_folder='modified/location/text')
+    write_dataset_mms(dataset_mms, result_mms, main_folder='modified/location/mms')
@@ -0,0 +1,101 @@
+# Run with: python -m augmented_data.platform
+
+import re
+import random
+from enum import Enum, verify, UNIQUE
+from collections import Counter
+from .utils import replace_multiple, parse_num
+
+
+@verify(UNIQUE)
+class _State(Enum):
+    """Scanner state while looking for ``GLEIS [WECHSELN] num:<n>`` in MMS rows."""
+    NOT_FOUND = 0
+    GLEIS = 1
+    WECHSELN = 2
+    NUM = 3
+
+
+def replace_platform_entities(dataset_text, dataset_mms):
+    """Swap platform numbers ("Gleis N") in the text and MMS datasets.
+
+    First pass: collect every ``Gleis <digits>`` occurrence in the sentences,
+    globally and counted per file (an optional trailing lowercase letter is
+    matched but not captured). Second pass: for every file, map each of its
+    platforms to a random *different* platform from the global pool, apply the
+    mapping to the sentences via ``replace_multiple``, and rewrite the
+    ``num:`` gloss that follows ``GLEIS`` (optionally ``GLEIS WECHSELN``) in
+    the ``maingloss`` column of the matching MMS rows.
+
+    Args:
+        dataset_text: dict of folder name -> list of
+            ``(start_time, end_time, sentence, number)`` tuples.
+        dataset_mms: dict of folder name -> list of row dicts having a
+            ``'maingloss'`` key. Must contain every key of ``dataset_text``.
+
+    Returns:
+        ``(result_text, result_mms)``: two dicts with the same keys as
+        ``dataset_text``, holding the rewritten text rows and MMS rows.
+        The inputs are not modified (MMS rows are copied before editing).
+
+    Raises:
+        AssertionError: if a file contains a platform but fewer than two
+            distinct platforms exist in the whole dataset.
+        KeyError: if a folder of ``dataset_text`` is missing in ``dataset_mms``.
+    """
+    platform_pattern = re.compile(r'\b(Gleis \d+)[a-z]?\b')
+
+    # Pass 1: find all platforms, globally and counted per file.
+    all_platforms = set()
+    counts_per_file = {}
+    for folder_name, file in dataset_text.items():
+        platforms_per_file = []
+        for (_start_time, _end_time, sentence, _number) in file:
+            platforms_per_file += platform_pattern.findall(sentence)
+
+        all_platforms.update(platforms_per_file)
+        counts_per_file[folder_name] = Counter(platforms_per_file)
+
+    # Built once; the pool no longer changes after pass 1.
+    all_platforms_tuple = tuple(all_platforms)
+
+    result_text = {}
+    result_mms = {}
+    for folder_name, file in dataset_text.items():
+        platform_counts = counts_per_file[folder_name]
+
+        mapping = {}
+        for platform in platform_counts:  # TODO: think about randomly shuffling instead
+            assert len(all_platforms) > 1, 'ERROR: only one platform found'
+            # Rejection sampling: draw until the new platform differs.
+            while True:
+                new_platform = random.choice(all_platforms_tuple)
+                if new_platform != platform:
+                    break
+            mapping[platform] = new_platform
+
+        new_text_data = []
+        for start_time, end_time, sentence, number in file:
+            sentence, _ = replace_multiple(sentence, mapping)
+            new_text_data.append((start_time, end_time, sentence, number))
+        result_text[folder_name] = new_text_data
+
+        replaced_counts = {}
+        new_mms_data = []
+        state = _State.NOT_FOUND
+        for row in dataset_mms[folder_name]:
+            new_row = row.copy()
+            word = row['maingloss']
+            if state == _State.NOT_FOUND:
+                if word == 'GLEIS':
+                    state = _State.GLEIS
+            elif state == _State.GLEIS or state == _State.WECHSELN:
+                if state == _State.GLEIS and word == 'WECHSELN':
+                    state = _State.WECHSELN
+                elif word.startswith('num:'):
+                    state = _State.NUM
+                else:
+                    print(f'WARNING: expected WECHSELN or num:, got {word} in file {folder_name}')
+                    state = _State.NOT_FOUND
+
+            # NUM is handled in the same iteration that detected it, so the
+            # row being rewritten is the 'num:' row itself.
+            if state == _State.NUM:
+                num = parse_num(word, folder_name)
+                old_gleis = f'Gleis {num}'
+                new_gleis = mapping.get(old_gleis)
+                if new_gleis is None:
+                    # The MMS names a platform the text pattern never matched
+                    # in this file. The text keeps it too, so leave the row
+                    # untouched and report it like the other mismatches.
+                    print(f'WARNING: {old_gleis} in file {folder_name} has no replacement, leaving it unchanged')
+                else:
+                    print(f'Found {old_gleis} in file {folder_name}, replacing with {new_gleis}')
+                    new_num = new_gleis.removeprefix('Gleis ')
+                    new_row['maingloss'] = 'num:' + new_num
+                    replaced_counts[old_gleis] = replaced_counts.get(old_gleis, 0) + 1
+                state = _State.NOT_FOUND
+
+            new_mms_data.append(new_row)
+        result_mms[folder_name] = new_mms_data
+
+        # Sanity check: every text occurrence should have one MMS replacement.
+        for platform, count in platform_counts.items():
+            replaced_count = replaced_counts.get(platform, 0)
+            if replaced_count != count:
+                print(f'WARNING: replaced_count in file {folder_name} should be {count} but was {replaced_count}, trying to replace {platform}')
+
+    return (result_text, result_mms)
+
+
+
+if __name__ == "__main__":
+    # Import only what this script uses instead of a wildcard import, so the
+    # origin of every name below is visible.
+    from .dataset import (
+        read_dataset_mms,
+        read_dataset_text,
+        write_dataset_mms,
+        write_dataset_text,
+    )
+
+    dataset_text = read_dataset_text()
+    dataset_mms = read_dataset_mms()
+
+    result_text, result_mms = replace_platform_entities(dataset_text, dataset_mms)
+    # Only files that differ from the originals are written.
+    write_dataset_text(dataset_text, result_text, main_folder='modified/platform/text')
+    write_dataset_mms(dataset_mms, result_mms, main_folder='modified/platform/mms')