diff --git a/README.md b/README.md
index e2f4128..7188682 100644
--- a/README.md
+++ b/README.md
@@ -201,6 +201,15 @@ A new metadata file is created with the following format:
         }
     }
 }
 ```
+
+### Get Word Occurrence in Dialogue
+
+Run
+```
+python get_occurances.py
+```
+
+This writes one JSON file of word counts per dialogue file to `scripts/occurances/`.
 
 ## Directory structure
diff --git a/get_all_titles.py b/get_all_titles.py
new file mode 100644
index 0000000..e767775
--- /dev/null
+++ b/get_all_titles.py
@@ -0,0 +1,17 @@
+"""Write all_titles.json: each metadata entry's name -> dashed '.json' name."""
+import os
+import json
+from tqdm import tqdm
+
+# Load the metadata and close the input file before any output is written.
+with open(os.path.join(os.getcwd(), 'scripts/metadata/clean_parsed_meta.json'), 'r', encoding='utf-8') as f:
+    data = json.load(f)
+
+# Only the entry values are used; the top-level keys are ignored.
+titles = {}
+for movie in data.values():
+    name = movie['file']['name']
+    titles[name] = name.replace(' ', '-') + '.json'
+
+with open('all_titles.json', 'w', encoding='utf-8') as json_file:
+    json.dump(titles, json_file, indent=4, separators=(',', ':'))
diff --git a/get_occurances.py b/get_occurances.py
new file mode 100644
index 0000000..c18e2a4
--- /dev/null
+++ b/get_occurances.py
@@ -0,0 +1,52 @@
+"""Count how often each word occurs in every parsed dialogue file.
+
+For each scripts/parsed/dialogue/*.txt file, a JSON object mapping word to
+count (ordered by ascending count) is written to scripts/occurances/.
+"""
+import os
+import json
+import glob
+from collections import Counter
+from pathlib import Path
+from tqdm import tqdm
+
+WRITE_DIR = 'scripts/occurances/'
+# Raw string: the set contains a literal backslash, and '\,' is not a valid
+# escape sequence in a normal string literal.
+PUNCTUATION = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+# Translation table that deletes every punctuation character in one pass.
+STRIP_PUNCTUATION = str.maketrans('', '', PUNCTUATION)
+
+
+def count_words(lines):
+    """Return a Counter of the words in the dialogue part of *lines*.
+
+    Each line is expected to hold a prefix and the dialogue separated by the
+    first '>'. Lines without a '>' are skipped instead of raising
+    IndexError. Words are lowercased and stripped of punctuation; tokens
+    that consist only of punctuation are not counted.
+    """
+    counts = Counter()
+    for line in lines:
+        _, separator, dialogue = line.partition('>')
+        if not separator:
+            continue
+        for token in dialogue.lower().split():
+            word = token.translate(STRIP_PUNCTUATION)
+            # Check after stripping: a token such as '--' becomes empty.
+            if word:
+                counts[word] += 1
+    return counts
+
+
+# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
+os.makedirs(WRITE_DIR, exist_ok=True)
+
+for filename in tqdm(glob.glob('scripts/parsed/dialogue/*.txt')):
+    with open(os.path.join(os.getcwd(), filename), 'r', encoding='utf-8') as f:
+        words = count_words(f)
+    # Ascending by count; the sort is stable, so ties keep first-seen order.
+    sorted_words = dict(sorted(words.items(), key=lambda item: item[1]))
+    clean_filename = Path(filename).stem.replace('_dialogue', '')
+    with open(WRITE_DIR + clean_filename + '.json', 'w', encoding='utf-8') as json_file:
+        json.dump(sorted_words, json_file, indent=4, separators=(',', ':'))