From 959920295fee5027ee6c090c29d322e61037891c Mon Sep 17 00:00:00 2001 From: copoer Date: Sat, 18 Sep 2021 02:13:49 +0100 Subject: [PATCH 1/4] Added script to get number of word occurrences --- get_occurances.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 get_occurances.py diff --git a/get_occurances.py b/get_occurances.py new file mode 100644 index 0000000..80ef212 --- /dev/null +++ b/get_occurances.py @@ -0,0 +1,28 @@ +import os +import json +import glob +from pathlib import Path +from tqdm import tqdm + +WRITE_DIR = 'scripts/occurances/' +if not os.path.exists(WRITE_DIR): + os.makedirs(WRITE_DIR) + +for filename in tqdm(glob.glob('scripts/parsed/dialogue/*.txt')): + with open(os.path.join(os.getcwd(), filename), 'r') as f: + words = dict() + for line in f: + dialogue = line.split(">",1)[1].lower() + for word in dialogue.split(" "): + if (word): + list_of_chars = ['(', ')', '.', '\n', '\"', '!', '?'] + for char in list_of_chars: + word = word.replace(char,'') + if (word in words): + words[word] += 1 + else: + words[word] = 1 + words = dict(sorted(words.items(), key=lambda item: item[1])) + clean_filename = Path(filename).stem + with open(WRITE_DIR+clean_filename+'.json', 'w') as json_file: + json.dump(words, json_file, indent=4, sort_keys=True, separators=(',',':')) From b00f83a412d978d1e38fe5a0402fc9cb2f590559 Mon Sep 17 00:00:00 2001 From: copoer Date: Sat, 18 Sep 2021 02:15:59 +0100 Subject: [PATCH 2/4] Added readme doc --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index e2f4128..7188682 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,15 @@ A new metadata file is created with the following format: } } } + +### Get Word Occurrences in Dialogue + +Run +``` +python get_occurances.py +``` + + ``` ## Directory structure From a9f500850b75f43fe9d8d21f8f38b7649fd6d916 Mon Sep 17 00:00:00 2001 From: copoer Date: Sat, 18 Sep 2021 02:33:33 +0100 Subject: [PATCH 3/4] Added fixes 
--- get_occurances.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/get_occurances.py b/get_occurances.py index 80ef212..3247bd5 100644 --- a/get_occurances.py +++ b/get_occurances.py @@ -9,7 +9,7 @@ os.makedirs(WRITE_DIR) for filename in tqdm(glob.glob('scripts/parsed/dialogue/*.txt')): - with open(os.path.join(os.getcwd(), filename), 'r') as f: + with open(os.path.join(os.getcwd(), filename), 'r', encoding='utf-8') as f: words = dict() for line in f: dialogue = line.split(">",1)[1].lower() @@ -22,7 +22,8 @@ words[word] += 1 else: words[word] = 1 - words = dict(sorted(words.items(), key=lambda item: item[1])) + sorted_words = dict(sorted(words.items(), key=lambda item: item[1])) clean_filename = Path(filename).stem + clean_filename = clean_filename.replace('_dialogue', '') with open(WRITE_DIR+clean_filename+'.json', 'w') as json_file: - json.dump(words, json_file, indent=4, sort_keys=True, separators=(',',':')) + json.dump(sorted_words, json_file, indent=4, separators=(',',':')) From c029e3c2ed0c6af2dbcbed60726645b56bf01feb Mon Sep 17 00:00:00 2001 From: copoer Date: Sat, 18 Sep 2021 16:26:36 +0100 Subject: [PATCH 4/4] Added fixes --- get_all_titles.py | 12 ++++++++++++ get_occurances.py | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 get_all_titles.py diff --git a/get_all_titles.py b/get_all_titles.py new file mode 100644 index 0000000..e767775 --- /dev/null +++ b/get_all_titles.py @@ -0,0 +1,12 @@ +import os +import json +from tqdm import tqdm + +with open(os.path.join(os.getcwd(), 'scripts/metadata/clean_parsed_meta.json'), 'r', encoding='utf-8') as f: + data = json.load(f) + titles = dict() + for movie in data.items(): + name = movie[1]['file']['name'] + titles[name] = name.replace(' ', '-')+'.json' + with open('all_titles.json', 'w') as json_file: + json.dump(titles, json_file, indent=4, separators=(',',':')) diff --git a/get_occurances.py b/get_occurances.py index 3247bd5..c18e2a4 100644 --- 
a/get_occurances.py +++ b/get_occurances.py @@ -13,9 +13,9 @@ words = dict() for line in f: dialogue = line.split(">",1)[1].lower() - for word in dialogue.split(" "): + for word in dialogue.split(): if (word): - list_of_chars = ['(', ')', '.', '\n', '\"', '!', '?'] + list_of_chars = '''!()-[]{};:'"\,<>./?@#$%^&*_~''' for char in list_of_chars: word = word.replace(char,'') if (word in words):