From 959920295fee5027ee6c090c29d322e61037891c Mon Sep 17 00:00:00 2001
From: copoer <c@ooper.space>
Date: Sat, 18 Sep 2021 02:13:49 +0100
Subject: [PATCH 1/4] Added script to get number of word occurrences

---
 get_occurances.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 get_occurances.py

diff --git a/get_occurances.py b/get_occurances.py
new file mode 100644
index 0000000..80ef212
--- /dev/null
+++ b/get_occurances.py
@@ -0,0 +1,28 @@
+import os
+import json
+import glob
+from pathlib import Path
+from tqdm import tqdm
+
+WRITE_DIR = 'scripts/occurances/'  # output directory: one JSON file of word counts per input file
+if not os.path.exists(WRITE_DIR):  # create the output directory when it is missing
+    os.makedirs(WRITE_DIR)
+
+for filename in tqdm(glob.glob('scripts/parsed/dialogue/*.txt')):
+   with open(os.path.join(os.getcwd(), filename), 'r') as f:
+       words = dict()
+       for line in f:
+           dialogue = line.split(">",1)[1].lower()
+           for word in dialogue.split(" "):
+               if (word):
+                   list_of_chars = ['(', ')', '.', '\n', '\"', '!', '?']
+                   for char in list_of_chars:
+                      word = word.replace(char,'')
+                   if (word in words):
+                       words[word] += 1
+                   else:
+                       words[word] = 1
+       words = dict(sorted(words.items(), key=lambda item: item[1]))
+       clean_filename = Path(filename).stem
+       with open(WRITE_DIR+clean_filename+'.json', 'w') as json_file:
+           json.dump(words, json_file, indent=4,  sort_keys=True, separators=(',',':'))

From b00f83a412d978d1e38fe5a0402fc9cb2f590559 Mon Sep 17 00:00:00 2001
From: copoer <c@ooper.space>
Date: Sat, 18 Sep 2021 02:15:59 +0100
Subject: [PATCH 2/4] Added readme doc

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index e2f4128..7188682 100644
--- a/README.md
+++ b/README.md
@@ -201,6 +201,15 @@ A new metadata file is created with the following format:
         }
     }
 }
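+```
+
+### Get Word Occurrences in Dialogue
+
+Count how often each word appears in every parsed dialogue file. One JSON
+file per script is written to `scripts/occurances/`:
+
+```
+python get_occurances.py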
 ```
 
 ## Directory structure

From a9f500850b75f43fe9d8d21f8f38b7649fd6d916 Mon Sep 17 00:00:00 2001
From: copoer <c@ooper.space>
Date: Sat, 18 Sep 2021 02:33:33 +0100
Subject: [PATCH 3/4] Fixed input encoding, output file name and sort order of word counts

---
 get_occurances.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/get_occurances.py b/get_occurances.py
index 80ef212..3247bd5 100644
--- a/get_occurances.py
+++ b/get_occurances.py
@@ -9,7 +9,7 @@
     os.makedirs(WRITE_DIR)
 
 for filename in tqdm(glob.glob('scripts/parsed/dialogue/*.txt')):
-   with open(os.path.join(os.getcwd(), filename), 'r') as f:
+   with open(os.path.join(os.getcwd(), filename), 'r', encoding='utf-8') as f:  # explicit encoding: don't depend on the locale default
        words = dict()
        for line in f:
            dialogue = line.split(">",1)[1].lower()
@@ -22,7 +22,8 @@
                        words[word] += 1
                    else:
                        words[word] = 1
-       words = dict(sorted(words.items(), key=lambda item: item[1]))
+       sorted_words = dict(sorted(words.items(), key=lambda item: item[1]))  # ascending by count
        clean_filename = Path(filename).stem
+       clean_filename = clean_filename.replace('_dialogue', '')  # e.g. "foo_dialogue" -> "foo"
        with open(WRITE_DIR+clean_filename+'.json', 'w') as json_file:
-           json.dump(words, json_file, indent=4,  sort_keys=True, separators=(',',':'))
+           json.dump(sorted_words, json_file, indent=4, separators=(',',':'))  # no sort_keys: keep the count order

From c029e3c2ed0c6af2dbcbed60726645b56bf01feb Mon Sep 17 00:00:00 2001
From: copoer <c@ooper.space>
Date: Sat, 18 Sep 2021 16:26:36 +0100
Subject: [PATCH 4/4] Added title list script and broader punctuation stripping

---
 get_all_titles.py | 16 ++++++++++++++++
 get_occurances.py | 12 +++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)
 create mode 100644 get_all_titles.py

diff --git a/get_all_titles.py b/get_all_titles.py
new file mode 100644
index 0000000..e767775
--- /dev/null
+++ b/get_all_titles.py
@@ -0,0 +1,16 @@
+import os
+import json
+from tqdm import tqdm
+
+# Build all_titles.json: each metadata entry's file name, mapped to
+# "<name with spaces replaced by dashes>.json".
+with open(os.path.join(os.getcwd(), 'scripts/metadata/clean_parsed_meta.json'), 'r', encoding='utf-8') as f:
+    data = json.load(f)
+
+titles = dict()
+for movie in data.values():
+    name = movie['file']['name']
+    titles[name] = name.replace(' ', '-')+'.json'
+
+with open('all_titles.json', 'w') as json_file:
+    json.dump(titles, json_file, indent=4, separators=(',',':'))
diff --git a/get_occurances.py b/get_occurances.py
index 3247bd5..c18e2a4 100644
--- a/get_occurances.py
+++ b/get_occurances.py
@@ -13,9 +13,11 @@
        words = dict()
        for line in f:
            dialogue = line.split(">",1)[1].lower()
-           for word in dialogue.split(" "):
-               if (word):
-                   list_of_chars = ['(', ')', '.', '\n', '\"', '!', '?']
-                   for char in list_of_chars:
-                      word = word.replace(char,'')
+           # split() without an argument splits on any whitespace run and never yields ''.
+           for word in dialogue.split():
+               list_of_chars = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+               for char in list_of_chars:
+                   word = word.replace(char,'')
+               # Tokens made only of punctuation ("--", "...") are empty by now; don't count them.
+               if (word):
                    if (word in words):