From 980ccd48f06679f07af710b99cb6d4424d08f140 Mon Sep 17 00:00:00 2001 From: ldnovak Date: Thu, 1 Jun 2023 18:15:34 -0700 Subject: [PATCH 1/3] Update requirements, scriptsavant, and get_scripts to allow them to run --- get_scripts.py | 53 +++++++++++++++++++++++------------------ requirements.txt | 3 ++- sources/scriptsavant.py | 15 ++++-------- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/get_scripts.py b/get_scripts.py index aef264b..ee4f87a 100644 --- a/get_scripts.py +++ b/get_scripts.py @@ -6,26 +6,33 @@ DIR = os.path.join("scripts", "temp") -if not os.path.exists(DIR): - os.makedirs(DIR) - -f = open('sources.json', 'r') -data = json.load(f) -processes = [] -starttime = time.time() - -for source in data: - included = data[source] - if included == "true": - # print("Fetching scripts from %s" % (source)) - # sources.get_scripts(source=source) - # print() - p = multiprocessing.Process(target=sources.get_scripts, args=(source,)) - processes.append(p) - p.start() - -for process in processes: - process.join() - -print() -print('Time taken = {} seconds'.format(time.time() - starttime)) \ No newline at end of file +def get_scripts(): # start one process per source flagged "true" in sources.json, then wait for all of them + if not os.path.exists(DIR): + os.makedirs(DIR) + + with open('sources.json', 'r') as f: # context manager closes the handle once the config is parsed + data = json.load(f) + processes = [] + starttime = time.time() + + multiprocessing.freeze_support() + + for source in data: + included = data[source] + if included == "true": + # print("Fetching scripts from %s" % (source)) + # sources.get_scripts(source=source) + # print() + p = multiprocessing.Process(target=sources.get_scripts, args=(source,)) + processes.append(p) + p.start() + + for process in processes: + process.join() + print() + print('Time taken = {} seconds'.format(time.time() - starttime)) + + +if __name__ == '__main__': + get_scripts() + diff --git a/requirements.txt b/requirements.txt index 25453b9..bc1e774 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ tqdm==4.61.1 fuzzywuzzy==0.18.0 Unidecode==1.2.0 
textract==1.6.3 -beautifulsoup4==4.9.3 +beautifulsoup4==4.8.0 IMDbPY==2021.4.18 +numpy==1.24.3 diff --git a/sources/scriptsavant.py b/sources/scriptsavant.py index d3f30a2..44d8a8c 100644 --- a/sources/scriptsavant.py +++ b/sources/scriptsavant.py @@ -7,8 +7,7 @@ def get_scriptsavant(): - ALL_URL_1 = "https://thescriptsavant.com/free-movie-screenplays-am/" - ALL_URL_2 = "https://thescriptsavant.com/free-movie-screenplays-nz/" + ALL_URL = "https://thescriptsavant.com/movies.html" BASE_URL = "http://www.thescriptsavant.com/" SOURCE = "scriptsavant" DIR, TEMP_DIR, META_DIR = create_script_dirs(SOURCE) @@ -17,15 +16,11 @@ def get_scriptsavant(): os.path.join(DIR, f)) and os.path.getsize(os.path.join(DIR, f)) > 3000] metadata = {} - soup_1 = get_soup(ALL_URL_1) - soup_2 = get_soup(ALL_URL_2) - - movielist = soup_1.find_all('tbody')[0].find_all('a') - movielist_2 = soup_2.find_all('div', class_='fusion-text')[0].find_all('a') - movielist += movielist_2 + soup = get_soup(ALL_URL) + movielist = soup.find_all('a') for movie in tqdm(movielist, desc=SOURCE): - name = movie.text.replace("script", "").strip() + name = movie.text.replace("script", "").replace("Script", "").strip() file_name = format_filename(name) script_url = movie.get('href') @@ -42,7 +37,7 @@ def get_scriptsavant(): continue try: - text = get_pdf_text(script_url, os.path.join(SOURCE, file_name)) + text = get_pdf_text(os.path.join(BASE_URL, script_url), os.path.join(SOURCE, file_name)) except Exception as err: print(script_url) From 8851c56bc6c812b441089f15ac2c8fc6bc43d856 Mon Sep 17 00:00:00 2001 From: ldnovak Date: Fri, 2 Jun 2023 16:37:04 -0700 Subject: [PATCH 2/3] use new library to get metadata from imdb --- get_metadata.py | 38 ++++++++++++++++++-------------------- requirements.txt | 2 ++ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/get_metadata.py b/get_metadata.py index 356b47b..14754a3 100644 --- a/get_metadata.py +++ b/get_metadata.py @@ -9,12 +9,11 @@ from unidecode import 
unidecode from tqdm.std import tqdm from fuzzywuzzy import fuzz - -import imdb +from PyMovieDb import IMDB import config -ia = imdb.IMDb() +imdb = IMDB() f = open('sources.json', 'r') data = json.load(f) @@ -161,24 +160,23 @@ def get_tmdb_from_id(id): def get_imdb(name): try: - movies = ia.search_movie(name) - if len(movies) > 0: - movie_id = movies[0].movieID - movie = movies[0] - - if 'year' in movie: - release_date = movie['year'] - else: - print("Field missing in response") - return {} + movie = imdb.get_by_name(name) + if movie == imdb.NA: # compare the raw response: NA is a JSON string, so it can never equal the decoded dict + return {} + movie = json.loads(movie) - return { - "title": unidecode(movie['title']), - "release_date": release_date, - "id": movie_id, - } + if 'datePublished' in movie: + release_date = movie['datePublished'] else: + print("datePublished missing in response") return {} + movie_id = movie["url"].split("/")[-2] + return { + "title": unidecode(movie['name']), + "release_date": release_date, + "id": movie_id, + "overview": movie["description"] if "description" in movie else "", + } except Exception as err: print(err) return {} @@ -294,7 +292,7 @@ def get_imdb(name): for script in tqdm(origin): if "imdb" in origin[script] and "tmdb" not in origin[script]: # print(origin[script]["files"][0]["name"]) - imdb_id = "tt" + origin[script]["imdb"]["id"] + imdb_id = origin[script]["imdb"]["id"] movie_data = get_tmdb_from_id(imdb_id) if movie_data: origin[script]["tmdb"] = movie_data @@ -318,7 +316,7 @@ def get_imdb(name): file_name = extra_clean(origin[script]["files"][0]["name"]) if imdb_name != tmdb_name and average_ratio(file_name, tmdb_name) < 85 and average_ratio(file_name, imdb_name) > 85: - imdb_id = "tt" + origin[script]["imdb"]["id"] + imdb_id = origin[script]["imdb"]["id"] movie_data = get_tmdb_from_id(imdb_id) if movie_data: origin[script]["tmdb"] = movie_data diff --git a/requirements.txt b/requirements.txt index bc1e774..78fa7c9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,5 @@ textract==1.6.3 
beautifulsoup4==4.8.0 IMDbPY==2021.4.18 numpy==1.24.3 +python-Levenshtein==0.21.0 +PyMovieDb==0.0.8 From 7fa278211063f21e84ef9f14ffd7816fdddea56e Mon Sep 17 00:00:00 2001 From: ldnovak Date: Fri, 2 Jun 2023 18:40:29 -0700 Subject: [PATCH 3/3] fix bug with getMetadata so that it doesn't delete found files --- get_metadata.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/get_metadata.py b/get_metadata.py index 14754a3..418697a 100644 --- a/get_metadata.py +++ b/get_metadata.py @@ -192,6 +192,7 @@ def get_imdb(name): unique = [] origin = {} +names_with_bad_files = [] for source in metadata: DIR = join("scripts", "unprocessed", source) files = [join(DIR, f) for f in listdir(DIR) if isfile( @@ -209,13 +210,13 @@ def get_imdb(name): name = roman_to_int(name) name = unidecode(name) unique.append(name) - if name not in origin: - origin[name] = {"files": []} curr_script = metadata[source][script] curr_file = join("scripts", "unprocessed", source, curr_script["file_name"] + ".txt") if curr_file in files: + if name not in origin: + origin[name] = {"files": []} origin[name]["files"].append({ "name": unidecode(script), "source": source, @@ -224,9 +225,6 @@ def get_imdb(name): "size": getsize(curr_file) }) - else: - origin.pop(name) - final = sorted(list(set(unique))) print(len(final))