fix download scraping problems

noisecode3 · noisecode3 · commit b5e79198c0e5 · 2025-04-06T14:00:26.000+02:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -168,7 +168,7 @@ if(NOT TEST)
         ${CMAKE_SOURCE_DIR}/database/scrape_trle_download.py
         ${CMAKE_SOURCE_DIR}/database/tombll_add_data.py
         ${CMAKE_SOURCE_DIR}/database/tombll_get_data.py
-        ${CMAKE_SOURCE_DIR}/database/tombll_get_list.py
+        ${CMAKE_SOURCE_DIR}/database/tombll_manage_data.py
         ${CMAKE_SOURCE_DIR}/database/make_tombll_database.py
         DESTINATION ${CMAKE_INSTALL_PREFIX}/share/${PROJECT_NAME}
     )
diff --git a/database/scrape_trle.py b/database/scrape_trle.py
@@ -311,7 +311,7 @@ def get_trle_level(soup, data):
     data['screen'] = scrape_common.get_trle_screen(soup)
     data['large_screens'] = scrape_common.get_trle_large_screens(soup)
     level_id = scrape_common.trle_url_to_int(soup.find('a', string='Download').get('href'))
-    data['zip_files'] = [scrape_trle_download.get_zip_file_info(level_id)]
+    data['zip_files'] = scrape_trle_download.get_zip_file_info(level_id)
     data['body'] = scrape_common.get_trle_body(soup)
     data['walkthrough'] = get_trle_walkthrough(soup)
 
diff --git a/database/scrape_trle_download.py b/database/scrape_trle_download.py
@@ -238,13 +238,14 @@ def _get_generic_download(soup):
 
 
 def _get_trlevel_download_info(trle_info):
-    zip_file = data_factory.make_zip_file()
+    zip_file = []
     url = "https://www.trlevel.de/filebase/index.php?category-file-list/558-trle-custom-levels/"
     page_prefix = "&sortField=time&sortOrder=DESC&pageNo="
     number = 1
     max_pages = 20
 
     while True:
+
         trlevel_soup = scrape_common.get_soup(url + f"{page_prefix}{number}")
         if not trlevel_soup:
             print(f"Failed to fetch page {number}")
@@ -253,16 +254,23 @@ def _get_trlevel_download_info(trle_info):
         card_links = trlevel_soup.find_all("a", class_="filebaseFileCardLink")
         for link in card_links:
             if trle_info[0].lower() in link.get_text(strip=True).lower():
-                level = link['href'].split("https://www.trlevel.de/filebase/index.php?file/")[1]
-                level_id = level.split('-')[0]
-                download_url = f"https://www.trlevel.de/index.php?file-download/{level_id}/"
+                trlevel_level_page_soup = scrape_common.get_soup(link['href'])
+                download_tag = trlevel_level_page_soup.find(
+                        "a", class_="button buttonPrimary", itemprop="downloadUrl")
+
+                if not isinstance(download_tag, Tag):
+                    print("Did not get download url from trlevel.de page")
+                    sys.exit(1)
+
+                download_url = download_tag.get('href')
                 head = scrape_common.https.get(f"{download_url}", "head")
                 match = re.search(r'filename\*?=(?:UTF-8\'\')?["\']?([^"\']+)["\']?', head)
+                zip_file = data_factory.make_zip_file()
                 zip_file['name'] = match.group(1) if match else ''
                 zip_file['size'] = trle_info[2]
                 zip_file['md5'] = "MISSING"
                 zip_file['url'] = download_url
-                break
+                return [zip_file]
 
         if len(card_links) < 20:
             break  # Stop if less than 20 links (end of pages)
@@ -276,6 +284,7 @@ def _get_trlevel_download_info(trle_info):
 
 
 def _search_trcustoms(trle_info):
+    zip_file = []
     title = quote(trle_info[0])
     release = trle_info[1]
     trcustoms = scrape_common.get_json(f"https://trcustoms.org/api/levels/?search={title}")
@@ -288,8 +297,8 @@ def _search_trcustoms(trle_info):
                 # check if it has the files attribute
                 last_file = level.get('last_file', {})
                 if last_file:
-                    return _get_trcustoms_download_info(level)
-    return []
+                    zip_file = [_get_trcustoms_download_info(level)]
+    return zip_file
 
 
 def _check_lid(lid):
@@ -300,6 +309,7 @@ def _check_lid(lid):
             logging.error("Error: lid not a digit")
             sys.exit(1)
 
+
 def _get_download_info(lid, url):
     zip_file = data_factory.make_zip_file()
     trle_info = _get_trle_info(lid)
@@ -314,39 +324,40 @@ def get_zip_file_info(lid):
     """Entery function for trle download module."""
     _check_lid(lid)
     head = scrape_common.https.get(f"https://www.trle.net/scadm/trle_dl.php?lid={lid}", 'head')
+    files = []
     if head:
         print(f"head: {head}")
         redirect_url = head.partition("Location: ")[2].split("\r\n", 1)[0]
         print(f"Location: {redirect_url}")
         if redirect_url:
             if redirect_url.endswith(".zip") and \
                     redirect_url.startswith("https://www.trle.net/levels/levels/"):
-                return _get_download_info(lid, redirect_url)
+                files = [_get_download_info(lid, redirect_url)]
 
             if redirect_url.startswith("https://www.trle.net/sc/levelfeatures.php?lid="):
                 trle_info = _get_trle_info(lid)
-                return _search_trcustoms(trle_info)
+                files.extend(_search_trcustoms(trle_info))
 
             if redirect_url.lower().endswith("/btb/web/index.html") and \
                     redirect_url.startswith("https://www.trle.net/levels/levels"):
-                return _get_trle_btb_download_info(redirect_url, lid)
+                files = [_get_trle_btb_download_info(redirect_url, lid)]
 
             if redirect_url.endswith(".htm") and \
                     redirect_url.startswith("https://www.trle.net/levels/levels/"):
                 url = _get_generic_download(scrape_common.get_soup(redirect_url))
-                return _get_download_info(lid, url)
+                files = [_get_download_info(lid, url)]
 
             if redirect_url.startswith("https://trcustoms.org/levels/") and \
                     redirect_url.split("/")[-1].isdigit():
                 api_url = f"https://trcustoms.org/api/levels/{redirect_url.split("/")[-1]}/"
                 trcustoms_level_dict = scrape_common.get_json(api_url)
-                return _get_trcustoms_download_info(trcustoms_level_dict)
+                files = [_get_trcustoms_download_info(trcustoms_level_dict)]
 
             if redirect_url == "https://www.trlevel.de":
                 trle_info = _get_trle_info(lid)
-                return _get_trlevel_download_info(trle_info)
+                files.extend(_get_trlevel_download_info(trle_info))
 
-    return []
+    return files
 
 
 if __name__ == '__main__':
@@ -357,7 +368,5 @@ def get_zip_file_info(lid):
         sys.exit(1)
     else:
         LID = sys.argv[1]
-        if LID == "":
-            LID = "1978"
         ZIP_DATA = get_zip_file_info(LID)
         print(f"{ZIP_DATA}")
diff --git a/database/tombll_get_list.py b/database/tombll_get_list.py
diff --git a/database/tombll_manage_data.py b/database/tombll_manage_data.py
@@ -0,0 +1,134 @@
+
+import os
+import sys
+import sqlite3
+
+def print_info():
+    """Print the usage text describing the -l (list) and -r (remove) options."""
+    print("Usage: python3 tombll_manage_data.py [options]", "  Options:", sep="\n")
+    print("      -l List database records",
+          "      -r [Level.LevelID] Remove one level", sep="\n")
+
+def print_list(con):
+    """Print one (LevelID, Title, Author value) row per level.
+
+    con is the open sqlite3 connection to query. The query groups by
+    Level.LevelID, so each level is listed once.
+    """
+    level_rows = con.cursor().execute('''
+        SELECT Level.LevelID, Info.Title, Author.value
+        FROM Level
+        JOIN Info ON Level.infoID = Info.InfoID
+        JOIN AuthorList ON Level.LevelID = AuthorList.levelID
+        JOIN Author ON AuthorList.authorID = Author.AuthorID
+        GROUP BY Level.LevelID
+    ''').fetchall()
+    # Each row is printed as the raw tuple returned by sqlite3.
+    for level_row in level_rows:
+        print(level_row)
+
+def remove_level(level_id, con):
+    """Delete one level and the records that only this level references.
+
+    Rows in Picture, Zip, Tag, Genre and Author are deleted only when their
+    link table (Screens, ZipList, TagList, GenreList, AuthorList) shows no
+    other level using them; the link rows, the level's Info row and the
+    Level row itself are always deleted. All statements run in a single
+    transaction that is rolled back if any of them fails.
+
+    Args:
+        level_id: Level.LevelID, as an int or a numeric string.
+        con: Open sqlite3 connection.
+    """
+    cur = con.cursor()
+    try:
+        # Bind the id as an integer. It arrives as text from the command line
+        # and the aggregate in "MAX(levelID) = ?" has no column affinity, so a
+        # text '5' would never equal an integer 5 and no orphan would be removed.
+        level_id = int(level_id)
+        cur.execute("BEGIN;")
+        # Shared rows first, link rows second: each subquery reads the link table.
+        cur.execute('''
+            DELETE FROM Picture
+            WHERE PictureID IN (
+                SELECT pictureID
+                FROM Screens
+                GROUP BY pictureID
+                HAVING COUNT(DISTINCT levelID) = 1 AND MAX(levelID) = ?
+            );
+        ''', (level_id,))
+        cur.execute('DELETE FROM Screens WHERE levelID = ?;', (level_id,))
+
+        cur.execute('''
+            DELETE FROM Zip
+            WHERE ZipID IN (
+                SELECT zipID
+                FROM ZipList
+                GROUP BY zipID
+                HAVING COUNT(DISTINCT levelID) = 1 AND MAX(levelID) = ?
+            );
+        ''', (level_id,))
+        cur.execute('DELETE FROM ZipList WHERE levelID = ?;', (level_id,))
+
+        cur.execute('''
+            DELETE FROM Tag
+            WHERE TagID IN (
+                SELECT tagID
+                FROM TagList
+                GROUP BY tagID
+                HAVING COUNT(DISTINCT levelID) = 1 AND MAX(levelID) = ?
+            );
+        ''', (level_id,))
+        cur.execute('DELETE FROM TagList WHERE levelID = ?;', (level_id,))
+
+        cur.execute('''
+            DELETE FROM Genre
+            WHERE genreID IN (
+                SELECT genreID
+                FROM GenreList
+                GROUP BY genreID
+                HAVING COUNT(DISTINCT levelID) = 1 AND MAX(levelID) = ?
+            );
+        ''', (level_id,))
+        cur.execute('DELETE FROM GenreList WHERE levelID = ?;', (level_id,))
+
+        cur.execute('''
+            DELETE FROM Author
+            WHERE AuthorID IN (
+                SELECT authorID
+                FROM AuthorList
+                GROUP BY authorID
+                HAVING COUNT(DISTINCT levelID) = 1 AND MAX(levelID) = ?
+            );
+        ''', (level_id,))
+        cur.execute('DELETE FROM AuthorList WHERE levelID = ?;', (level_id,))
+        # Info before Level: the subquery looks the InfoID up through Level.
+        cur.execute('''
+            DELETE FROM Info
+            WHERE InfoID IN (
+                SELECT infoID
+                FROM Level WHERE LevelID = ?
+            );
+        ''', (level_id,))
+        cur.execute('DELETE FROM Level WHERE LevelID = ?;', (level_id,))
+        con.commit()
+        print(f"Level {level_id} and related authors removed successfully.")
+    except (ValueError, sqlite3.Error) as any_error:
+        con.rollback()
+        print(f"Error while deleting level {level_id}: {any_error}")
+
+if __name__ == "__main__":
+    ARGS = sys.argv[1:]
+    # Only "-l" and "-r <LevelID>" need the database; everything else just
+    # prints the usage text (exit status 1 when the argument count is wrong).
+    if ARGS != ["-l"] and (len(ARGS) != 2 or ARGS[0] != "-r"):
+        print_info()
+        sys.exit(0 if 1 <= len(ARGS) <= 2 else 1)
+    CON = sqlite3.connect(os.path.dirname(os.path.abspath(__file__)) + '/tombll.db')
+    try:
+        if ARGS == ["-l"]:
+            print_list(CON)
+        else:
+            remove_level(ARGS[1], CON)
+    finally:
+        CON.close()

Original file line number	Diff line number	Diff line change
`@@ -168,7 +168,7 @@ if(NOT TEST)`
`168`	`168`	`${CMAKE_SOURCE_DIR}/database/scrape_trle_download.py`
`169`	`169`	`${CMAKE_SOURCE_DIR}/database/tombll_add_data.py`
`170`	`170`	`${CMAKE_SOURCE_DIR}/database/tombll_get_data.py`
`171`		`- ${CMAKE_SOURCE_DIR}/database/tombll_get_list.py`
	`171`	`+ ${CMAKE_SOURCE_DIR}/database/tombll_manage_data.py`
`172`	`172`	`${CMAKE_SOURCE_DIR}/database/make_tombll_database.py`
`173`	`173`	`DESTINATION ${CMAKE_INSTALL_PREFIX}/share/${PROJECT_NAME}`
`174`	`174`	`)`