Skip to content

Commit b5e7919

Browse files
committed
fix download scraping problems
1 parent 7d7d171 commit b5e7919

File tree

5 files changed

+161
-49
lines changed

5 files changed

+161
-49
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ if(NOT TEST)
168168
${CMAKE_SOURCE_DIR}/database/scrape_trle_download.py
169169
${CMAKE_SOURCE_DIR}/database/tombll_add_data.py
170170
${CMAKE_SOURCE_DIR}/database/tombll_get_data.py
171-
${CMAKE_SOURCE_DIR}/database/tombll_get_list.py
171+
${CMAKE_SOURCE_DIR}/database/tombll_manage_data.py
172172
${CMAKE_SOURCE_DIR}/database/make_tombll_database.py
173173
DESTINATION ${CMAKE_INSTALL_PREFIX}/share/${PROJECT_NAME}
174174
)

database/scrape_trle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ def get_trle_level(soup, data):
311311
data['screen'] = scrape_common.get_trle_screen(soup)
312312
data['large_screens'] = scrape_common.get_trle_large_screens(soup)
313313
level_id = scrape_common.trle_url_to_int(soup.find('a', string='Download').get('href'))
314-
data['zip_files'] = [scrape_trle_download.get_zip_file_info(level_id)]
314+
data['zip_files'] = scrape_trle_download.get_zip_file_info(level_id)
315315
data['body'] = scrape_common.get_trle_body(soup)
316316
data['walkthrough'] = get_trle_walkthrough(soup)
317317

database/scrape_trle_download.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,14 @@ def _get_generic_download(soup):
238238

239239

240240
def _get_trlevel_download_info(trle_info):
241-
zip_file = data_factory.make_zip_file()
241+
zip_file = []
242242
url = "https://www.trlevel.de/filebase/index.php?category-file-list/558-trle-custom-levels/"
243243
page_prefix = "&sortField=time&sortOrder=DESC&pageNo="
244244
number = 1
245245
max_pages = 20
246246

247247
while True:
248+
248249
trlevel_soup = scrape_common.get_soup(url + f"{page_prefix}{number}")
249250
if not trlevel_soup:
250251
print(f"Failed to fetch page {number}")
@@ -253,16 +254,23 @@ def _get_trlevel_download_info(trle_info):
253254
card_links = trlevel_soup.find_all("a", class_="filebaseFileCardLink")
254255
for link in card_links:
255256
if trle_info[0].lower() in link.get_text(strip=True).lower():
256-
level = link['href'].split("https://www.trlevel.de/filebase/index.php?file/")[1]
257-
level_id = level.split('-')[0]
258-
download_url = f"https://www.trlevel.de/index.php?file-download/{level_id}/"
257+
trlevel_level_page_soup = scrape_common.get_soup(link['href'])
258+
download_tag = trlevel_level_page_soup.find(
259+
"a", class_="button buttonPrimary", itemprop="downloadUrl")
260+
261+
if not isinstance(download_tag, Tag):
262+
print("Did not get download url from trlevel.de page")
263+
sys.exit(1)
264+
265+
download_url = download_tag.get('href')
259266
head = scrape_common.https.get(f"{download_url}", "head")
260267
match = re.search(r'filename\*?=(?:UTF-8\'\')?["\']?([^"\']+)["\']?', head)
268+
zip_file = data_factory.make_zip_file()
261269
zip_file['name'] = match.group(1) if match else ''
262270
zip_file['size'] = trle_info[2]
263271
zip_file['md5'] = "MISSING"
264272
zip_file['url'] = download_url
265-
break
273+
return [zip_file]
266274

267275
if len(card_links) < 20:
268276
break # Stop if less than 20 links (end of pages)
@@ -276,6 +284,7 @@ def _get_trlevel_download_info(trle_info):
276284

277285

278286
def _search_trcustoms(trle_info):
287+
zip_file = []
279288
title = quote(trle_info[0])
280289
release = trle_info[1]
281290
trcustoms = scrape_common.get_json(f"https://trcustoms.org/api/levels/?search={title}")
@@ -288,8 +297,8 @@ def _search_trcustoms(trle_info):
288297
# check if it has the files attribute
289298
last_file = level.get('last_file', {})
290299
if last_file:
291-
return _get_trcustoms_download_info(level)
292-
return []
300+
zip_file = [_get_trcustoms_download_info(level)]
301+
return zip_file
293302

294303

295304
def _check_lid(lid):
@@ -300,6 +309,7 @@ def _check_lid(lid):
300309
logging.error("Error: lid not a digit")
301310
sys.exit(1)
302311

312+
303313
def _get_download_info(lid, url):
304314
zip_file = data_factory.make_zip_file()
305315
trle_info = _get_trle_info(lid)
@@ -314,39 +324,40 @@ def get_zip_file_info(lid):
314324
"""Entery function for trle download module."""
315325
_check_lid(lid)
316326
head = scrape_common.https.get(f"https://www.trle.net/scadm/trle_dl.php?lid={lid}", 'head')
327+
files = []
317328
if head:
318329
print(f"head: {head}")
319330
redirect_url = head.partition("Location: ")[2].split("\r\n", 1)[0]
320331
print(f"Location: {redirect_url}")
321332
if redirect_url:
322333
if redirect_url.endswith(".zip") and \
323334
redirect_url.startswith("https://www.trle.net/levels/levels/"):
324-
return _get_download_info(lid, redirect_url)
335+
files = [_get_download_info(lid, redirect_url)]
325336

326337
if redirect_url.startswith("https://www.trle.net/sc/levelfeatures.php?lid="):
327338
trle_info = _get_trle_info(lid)
328-
return _search_trcustoms(trle_info)
339+
files.extend(_search_trcustoms(trle_info))
329340

330341
if redirect_url.lower().endswith("/btb/web/index.html") and \
331342
redirect_url.startswith("https://www.trle.net/levels/levels"):
332-
return _get_trle_btb_download_info(redirect_url, lid)
343+
files = [_get_trle_btb_download_info(redirect_url, lid)]
333344

334345
if redirect_url.endswith(".htm") and \
335346
redirect_url.startswith("https://www.trle.net/levels/levels/"):
336347
url = _get_generic_download(scrape_common.get_soup(redirect_url))
337-
return _get_download_info(lid, url)
348+
files = [_get_download_info(lid, url)]
338349

339350
if redirect_url.startswith("https://trcustoms.org/levels/") and \
340351
redirect_url.split("/")[-1].isdigit():
341352
api_url = f"https://trcustoms.org/api/levels/{redirect_url.split("/")[-1]}/"
342353
trcustoms_level_dict = scrape_common.get_json(api_url)
343-
return _get_trcustoms_download_info(trcustoms_level_dict)
354+
files = [_get_trcustoms_download_info(trcustoms_level_dict)]
344355

345356
if redirect_url == "https://www.trlevel.de":
346357
trle_info = _get_trle_info(lid)
347-
return _get_trlevel_download_info(trle_info)
358+
files.extend(_get_trlevel_download_info(trle_info))
348359

349-
return []
360+
return files
350361

351362

352363
if __name__ == '__main__':
@@ -357,7 +368,5 @@ def get_zip_file_info(lid):
357368
sys.exit(1)
358369
else:
359370
LID = sys.argv[1]
360-
if LID == "":
361-
LID = "1978"
362371
ZIP_DATA = get_zip_file_info(LID)
363372
print(f"{ZIP_DATA}")

database/tombll_get_list.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

database/tombll_manage_data.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
2+
import os
3+
import sys
4+
import sqlite3
5+
6+
def print_info():
    """Print the command-line usage summary to stdout.

    Lists every option the script entry point accepts, including ``-h``,
    which is handled there but was missing from the help text.
    """
    print("Usage: python3 tombll_manage_data.py [options]")
    print(" Options:")
    print(" -h Show this help")
    print(" -l List database records")
    print(" -r [Level.LevelID] Remove one level")
11+
12+
def print_list(con):
    """Print one ``(LevelID, Title, authors)`` tuple per level.

    Args:
        con: Open sqlite3 connection to the tombll database.

    Levels are listed in LevelID order. Authors are joined with LEFT JOINs
    and collapsed with GROUP_CONCAT so that a level with several authors
    shows all of them, and a level with no author rows is still listed
    (its author field is None) and can therefore be found for removal.
    """
    cur = con.cursor()
    cur.execute('''
        SELECT Level.LevelID, Info.Title, GROUP_CONCAT(Author.value, ', ')
        FROM Level
        JOIN Info ON Level.infoID = Info.InfoID
        LEFT JOIN AuthorList ON Level.LevelID = AuthorList.levelID
        LEFT JOIN Author ON AuthorList.authorID = Author.AuthorID
        GROUP BY Level.LevelID, Info.Title
        ORDER BY Level.LevelID
    ''')

    # The cursor is iterable, so rows are streamed instead of being
    # materialised with fetchall() first.
    for row in cur:
        print(row)
29+
30+
def remove_level(level_id, con):
    """Delete one level together with the records that only it references.

    For each shared entity table (Picture, Zip, Tag, Genre, Author) the rows
    linked exclusively to this level are deleted, followed by the level's
    rows in the matching link table. Then the level's Info row and finally
    the Level row itself are removed. All statements run in one transaction
    that is rolled back if any of them fails.

    Args:
        level_id: Level.LevelID as an int or a decimal string (the command
            line passes a string).
        con: Open sqlite3 connection to the tombll database.

    Returns:
        True if a Level row was deleted, False if the ID was invalid, the
        level did not exist, or a database error occurred.
    """
    try:
        # Bind an int rather than the raw command-line string. The result of
        # ``MAX(levelID)`` has no column affinity, so SQLite compares it to a
        # bound TEXT value as-is and integer 12 never equals text '12'; with
        # a string the exclusive-row cleanup below would match nothing.
        level_key = int(level_id)
    except (TypeError, ValueError):
        print(f"Error while deleting level {level_id}: LevelID must be an integer")
        return False

    # (entity table, entity key, link table, link key). These are fixed
    # identifiers, never user input, so interpolating them into SQL is safe.
    shared_tables = (
        ("Picture", "PictureID", "Screens", "pictureID"),
        ("Zip", "ZipID", "ZipList", "zipID"),
        ("Tag", "TagID", "TagList", "tagID"),
        ("Genre", "genreID", "GenreList", "genreID"),
        ("Author", "AuthorID", "AuthorList", "authorID"),
    )

    cur = con.cursor()
    try:
        # Start transaction
        cur.execute("BEGIN;")

        for entity, entity_key, link, link_key in shared_tables:
            # Delete entity rows used by this level and by no other level.
            # This must run before the link rows are removed, because the
            # link table is what tells us who else uses the entity.
            cur.execute(
                f'''
                DELETE FROM {entity}
                WHERE {entity_key} IN (
                    SELECT {link_key}
                    FROM {link}
                    GROUP BY {link_key}
                    HAVING COUNT(DISTINCT levelID) = 1 AND MAX(levelID) = ?
                );
                ''',
                (level_key,),
            )
            cur.execute(f'DELETE FROM {link} WHERE levelID = ?;', (level_key,))

        # Info is looked up through the Level row, so it goes before Level.
        cur.execute(
            '''
            DELETE FROM Info
            WHERE InfoID IN (
                SELECT infoID
                FROM Level WHERE LevelID = ?
            );
            ''',
            (level_key,),
        )

        cur.execute('DELETE FROM Level WHERE LevelID = ?;', (level_key,))
        removed = cur.rowcount > 0

        # Commit if all went well
        con.commit()
    except sqlite3.Error as db_error:
        # Rollback in case of any database error
        con.rollback()
        print(f"Error while deleting level {level_id}: {db_error}")
        return False

    if removed:
        print(f"Level {level_id} and related authors removed successfully.")
    else:
        print(f"No level with ID {level_id} found.")
    return removed
119+
120+
if __name__ == "__main__":
    # Script entry point: "-l" lists levels, "-r <LevelID>" removes one,
    # anything else (including "-h") prints the usage text.
    ARGS = sys.argv[1:]
    if not ARGS or len(ARGS) >= 3:
        print_info()
        sys.exit(1)

    if ARGS == ["-l"] or (len(ARGS) == 2 and ARGS[0] == "-r"):
        # Open the database only when a command actually needs it:
        # sqlite3.connect() creates the file if it is missing, which should
        # not happen just for printing help or rejecting bad arguments.
        CON = sqlite3.connect(
            os.path.join(os.path.dirname(os.path.abspath(__file__)), 'tombll.db'))
        try:
            if ARGS[0] == "-l":
                print_list(CON)
            else:
                remove_level(ARGS[1], CON)
        finally:
            # Close the connection even if the handler raises.
            CON.close()
    else:
        print_info()

0 commit comments

Comments
 (0)