Skip to content

Commit 9ae4b6a

Browse files
committed
add support for Back to Basics scraping
1 parent e977449 commit 9ae4b6a

File tree

13 files changed

+1281
-1247
lines changed

13 files changed

+1281
-1247
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,9 @@ if(NOT TEST)
155155
${CMAKE_SOURCE_DIR}/database/data_factory.py
156156
${CMAKE_SOURCE_DIR}/database/get_leaf_cert.py
157157
${CMAKE_SOURCE_DIR}/database/https.py
158-
${CMAKE_SOURCE_DIR}/database/scrape.py
158+
${CMAKE_SOURCE_DIR}/database/scrape_common.py
159+
${CMAKE_SOURCE_DIR}/database/scrape_trle.py
160+
${CMAKE_SOURCE_DIR}/database/scrape_trle_download.py
159161
${CMAKE_SOURCE_DIR}/database/tombll_add_data.py
160162
${CMAKE_SOURCE_DIR}/database/tombll_get_data.py
161163
${CMAKE_SOURCE_DIR}/database/tombll_get_list.py

README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ sudo pacman -S qt5-wayland qt5-webengine qt5-imageformats boost
3030

3131
### Build
3232
```shell
33-
cmake -DCMAKE_INSTALL_PREFIX=~/.local .
33+
mkdir build
34+
cd build
35+
cmake -DCMAKE_INSTALL_PREFIX=/home/$USER/.local ..
3436
make install -j$(nproc)
3537
```
3638

@@ -85,17 +87,20 @@ sudo apk add py3-pycurl py3-tqdm py3-cryptography py3-beautifulsoup4 py3-pillow
8587
```
8688

8789
Some levels won't be added because they use external or different download URLs
88-
You can add maps to the database if you cd into where you installed you're database.
90+
You can add levels to the database if you cd into where you installed your database.
91+
From your browser, copy the level number (the `lid` value) from the level page URL, e.g. https://www.trle.net/sc/levelfeatures.php?lid=3684
8992

9093
If you did just follow the command above you can use:
9194

9295

9396
```shell
94-
python3 tombll_get_data.py https://www.trle.net/sc/levelfeatures.php?lid=3684
97+
python3 tombll_get_data.py 3684
9598

9699
```
97100
Now that you have a data.json file you get a chance to edit it.
98101
Sometimes you need or want to edit those but right now I allow only trle.net
102+
You can add your own local file with its md5sum and it should not try to download it.
103+
But it has to be a zip file at this point.
99104

100105
```text
101106
"zipFileName": "",

database/https.py

Lines changed: 12 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ def validate_url(self, url):
7474
"https://www.trle.net/",
7575
"https://trle.net/",
7676
"https://trcustoms.org/",
77-
"https://data.trcustoms.org/"
77+
"https://data.trcustoms.org/",
78+
"https://www.trlevel.de/"
7879
)
7980

8081
if any(url.startswith(domain) for domain in allowed_domains):
@@ -160,6 +161,9 @@ def get_leaf(self, url):
160161

161162
def setup_before_get_response(self, url, content_type):
162163
"""Validate known URL and content type."""
164+
if not url:
165+
logging.error("Faild setup_before_get_response, url is None. Exiting.")
166+
sys.exit(1)
163167
self.validate_url(url)
164168
self.validate_data_type(content_type)
165169

@@ -170,6 +174,12 @@ def get_response(self, url, content_type):
170174
"""Handle all https requests."""
171175
self.setup_before_get_response(url, content_type)
172176

177+
if url.startswith("https://trcustoms.org"):
178+
self.misconfigured_server = False
179+
180+
if url.startswith("https://www.trlevel.de"):
181+
self.misconfigured_server = False
182+
173183
if content_type == 'application/zip':
174184
return DOWNLOADER.download_file(url)
175185

@@ -321,7 +331,7 @@ def progress_callback(self, total_to_download, downloaded, total_to_upload, uplo
321331
self.progress_bar.total = total_to_download
322332
return 0 # Returning 0 means to continue
323333

324-
def download_file_trle(self, url):
334+
def download_file(self, url):
325335
"""
326336
Download a file from the trle URL and stores its contents in a buffer.
327337
@@ -430,147 +440,7 @@ def download_file_trle(self, url):
430440

431441
return zip_file # Always return the zip_file dictionary
432442

433-
def download_file_trcustoms(self, url):
434-
"""
435-
Download a file from the trcustoms URL and stores its contents in a buffer.
436-
437-
This method utilizes the `pycurl` library to perform the download, providing
438-
a progress bar for user feedback. It handles server JSON API, follows redirects,
439-
and calculates the MD5 checksum of the downloaded file.
440-
441-
Parameters:
442-
----------
443-
url : str
444-
The URL of the file to download. Must be a valid URL.
445-
446-
Raises:
447-
-------
448-
SystemExit
449-
Exits the program if the server is misconfigured and no leaf certificate is available.
450-
451-
Exceptions:
452-
------------
453-
pycurl.error
454-
Raised if an error occurs during the download process.
455443

456-
Returns:
457-
--------
458-
dict
459-
Returns a dictionary containing details of the downloaded file, including:
460-
- 'size': Size of the file in MiB (mebibytes).
461-
- 'url': The effective URL from which the file was downloaded.
462-
- 'name': The name of the file.
463-
- 'md5': The MD5 checksum of the downloaded content.
464-
465-
Notes:
466-
------
467-
- The progress bar is displayed using the `tqdm` library to indicate the download status.
468-
- The method checks the HTTP response code after the download to ensure success (HTTP 200).
469-
- Temporary files created for certificate handling are cleaned up after the download.
470-
"""
471-
curl = pycurl.Curl()
472-
zip_file = data_factory.make_zip_file() # Initialize the zip_file dictionary
473-
474-
REQUEST_HANDLER.misconfigured_server = False
475-
level_id = url.split("https://trcustoms.org/levels/")[1]
476-
print(f'https://trcustoms.org/api/levels/{level_id}/')
477-
level_json = get(f'https://trcustoms.org/api/levels/{level_id}/', 'application/json')
478-
if not isinstance(level_json, dict):
479-
sys.exit(1)
480-
level_files = level_json.get('files', [])
481-
level_last_file = level_files[-1]
482-
url = level_last_file.get('url', '')
483-
484-
try:
485-
# Get header info
486-
total_size = level_last_file.get('size', '')
487-
zip_file['size'] = round(total_size / (1024 * 1024), 2) # Size in MiB
488-
zip_file['url'] = level_last_file.get('url', '')
489-
level_version = level_last_file.get('version', '')
490-
zip_file['name'] = f"{level_id}-title-V{level_version}-authors.zip"
491-
492-
# Set up for download
493-
curl.setopt(pycurl.URL, url)
494-
curl.setopt(pycurl.FOLLOWLOCATION, True)
495-
curl.setopt(pycurl.WRITEFUNCTION, self.write_callback)
496-
curl.setopt(pycurl.WRITEDATA, self.buffer)
497-
498-
# Enable progress meter
499-
self.progress_bar = tqdm(total=total_size,
500-
unit='B',
501-
unit_scale=True,
502-
unit_divisor=1024,
503-
desc="Downloading")
504-
505-
curl.setopt(pycurl.NOPROGRESS, False)
506-
curl.setopt(pycurl.XFERINFOFUNCTION, self.progress_callback)
507-
508-
# Perform the download
509-
curl.perform()
510-
511-
# Check for errors
512-
http_code = curl.getinfo(pycurl.RESPONSE_CODE)
513-
if http_code != 200:
514-
self.status = 1
515-
print(f"Error: HTTP response code {http_code}")
516-
return {} # Return an empty dict on error
517-
518-
self.status = 0
519-
520-
# Finalize MD5 checksum
521-
md5_hash = hashlib.md5(usedforsecurity=False)
522-
self.buffer.seek(0) # Reset buffer pointer
523-
md5_hash.update(self.buffer.getvalue())
524-
zip_file['md5'] = md5_hash.hexdigest()
525-
526-
except pycurl.error as e:
527-
self.status = 1
528-
print(f"Error: {e}")
529-
return {} # Return an empty dict on error
530-
531-
finally:
532-
if self.progress_bar:
533-
self.progress_bar.close()
534-
curl.close()
535-
REQUEST_HANDLER.misconfigured_server = True
536-
537-
return zip_file # Always return the zip_file dictionary
538-
539-
def download_file(self, url):
540-
"""
541-
Branch the download from an trle.net redirect link to trcustoms or trle.
542-
543-
This method checks the head for the domain the redirection leads to.
544-
545-
Parameters:
546-
----------
547-
url : str
548-
The URL of the file to download. Must be a valid URL.
549-
550-
Returns:
551-
--------
552-
dict
553-
Returns a dictionary containing details of the downloaded file, including:
554-
- 'size': Size of the file in MiB (mebibytes).
555-
- 'url': The effective URL from which the file was downloaded.
556-
- 'name': The name of the file.
557-
- 'md5': The MD5 checksum of the downloaded content.
558-
559-
"""
560-
head_string = REQUEST_HANDLER.head(url)
561-
redirect_url = head_string.partition("Location: ")[2].split("\r\n", 1)[0]
562-
563-
if redirect_url.startswith("https://trcustoms.org/levels/"):
564-
print("trcustoms.org")
565-
return self.download_file_trcustoms(redirect_url)
566-
if redirect_url.startswith("https://www.trle.net/levels/"):
567-
print("trle.net")
568-
return self.download_file_trle(url)
569-
if redirect_url.startswith("https://trle.net/levels/"):
570-
print("trle.net")
571-
return self.download_file_trle(url)
572-
573-
return data_factory.make_zip_file()
574444

575445

576446
ACQUIRE_LOCK = AcquireLock()

database/ideas.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ we can get those also so there will be 3 hosts but they use rar
288288
still the app should support rar also at some point
289289

290290
we need to support this kind of download link also
291-
https://www.trle.net/levels/levels/2020/0620/BtB
291+
https://www.trle.net/levels/levels/2020/0620/BtB/Web/index.html
292292

293293
we have typechecking lint, so we need to install like this:
294294
pip install types-tqdm

0 commit comments

Comments
 (0)