Skip to content

Commit e958cc8

Browse files
committed
update https module
1 parent 3876607 commit e958cc8

File tree

5 files changed

+96
-17
lines changed

5 files changed

+96
-17
lines changed

database/get_leaf_cert.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
from cryptography.hazmat.backends import default_backend
1313
from cryptography.hazmat.primitives import hashes, serialization
1414

15-
def get_certificate(hostname, port=443):
15+
def get_certificate(hostname):
1616
"""OpenSSL with TCP get the certificate"""
1717
context = ssl.create_default_context()
1818
# Disable certificate verification for the first connection
1919
context.check_hostname = False
2020
context.verify_mode = ssl.CERT_NONE
2121

22-
with socket.create_connection((hostname, port)) as sock:
22+
with socket.create_connection((hostname, 443)) as sock:
2323
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
2424
# Get certificate info
2525
cert_der = ssock.getpeercert(True)

database/https.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,34 @@ def validate_url(self, url):
8181
sys.exit(1)
8282

8383

84+
def head(self, curl):
    """Perform a HEAD request (redirects followed) with the given curl object.

    Args:
        curl: a configured pycurl.Curl instance (URL already set).

    Returns:
        str: the raw response headers decoded as UTF-8, or "" when *curl*
        is not a pycurl.Curl instance.
    """
    if not isinstance(curl, pycurl.Curl):
        return ""

    buffer = BytesIO()
    curl.setopt(pycurl.NOBODY, True)  # HEAD: request headers only, no body
    curl.setopt(pycurl.HEADERFUNCTION, buffer.write)
    curl.setopt(pycurl.FOLLOWLOCATION, True)  # follow redirects
    temp_cert_path = None

    if self.misconfigured_server:
        if not self.leaf_cert:
            sys.exit(1)
        # Pin the previously fetched leaf certificate for this request
        temp_cert_path = REQUEST_HANDLER.set_leaf(curl)

    try:
        curl.perform()
    except pycurl.error as error:
        # Bug fix: the original printed the exception *class*
        # (pycurl.error) instead of the caught exception instance,
        # so the actual curl error code/message was never shown.
        print("Error performing request:", error)
    finally:
        curl.close()

    # Remove the temporary certificate file, if one was created
    if temp_cert_path and os.path.exists(temp_cert_path):
        os.remove(temp_cert_path)

    return buffer.getvalue().decode('utf-8')
110+
111+
84112
def validate_data_type(self, content_type):
85113
"""Limit to used data types."""
86114
valid_content_types = {
@@ -89,7 +117,8 @@ def validate_data_type(self, content_type):
89117
'application/zip',
90118
'image/jpeg',
91119
'image/png',
92-
'text/html'
120+
'text/html',
121+
'head' # this is not MIME
93122
}
94123

95124
if content_type not in valid_content_types:
@@ -127,15 +156,19 @@ def get_leaf(self, url):
127156
logging.error("Failed to retrieve leaf certificate. Exiting.")
128157
sys.exit(1)
129158

130-
131-
def get_response(self, url, content_type):
132-
"""Handle all https requests"""
159+
def setup_before_get_response(self, url, content_type):
    """Validate known url and content type before issuing a request.

    Both validators terminate the program (sys.exit) on failure, so code
    after this call may assume url and content_type are acceptable.
    """
    self.validate_url(url)
    self.validate_data_type(content_type)

    # For trle.net, fetch the server's leaf certificate up front unless the
    # server is already known to be misconfigured — presumably so later
    # requests can pin it; confirm against get_leaf/set_leaf usage.
    if url.startswith("https://www.trle.net/") and not self.misconfigured_server:
        self.get_leaf(url)
138166

167+
168+
def get_response(self, url, content_type):
169+
"""Handle all https requests"""
170+
self.setup_before_get_response(url, content_type)
171+
139172
if content_type == 'application/zip':
140173
return DOWNLOADER.download_file(url)
141174

@@ -152,8 +185,12 @@ def get_response(self, url, content_type):
152185
headers_buffer = BytesIO()
153186
curl = pycurl.Curl() # pylint: disable=no-member
154187
curl.setopt(pycurl.URL, url)
155-
curl.setopt(pycurl.WRITEDATA, response_buffer)
188+
189+
if content_type == 'application/zip':
190+
return self.head(curl)
191+
156192
curl.setopt(pycurl.WRITEHEADER, headers_buffer)
193+
curl.setopt(pycurl.WRITEDATA, response_buffer)
157194

158195
if self.misconfigured_server:
159196
if not self.leaf_cert:
@@ -191,6 +228,11 @@ def get_response(self, url, content_type):
191228
if temp_cert_path and os.path.exists(temp_cert_path):
192229
os.remove(temp_cert_path)
193230

231+
return self.close_response(curl, headers, response_buffer, content_type)
232+
233+
234+
def close_response(self, curl, headers, response_buffer, content_type):
235+
"""Pack response and close curl"""
194236
if curl is None:
195237
logging.error("No curl instance")
196238
sys.exit(1)
@@ -244,6 +286,7 @@ def extract_content_type(self, headers):
244286
logging.error("Could not extract content type from header: %s", headers)
245287
return None
246288

289+
247290
class Downloader:
248291
"""Zip file downloader to be used in RequestHandler"""
249292
def __init__(self):
@@ -403,6 +446,7 @@ def get(url, content_type):
403446
'image/jpeg'
404447
'image/png'
405448
'text/html'
449+
'head'
406450
407451
url must start with:
408452
"https://www.trle.net/"

database/ideas.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,3 +255,26 @@ limit of around 2 GB.
255255

256256
This was a special walkthrough the script can't handle
257257
https://www.trle.net/sc/Levelwalk.php?lid=864
258+
259+
We should probably just look at the link first
260+
https://www.trle.net/walk/864.jpg
261+
https://www.trle.net/walk/666.htm
262+
263+
Security
264+
I have realized that we can validate the certificate carefully
265+
and have curl use certificate pinning
266+
267+
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
268+
curl_easy_setopt(curl, CURLOPT_PINNEDPUBLICKEY, "sha256//base64_encoded_hash_here"); // Pin the public key
269+
270+
and make sure the index database can't ever change the host name,
271+
unless someone uses a quantum computer or specialized illegal hardware, it could
272+
be difficult to tamper with the download of files. We must make sure we install
273+
in /usr/bin/ or in /opt/appname/bin or /usr/local/bin together with read only database
274+
we put in /usr/share /usr/local/share or /opt/appname/share as a base index or other data.
275+
276+
We cant control what happens in home, we can protect against obvious treats
277+
like another user on the computer trying to prank another user :)
278+
You're home is a dirty place and no one can help what happens there
279+
but we validate before we open any database there that is has sane
280+
permissions sanitized data in there.

database/index_view.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def print_trle_page(page):
1515
levels = page['levels']
1616

1717
# Column widths for even spacing
18-
column_widths = [20, 20, 70, 20, 15, 15, 10, 20]
18+
column_widths = [6, 20, 70, 17, 11, 16, 6, 10]
1919

2020
headers = ["ID", "Author", "Level Name", "Difficulty",
2121
"Duration", "Class", "Type", "Released"]
@@ -30,8 +30,8 @@ def print_trle_page(page):
3030
for idx, k in enumerate(row.keys()):
3131
cell = str(row[k]) # Convert each cell value to string
3232
width = column_widths[idx] # Get the correct column width
33-
truncated_text = cell[:width].ljust(width) # Truncate and pad the text
34-
cell_data.append(truncated_text)
33+
truncated_text = cell[:width-1].ljust(width-1) # Truncate and pad the text
34+
cell_data.append(truncated_text + ' ')
3535
print("".join(cell_data)) # Print the row in one line
3636

3737

database/scrape.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -272,7 +272,7 @@ def get_soup(url):
272272
Returns:
273273
BeautifulSoup: A BeautifulSoup object representing the parsed HTML content.
274274
"""
275-
if validate_url(url) == None:
275+
if validate_url(url) is None:
276276
print(f"{url} had wrong domain")
277277
sys.exit(1)
278278
return BeautifulSoup(https.get(validate_url(url), 'text/html'), 'html.parser')
@@ -291,7 +291,7 @@ def get_image(url):
291291
Raises:
292292
SystemExit: If the file format is unsupported.
293293
"""
294-
if validate_url(url) == None:
294+
if validate_url(url) is None:
295295
print(f"{url} had wrong domain")
296296
sys.exit(1)
297297
ext = url_postfix(url).lower()
@@ -313,7 +313,7 @@ def get_jpg(url):
313313
Returns:
314314
bytes: The JPEG image content in bytes.
315315
"""
316-
if validate_url(url) == None:
316+
if validate_url(url) is None:
317317
print(f"{url} had wrong domain")
318318
sys.exit(1)
319319
return https.get(validate_url(url), 'image/jpeg')
@@ -329,7 +329,7 @@ def get_png(url):
329329
Returns:
330330
bytes: The PNG image content in bytes.
331331
"""
332-
if validate_url(url) == None:
332+
if validate_url(url) is None:
333333
print(f"{url} had wrong domain")
334334
sys.exit(1)
335335
return https.get(validate_url(url), 'image/png')
@@ -345,7 +345,7 @@ def get_json(url):
345345
Returns:
346346
dict: The JSON data parsed into a Python dictionary.
347347
"""
348-
if validate_url(url) == None:
348+
if validate_url(url) is None:
349349
print(f"{url} had wrong domain")
350350
sys.exit(1)
351351
return https.get(validate_url(url), 'application/json')
@@ -361,7 +361,7 @@ def get_zip(url):
361361
Returns:
362362
dict: The ZIP file content in a dictionary format, if applicable.
363363
"""
364-
if validate_url(url) == None:
364+
if validate_url(url) is None:
365365
print(f"{url} had wrong domain")
366366
sys.exit(1)
367367
return https.get(validate_url(url), 'application/zip')
@@ -714,7 +714,19 @@ def get_trle_walkthrough(level_soup):
714714

715715
# Fetches the walkthrough content from the extracted URL
716716
url = "https://www.trle.net" + iframe_src
717-
response = https.get(url, 'text/html')
717+
718+
# Check the type of "document"
719+
typ = url_postfix(url)
720+
if typ == 'jpg':
721+
# we should handle all images here but right now
722+
# we return "" this is a bit more complex
723+
# we can't just give a binary image as text to
724+
# qt, we need to implement our own html "document" through a filter
725+
response = ""
726+
# response = https.get(url, 'image/jpeg')
727+
else:
728+
response = https.get(url, 'text/html')
729+
718730
if response:
719731
return response
720732
return None

0 commit comments

Comments
 (0)