1717import logging
1818import random
1919import re
20+ from urllib .parse import quote
2021
2122import packaging .version
2223import requests
4142log = logging .getLogger ("murfey.server.bootstrap" )
4243
4344
45+ def _validate_package_name (package : str ) -> bool :
46+ """
47+ Check that a package name follows PEP 503 naming conventions, containing only
48+ alphanumerics, "_", "-", or "." characters
49+ """
50+ if re .match (r"^[a-z0-9\-\_\.]+$" , package ):
51+ return True
52+ else :
53+ return False
54+
55+
def _get_full_path_response(package: str) -> requests.Response:
    """
    Fetch the PyPI simple index page for a single package.

    The package name is validated first. A valid name is normalised as described in
    PEP 503 (lower-cased, with runs of "-", "_" and "." collapsed into one "-") and
    URL-quoted before it is placed in the request URL.

    Raises ValueError if the package name is not valid, and HTTPException carrying
    PyPI's status code if PyPI answers with anything other than 200.
    """

    # Reject bad names before anything is sent to PyPI
    if not _validate_package_name(package):
        raise ValueError(f"{package} is not a valid package name")

    # Sanitise and normalise package name (PEP 503)
    normalised_name = re.sub(r"[-_.]+", "-", package.lower())
    response = requests.get(f"https://pypi.org/simple/{quote(normalised_name)}")

    # Only a plain success is handed back to the caller
    if response.status_code != 200:
        raise HTTPException(status_code=response.status_code)
    return response
76+
77+
4478@pypi .get ("/" , response_class = Response )
4579def get_pypi_index ():
46- """Obtain list of all PyPI packages via the simple API (PEP 503)."""
80+ """
81+ Obtain list of all PyPI packages via the simple API (PEP 503).
82+ """
83+
4784 index = requests .get ("https://pypi.org/simple/" )
85+
4886 return Response (
4987 content = index .content ,
5088 media_type = index .headers .get ("Content-Type" ),
@@ -53,52 +91,115 @@ def get_pypi_index():
5391
5492
@pypi.get("/{package}/", response_class=Response)
def get_pypi_package_downloads_list(package: str) -> Response:
    """
    Obtain list of all package downloads from PyPI via the simple API (PEP 503), and
    rewrite all download URLs to point to this server, under the current directory.

    Every line containing a wheel link is followed by a copy of that line in which
    ".whl" is replaced by ".whl.metadata", so that the wheel metadata file
    (PEP 658) is listed as well. The media type and status code of the PyPI
    response are passed through unchanged.

    Raises whatever _get_full_path_response raises for an invalid package name or
    a non-200 answer from PyPI.
    """

    def _rewrite_pypi_url(match: re.Match) -> str:
        """
        Rebuild a hyperlink so that its target is the bare file name, i.e. relative
        to the current directory on this server. The original URL (group 1),
        including its hash fragment, is dropped; the remaining attributes of the
        tag (group 2) are kept as they are.
        """
        filename = match.group(3)
        return f'<a href="{filename}"{match.group(2)}>{filename}</a>'

    # Validate package and URL
    full_path_response = _get_full_path_response(package)

    # Work on text. latin1 maps every byte to exactly one character, so decoding
    # and re-encoding is lossless whatever the real encoding of the page is
    content_text = full_path_response.content.decode("latin1")
    content_text_list = []
    for line in content_text.splitlines():
        # Lines without hyperlinks are passed through untouched
        if "<a href" not in line:
            content_text_list.append(line)
            continue

        # Rewrite URL to point to current proxy server. The pattern is not anchored
        # to the start of the line: the simple API does not prescribe the layout of
        # the page, so a link may be indented or share its line with other markup
        line_new = re.sub(
            r'<a href="([^">]*)"([^>]*)>([^<]*)</a>',
            _rewrite_pypi_url,
            line,
        )
        content_text_list.append(line_new)

        # Add entry for wheel metadata (PEP 658)
        if ".whl" in line_new:
            content_text_list.append(line_new.replace(".whl", ".whl.metadata"))

    # Regenerate HTML structure and convert back to bytes
    content_new = "\n".join(content_text_list).encode("latin1")

    return Response(
        content=content_new,
        media_type=full_path_response.headers.get("Content-Type"),
        status_code=full_path_response.status_code,
    )
86143
87144
88145@pypi .get ("/{package}/{filename}" , response_class = Response )
89146def get_pypi_file (package : str , filename : str ):
90- """Obtain and pass through a specific download for a PyPI package."""
91- full_path_response = requests .get (f"https://pypi.org/simple/{ package } " )
147+ """
148+ Obtain and pass through a specific download for a PyPI package.
149+ """
150+
151+ def _expose_wheel_metadata (response_bytes : bytes ) -> bytes :
152+ """
153+ As of pip v22.3 (coinciding with PEP 658), pip expects to find an additonal
154+ ".whl.metadata" file based on the URL of the ".whl" file present on the PyPI Simple
155+ Index. However, because it is not listed on the webpage itself, it is not copied
156+ across to the proxy. This function adds that URL to the proxy explicitly.
157+ """
158+
159+ # Analyse API response line-by-line
160+ response_text : str = response_bytes .decode ("latin1" ) # Convert to text
161+ response_text_list = [] # Write line-by-line analysis to here
162+
163+ for line in response_text .splitlines ():
164+ # Process URLs
165+ if r"<a href=" in line :
166+ response_text_list .append (line ) # Add to list
167+
168+ # Add new line to explicitly call for wheel metadata
169+ if ".whl" in line :
170+ # Add ".metadata" to URL and file name
171+ line_new = line .replace (".whl" , ".whl.metadata" )
172+ response_text_list .append (line_new ) # Add to list
173+
174+ # Append all other lines as normal
175+ else :
176+ response_text_list .append (line )
177+
178+ # Recover original structure
179+ response_text_new = str ("\n " .join (response_text_list ))
180+ response_bytes_new = bytes (response_text_new , encoding = "latin-1" )
181+
182+ return response_bytes_new
183+
184+ # Validate package and URL
185+ full_path_response = _get_full_path_response (package )
186+
187+ # Get filename in bytes
92188 filename_bytes = re .escape (filename .encode ("latin1" ))
93189
190+ # Add explicit URLs for ".whl.metadata" files
191+ content = _expose_wheel_metadata (full_path_response .content )
192+
193+ # Find package matching the specified filename
94194 selected_package_link = re .search (
95- b'<a [^>]*? href="([^">]*)"[^>]*>' + filename_bytes + b"</a>" ,
96- full_path_response . content ,
195+ b'<a href="([^">]*)"[^>]*>' + filename_bytes + b"</a>" ,
196+ content ,
97197 )
98198 if not selected_package_link :
99199 raise HTTPException (status_code = 404 , detail = "File not found for package" )
100200 original_url = selected_package_link .group (1 )
101201 original_file = requests .get (original_url )
202+
102203 return Response (
103204 content = original_file .content ,
104205 media_type = original_file .headers .get ("Content-Type" ),
@@ -108,8 +209,10 @@ def get_pypi_file(package: str, filename: str):
108209
109210@plugins .get ("/{package}" , response_class = FileResponse )
110211def get_plugin_wheel (package : str ):
212+
111213 machine_config = get_machine_config ()
112214 wheel_path = machine_config .plugin_packages .get (package )
215+
113216 if wheel_path is None :
114217 return None
115218 return FileResponse (
@@ -124,6 +227,7 @@ def get_bootstrap_instructions(request: Request):
124227 Return a website containing instructions for installing the Murfey client on a
125228 machine with no internet access.
126229 """
230+
127231 return respond_with_template (
128232 "bootstrap.html" ,
129233 {
@@ -140,7 +244,10 @@ def get_pip_wheel():
140244 This is only used during bootstrapping by the client to identify and then
141245 download the actually newest appropriate version of pip.
142246 """
143- return get_pypi_file (package = "pip" , filename = "pip-21.3.1-py3-none-any.whl" )
247+ return get_pypi_file (
248+ package = "pip" ,
249+ filename = "pip-22.2.2-py3-none-any.whl" , # Highest version that works before PEP 658 change
250+ )
144251
145252
146253@bootstrap .get ("/murfey.whl" , response_class = Response )
@@ -153,6 +260,7 @@ def get_murfey_wheel():
153260 """
154261 full_path_response = requests .get ("https://pypi.org/simple/murfey" )
155262 wheels = {}
263+
156264 for wheel_file in re .findall (
157265 b"<a [^>]*>([^<]*).whl</a>" ,
158266 full_path_response .content ,
@@ -174,7 +282,7 @@ def get_murfey_wheel():
174282@cygwin .get ("/setup-x86_64.exe" , response_class = Response )
175283def get_cygwin_setup ():
176284 """
177- Obtain and past though a Cygwin installer from an official source.
285+ Obtain and pass through a Cygwin installer from an official source.
178286 This is used during client bootstrapping and can download and install the
179287 Cygwin distribution that then remains on the client machines.
180288 """
0 commit comments