Added compatibility for pip >v22.2.2 (#277)

tieneupin · web-flow · commit 9de091794058 · 2024-05-24T10:13:31.000+01:00
* Also pulls the hidden `.whl.metadata` files from the PyPI simple API, which newer versions of `pip` look for.
diff --git a/pyproject.toml b/pyproject.toml
@@ -30,7 +30,7 @@ classifiers = [
     "Programming Language :: Python :: 3.12",
 ]
 dependencies = [
-    "pydantic",
+    "pydantic<2", # Pip hops between installing v2.7 or v1.10 depending on which of the additional dependencies are requested
     "requests",
     "rich",
     "werkzeug",
diff --git a/src/murfey/server/__init__.py b/src/murfey/server/__init__.py
@@ -111,7 +111,7 @@ def sanitise(in_string: str) -> str:
     return in_string.replace("\r\n", "").replace("\n", "")
 
 
-def santise_path(in_path: Path) -> Path:
+def sanitise_path(in_path: Path) -> Path:
     return Path("/".join(secure_filename(p) for p in in_path.parts))
 
 
diff --git a/src/murfey/server/api.py b/src/murfey/server/api.py
@@ -43,6 +43,7 @@
     get_machine_config,
     get_microscope,
     get_tomo_preproc_params,
+    sanitise,
     templates,
 )
 from murfey.server.config import from_file, settings
@@ -110,10 +111,6 @@
 router = APIRouter()
 
 
-def sanitise(in_string: str) -> str:
-    return in_string.replace("\r\n", "").replace("\n", "")
-
-
 # This will be the homepage for a given microscope.
 @router.get("/", response_class=HTMLResponse)
 async def root(request: Request):
diff --git a/src/murfey/server/bootstrap.py b/src/murfey/server/bootstrap.py
@@ -17,6 +17,7 @@
 import logging
 import random
 import re
+from urllib.parse import quote
 
 import packaging.version
 import requests
@@ -41,10 +42,47 @@
 log = logging.getLogger("murfey.server.bootstrap")
 
 
+def _validate_package_name(package: str) -> bool:
+    """
+    Check that a package name follows PEP 503 naming conventions, containing only
+    lowercase alphanumerics, "_", "-", or "." characters
+    """
+    if re.match(r"^[a-z0-9\-\_\.]+$", package):
+        return True
+    else:
+        return False
+
+
+def _get_full_path_response(package: str) -> requests.Response:
+    """
+    Validates the package name, sanitises it if valid, and attempts to return an HTTP
+    response from PyPI.
+    """
+
+    if _validate_package_name(package):
+        # Sanitise and normalise package name (PEP 503)
+        package_clean = quote(re.sub(r"[-_.]+", "-", package.lower()))
+
+        # Get HTTP response
+        url = f"https://pypi.org/simple/{package_clean}"
+        response = requests.get(url)
+
+        if response.status_code == 200:
+            return response
+        else:
+            raise HTTPException(status_code=response.status_code)
+    else:
+        raise ValueError(f"{package} is not a valid package name")
+
+
 @pypi.get("/", response_class=Response)
 def get_pypi_index():
-    """Obtain list of all PyPI packages via the simple API (PEP 503)."""
+    """
+    Obtain list of all PyPI packages via the simple API (PEP 503).
+    """
+
     index = requests.get("https://pypi.org/simple/")
+
     return Response(
         content=index.content,
         media_type=index.headers.get("Content-Type"),
@@ -53,52 +91,115 @@ def get_pypi_index():
 
 
 @pypi.get("/{package}/", response_class=Response)
-def get_pypi_package_downloads_list(package: str):
-    """Obtain list of all package downloads from PyPI via the simple API (PEP 503),
-    and rewrite all download URLs to point to this server,
-    underneath the current directory."""
-    full_path_response = requests.get(f"https://pypi.org/simple/{package}")
-
-    def rewrite_pypi_url(match):
-        url = match.group(4)
-        return (
-            b"<a "
-            + match.group(1)
-            + b'href="'
-            + url
-            + b'"'
-            + match.group(3)
-            + b">"
-            + match.group(4)
-            + b"</a>"
-        )
+def get_pypi_package_downloads_list(package: str) -> Response:
+    """
+    Obtain list of all package downloads from PyPI via the simple API (PEP 503), and
+    rewrite all download URLs to point to this server, under the current directory.
+    """
+
+    def _rewrite_pypi_url(match):
+        """
+        Use regular expression matching to rewrite the URLs. Points them from
+        pythonhosted.org to current server, and removes the hash from the URL as well
+        """
+        # url = match.group(4)  # Original
+        url = match.group(3)
+        return '<a href="' + url + '"' + match.group(2) + ">" + match.group(3) + "</a>"
+
+    # Validate package and URL
+    full_path_response = _get_full_path_response(package)
+
+    # Process lines related to PyPI packages in response
+    content: bytes = full_path_response.content  # In bytes
+    content_text: str = content.decode("latin1")  # Convert to strings
+    content_text_list = []
+    for line in content_text.splitlines():
+        # Look for lines with hyperlinks
+        if "<a href" in line:
+            # Rewrite URL to point to current proxy server
+            line_new = re.sub(
+                '^<a href="([^">]*)"([^>]*)>([^<]*)</a>',  # Regex search criteria
+                _rewrite_pypi_url,  # Search criteria applied to this function
+                line,
+            )
+            content_text_list.append(line_new)
+
+            # Add entry for wheel metadata (PEP 658; see _expose_wheel_metadata)
+            if ".whl" in line_new:
+                line_metadata = line_new.replace(".whl", ".whl.metadata")
+                content_text_list.append(line_metadata)
+        else:
+            # Append other lines as normal
+            content_text_list.append(line)
+
+    content_text_new = str("\n".join(content_text_list))  # Regenerate HTML structure
+    content_new = content_text_new.encode("latin1")  # Convert back to bytes
 
-    content = re.sub(
-        b'<a ([^>]*)href="([^">]*)"([^>]*)>([^<]*)</a>',
-        rewrite_pypi_url,
-        full_path_response.content,
-    )
     return Response(
-        content=content,
+        content=content_new,
         media_type=full_path_response.headers.get("Content-Type"),
         status_code=full_path_response.status_code,
     )
 
 
 @pypi.get("/{package}/{filename}", response_class=Response)
 def get_pypi_file(package: str, filename: str):
-    """Obtain and pass through a specific download for a PyPI package."""
-    full_path_response = requests.get(f"https://pypi.org/simple/{package}")
+    """
+    Obtain and pass through a specific download for a PyPI package.
+    """
+
+    def _expose_wheel_metadata(response_bytes: bytes) -> bytes:
+        """
+        As of pip v22.3 (coinciding with PEP 658), pip expects to find an additional
+        ".whl.metadata" file based on the URL of the ".whl" file present on the PyPI Simple
+        Index. However, because it is not listed on the webpage itself, it is not copied
+        across to the proxy. This function adds that URL to the proxy explicitly.
+        """
+
+        # Analyse API response line-by-line
+        response_text: str = response_bytes.decode("latin1")  # Convert to text
+        response_text_list = []  # Write line-by-line analysis to here
+
+        for line in response_text.splitlines():
+            # Process URLs
+            if r"<a href=" in line:
+                response_text_list.append(line)  # Add to list
+
+                # Add new line to explicitly call for wheel metadata
+                if ".whl" in line:
+                    # Add ".metadata" to URL and file name
+                    line_new = line.replace(".whl", ".whl.metadata")
+                    response_text_list.append(line_new)  # Add to list
+
+            # Append all other lines as normal
+            else:
+                response_text_list.append(line)
+
+        # Recover original structure
+        response_text_new = str("\n".join(response_text_list))
+        response_bytes_new = bytes(response_text_new, encoding="latin-1")
+
+        return response_bytes_new
+
+    # Validate package and URL
+    full_path_response = _get_full_path_response(package)
+
+    # Get filename in bytes
     filename_bytes = re.escape(filename.encode("latin1"))
 
+    # Add explicit URLs for ".whl.metadata" files
+    content = _expose_wheel_metadata(full_path_response.content)
+
+    # Find package matching the specified filename
     selected_package_link = re.search(
-        b'<a [^>]*?href="([^">]*)"[^>]*>' + filename_bytes + b"</a>",
-        full_path_response.content,
+        b'<a href="([^">]*)"[^>]*>' + filename_bytes + b"</a>",
+        content,
     )
     if not selected_package_link:
         raise HTTPException(status_code=404, detail="File not found for package")
     original_url = selected_package_link.group(1)
     original_file = requests.get(original_url)
+
     return Response(
         content=original_file.content,
         media_type=original_file.headers.get("Content-Type"),
@@ -108,8 +209,10 @@ def get_pypi_file(package: str, filename: str):
 
 @plugins.get("/{package}", response_class=FileResponse)
 def get_plugin_wheel(package: str):
+
     machine_config = get_machine_config()
     wheel_path = machine_config.plugin_packages.get(package)
+
     if wheel_path is None:
         return None
     return FileResponse(
@@ -124,6 +227,7 @@ def get_bootstrap_instructions(request: Request):
     Return a website containing instructions for installing the Murfey client on a
     machine with no internet access.
     """
+
     return respond_with_template(
         "bootstrap.html",
         {
@@ -140,7 +244,10 @@ def get_pip_wheel():
     This is only used during bootstrapping by the client to identify and then
     download the actually newest appropriate version of pip.
     """
-    return get_pypi_file(package="pip", filename="pip-21.3.1-py3-none-any.whl")
+    return get_pypi_file(
+        package="pip",
+        filename="pip-22.2.2-py3-none-any.whl",  # Highest version that works before PEP 658 change
+    )
 
 
 @bootstrap.get("/murfey.whl", response_class=Response)
@@ -153,6 +260,7 @@ def get_murfey_wheel():
     """
     full_path_response = requests.get("https://pypi.org/simple/murfey")
     wheels = {}
+
     for wheel_file in re.findall(
         b"<a [^>]*>([^<]*).whl</a>",
         full_path_response.content,
@@ -174,7 +282,7 @@ def get_murfey_wheel():
 @cygwin.get("/setup-x86_64.exe", response_class=Response)
 def get_cygwin_setup():
     """
-    Obtain and past though a Cygwin installer from an official source.
+    Obtain and pass through a Cygwin installer from an official source.
     This is used during client bootstrapping and can download and install the
     Cygwin distribution that then remains on the client machines.
     """
diff --git a/src/murfey/server/demo_api.py b/src/murfey/server/demo_api.py
@@ -32,7 +32,7 @@
     get_hostname,
     get_microscope,
     sanitise,
-    santise_path,
+    sanitise_path,
 )
 from murfey.server import shutdown as _shutdown
 from murfey.server import templates
@@ -968,7 +968,7 @@ def flush_tomography_processing(
 async def request_tomography_preprocessing(
     visit_name: str, client_id: int, proc_file: ProcessFile, db=murfey_db
 ):
-    if not santise_path(Path(proc_file.path)).exists():
+    if not sanitise_path(Path(proc_file.path)).exists():
         log.warning(
             f"{sanitise(str(proc_file.path))} has not been transferred before preprocessing"
         )
diff --git a/src/murfey/util/__init__.py b/src/murfey/util/__init__.py
@@ -21,6 +21,10 @@
 logger = logging.getLogger("murfey.util")
 
 
+def sanitise(in_string: str) -> str:
+    return in_string.replace("\r\n", "").replace("\n", "")
+
+
 @lru_cache(maxsize=1)
 def get_machine_config(url: str, demo: bool = False) -> dict:
     return requests.get(f"{url}/machine/").json()
diff --git a/src/murfey/util/lif.py b/src/murfey/util/lif.py
@@ -15,14 +15,12 @@
 from readlif.reader import LifFile
 from tifffile import imwrite
 
+from murfey.util import sanitise
+
 # Create logger object to output messages with
 logger = logging.getLogger("murfey.util.lif")
 
 
-def sanitise(in_string: str) -> str:
-    return in_string.replace("\r\n", "").replace("\n", "")
-
-
 def get_xml_metadata(
     file: LifFile,
     save_xml: Optional[Path] = None,

Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ classifiers = [`
`30`	`30`	`"Programming Language :: Python :: 3.12",`
`31`	`31`	`]`
`32`	`32`	`dependencies = [`
`33`		`- "pydantic",`
	`33`	`+ "pydantic<2", # Pip hops between installing v2.7 or v1.10 depending on which of the additional dependencies are requested`
`34`	`34`	`"requests",`
`35`	`35`	`"rich",`
`36`	`36`	`"werkzeug",`