pydata · mattpitkin · Dec 3, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -39,6 +39,7 @@ repos:
     hooks:
       - id: djlint-jinja
         types_or: ["html"]
+        exclude: ^tests/test_build/.*\.html$
 
   - repo: "https://github.com/PyCQA/doc8"
     rev: v1.1.2

diff --git a/src/pydata_sphinx_theme/short_link.py b/src/pydata_sphinx_theme/short_link.py
@@ -1,7 +1,9 @@
 """A custom Transform object to shorten github and gitlab links."""
 
+import re
+
 from typing import ClassVar
-from urllib.parse import ParseResult, urlparse, urlunparse
+from urllib.parse import unquote, urlparse
 
 from docutils import nodes
 from sphinx.transforms.post_transforms import SphinxPostTransform
@@ -39,7 +41,6 @@ class ShortenLinkTransform(SphinxPostTransform):
         "gitlab.com": "gitlab",
         "bitbucket.org": "bitbucket",
     }
-    platform = None
 
     @classmethod
     def add_platform_mapping(cls, platform, netloc):
@@ -56,90 +57,169 @@ def run(self, **kwargs):
             # only act if the uri and text are the same
             # if not the user has already customized the display of the link
             if uri is not None and text is not None and text == uri:
-                uri = urlparse(uri)
+                parsed_uri = urlparse(uri)
                 # only do something if the platform is identified
-                self.platform = self.supported_platform.get(uri.netloc)
-                if self.platform is not None:
-                    node.attributes["classes"].append(self.platform)
-                    node.children[0] = nodes.Text(self.parse_url(uri))
-
-    def parse_url(self, uri: ParseResult) -> str:
-        """Parse the content of the url with respect to the selected platform.
-
-        Args:
-            uri: the link to the platform content
-
-        Returns:
-            the reformated url title
-        """
-        path = uri.path
-        if path == "":
-            # plain url passed, return platform only
-            return self.platform
-
-        # if the path is not empty it contains a leading "/", which we don't want to
-        # include in the parsed content
-        path = path.lstrip("/")
-
-        # check the platform name and read the information accordingly
-        # as "<organisation>/<repository>#<element number>"
-        # or "<group>/<subgroup 1>/…/<subgroup N>/<repository>#<element number>"
-        if self.platform == "github":
-            # split the url content
-            parts = path.split("/")
-
-            if parts[0] == "orgs" and "/projects" in path:
-                # We have a projects board link
-                # ref: `orgs/{org}/projects/{project-id}`
-                text = f"{parts[1]}/projects#{parts[3]}"
-            else:
-                # We have an issues, PRs, or repository link
-                if len(parts) > 0:
-                    text = parts[0]  # organisation
-                if len(parts) > 1:
-                    text += f"/{parts[1]}"  # repository
-                if len(parts) > 2:
-                    if parts[2] in ["issues", "pull", "discussions"]:
-                        text += f"#{parts[-1]}"  # element number
-
-        elif self.platform == "bitbucket":
-            # split the url content
-            parts = path.split("/")
-
-            if len(parts) > 0:
-                text = parts[0]  # organisation
-            if len(parts) > 1 and not (
-                parts[-2] == "workspace" and parts[-1] == "overview"
-            ):
-                if len(parts) > 1:
-                    text += f"/{parts[1]}"  # repository
-                if len(parts) > 2:
-                    if parts[2] in ["issues", "pull-requests"]:
-                        itemnumber = parts[-1]
-                        text += f"#{itemnumber}"  # element number
-
-        elif self.platform == "gitlab":
-            # cp. https://docs.gitlab.com/ee/user/markdown.html#gitlab-specific-references
-            if "/-/" in path and any(
-                map(uri.path.__contains__, ["issues", "merge_requests"])
-            ):
-                group_and_subgroups, parts, *_ = path.split("/-/")
-                parts = parts.rstrip("/")
-                if "/" not in parts:
-                    text = f"{group_and_subgroups}/{parts}"
-                else:
-                    parts = parts.split("/")
-                    url_type, element_number, *_ = parts
-                    if not element_number:
-                        text = group_and_subgroups
-                    elif url_type == "issues":
-                        text = f"{group_and_subgroups}#{element_number}"
-                    elif url_type == "merge_requests":
-                        text = f"{group_and_subgroups}!{element_number}"
-            else:
-                # display the whole uri (after "gitlab.com/") including parameters
-                # for example "<group>/<subgroup1>/<subgroup2>/<repository>"
-                text = uri._replace(netloc="", scheme="")  # remove platform
-                text = urlunparse(text)[1:]  # combine to string and strip leading "/"
-
-        return text
+                platform = self.supported_platform.get(parsed_uri.netloc)
+                if platform is not None:
+                    short = shorten_url(platform, uri)
+                    if short != uri:
+                        node.attributes["classes"].append(platform)
+                        node.children[0] = nodes.Text(short)
+
+
+def shorten_url(platform: str, url: str) -> str:
+    """Parse the content of the path with respect to the selected platform.
+
+    Args:
+        platform: "github", "gitlab", "bitbucket", etc.
+        url: the full url to the platform content, beginning with https://
+
+    Returns:
+        short form version of the url,
+        or the full url if it could not shorten it
+    """
+    if platform == "github":
+        return shorten_github(url)
+    elif platform == "bitbucket":
+        return shorten_bitbucket(url)
+    elif platform == "gitlab":
+        return shorten_gitlab(url)
+
+    return url
+
+
+def shorten_github(url: str) -> str:
+    """
+    Convert a GitHub URL to a short form like owner/repo#123 or
+    owner/repo@abc123.
+    """
+    path = urlparse(url).path
+
+    # Pull request URL
+    if match := re.match(r"/([^/]+)/([^/]+)/pull/(\d+)", path):
+        owner, repo, pr_id = match.groups()
+        return f"{owner}/{repo}#{pr_id}"
+
+    # Issue URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/issues/(\d+)", path):
+        owner, repo, issue_id = match.groups()
+        return f"{owner}/{repo}#{issue_id}"
+
+    # Commit URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/commit/([a-f0-9]{7,40})", path):
+        owner, repo, commit_hash = match.groups()
+        return f"{owner}/{repo}@{commit_hash[:7]}"
+
+    # Branch URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/tree/([^/]+)", path):
+        owner, repo, branch = match.groups()
+        return f"{owner}/{repo}:{unquote(branch)}"
+
+    # Tag URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/releases/tag/([^/]+)", path):
+        owner, repo, tag = match.groups()
+        return f"{owner}/{repo}@{unquote(tag)}"
+
+    # File URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/blob/([^/]+)/(.*)", path):
+        owner, repo, ref, filepath = match.groups()
+        return f"{owner}/{repo}@{ref}/{unquote(filepath)}"
+
+    # No match — return the original URL
+    return url
+
+
+def shorten_gitlab(url: str) -> str:
+    """
+    Convert a GitLab URL to a short form like group/project!123 or
+    group/project@abcdef7.
+
+    Supports both canonical ('/-/') and non-canonical path formats.
+    """
+    path = urlparse(url).path
+
+    # Merge requests
+    if (m := re.match(r"^/(.+)/([^/]+)/-/merge_requests/(\d+)$", path)) or (
+        m := re.match(r"^/(.+)/([^/]+)/merge_requests/(\d+)$", path)
+    ):
+        namespace, project, mr_id = m.groups()
+        return f"{namespace}/{project}!{mr_id}"
+
+    # Issues
+    if (m := re.match(r"^/(.+)/([^/]+)/-/issues/(\d+)$", path)) or (
+        m := re.match(r"^/(.+)/([^/]+)/issues/(\d+)$", path)
+    ):
+        namespace, project, issue_id = m.groups()
+        return f"{namespace}/{project}#{issue_id}"
+
+    # Commits
+    if (m := re.match(r"^/(.+)/([^/]+)/-/commit/([a-fA-F0-9]+)$", path)) or (
+        m := re.match(r"^/(.+)/([^/]+)/commit/([a-fA-F0-9]+)$", path)
+    ):
+        namespace, project, commit_hash = m.groups()
+        return f"{namespace}/{project}@{commit_hash[:7]}"
+
+    # Branches (tree)
+    if (m := re.match(r"^https://gitlab\.com/(.+)/([^/]+)/-/tree/(.+)$", path)) or (
+        m := re.match(r"^https://gitlab\.com/(.+)/([^/]+)/tree/(.+)$", path)
+    ):
+        namespace, project, branch = m.groups()
+        return f"{namespace}/{project}:{unquote(branch)}"
+
+    # Tags
+    if (m := re.match(r"^/(.+)/([^/]+)/-/tags/(.+)$", path)) or (
+        m := re.match(r"^/(.+)/([^/]+)/tags/(.+)$", path)
+    ):
+        namespace, project, tag = m.groups()
+        return f"{namespace}/{project}@{unquote(tag)}"
+
+    # Blob (files)
+    if (m := re.match(r"^/(.+)/([^/]+)/-/blob/([^/]+)/(.+)$", path)) or (
+        m := re.match(r"^/(.+)/([^/]+)/blob/([^/]+)/(.+)$", path)
+    ):
+        namespace, project, ref, path = m.groups()
+        return f"{namespace}/{project}@{ref}/{unquote(path)}"
+
+    # No match — return the original URL
+    return url
+
+
+def shorten_bitbucket(url: str) -> str:
+    """
+    Convert a Bitbucket URL to a short form like team/repo#123 or
+    team/repo@main.
+    """
+    path = urlparse(url).path
+
+    # Pull request URL
+    if match := re.match(r"/([^/]+)/([^/]+)/pull-requests/(\d+)", path):
+        workspace, repo, pr_id = match.groups()
+        return f"{workspace}/{repo}#{pr_id}"
+
+    # Issue URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/issues/(\d+)", path):
+        workspace, repo, issue_id = match.groups()
+        return f"{workspace}/{repo}!{issue_id}"
+
+    # Commit URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/commits/([a-f0-9]+)", path):
+        workspace, repo, commit_hash = match.groups()
+        return f"{workspace}/{repo}@{commit_hash[:7]}"
+
+    # Branch URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/branch/(.+)", path):
+        workspace, repo, branch = match.groups()
+        return f"{workspace}/{repo}:{unquote(branch)}"
+
+    # Tag URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/commits/tag/(.+)", path):
+        workspace, repo, tag = match.groups()
+        return f"{workspace}/{repo}@{unquote(tag)}"
+
+    # File URL
+    elif match := re.match(r"/([^/]+)/([^/]+)/src/([^/]+)/(.*)", path):
+        workspace, repo, ref, path = match.groups()
+        return f"{workspace}/{repo}@{ref}/{unquote(path)}"
+
+    # No match — return the original URL
+    return url
diff --git a/tests/sites/base/page1.rst b/tests/sites/base/page1.rst
@@ -15,6 +15,18 @@ Page 1
     https://github.com/pydata/pydata-sphinx-theme/pull/1012
     https://github.com/orgs/pydata/projects/2
 
+    http will get shortened:
+
+    http://github.com/pydata/pydata-sphinx-theme/pull/1012
+
+    www will not get shortened:
+
+    https://www.github.com/pydata/pydata-sphinx-theme/pull/1012
+
+    will not be linkified:
+
+    github.com/pydata/pydata-sphinx-theme/pull/1012
+
 **GitLab**
 
 .. container:: gitlab-container

diff --git a/tests/test_build/bitbucket_links.html b/tests/test_build/bitbucket_links.html
@@ -1,13 +1,13 @@
 <div class="bitbucket-container docutils container">
  <p>
-  <a class="bitbucket reference external" href="https://bitbucket.org">
-   bitbucket
+  <a class="reference external" href="https://bitbucket.org">
+   https://bitbucket.org
   </a>
-  <a class="bitbucket reference external" href="https://bitbucket.org/atlassian/workspace/overview">
-   atlassian
+  <a class="reference external" href="https://bitbucket.org/atlassian/workspace/overview">
+   https://bitbucket.org/atlassian/workspace/overview
   </a>
-  <a class="bitbucket reference external" href="https://bitbucket.org/atlassian/aui">
-   atlassian/aui
+  <a class="reference external" href="https://bitbucket.org/atlassian/aui">
+   https://bitbucket.org/atlassian/aui
   </a>
   <a class="bitbucket reference external" href="https://bitbucket.org/atlassian/aui/pull-requests/4758">
    atlassian/aui#4758

diff --git a/tests/test_build/github_links.html b/tests/test_build/github_links.html
@@ -1,19 +1,41 @@
 <div class="github-container docutils container">
  <p>
-  <a class="github reference external" href="https://github.com">
-   github
+  <a class="reference external" href="https://github.com">
+   https://github.com
   </a>
-  <a class="github reference external" href="https://github.com/pydata">
-   pydata
+  <a class="reference external" href="https://github.com/pydata">
+   https://github.com/pydata
   </a>
-  <a class="github reference external" href="https://github.com/pydata/pydata-sphinx-theme">
-   pydata/pydata-sphinx-theme
+  <a class="reference external" href="https://github.com/pydata/pydata-sphinx-theme">
+   https://github.com/pydata/pydata-sphinx-theme
   </a>
   <a class="github reference external" href="https://github.com/pydata/pydata-sphinx-theme/pull/1012">
    pydata/pydata-sphinx-theme#1012
   </a>
-  <a class="github reference external" href="https://github.com/orgs/pydata/projects/2">
-   pydata/projects#2
+  <a class="reference external" href="https://github.com/orgs/pydata/projects/2">
+   https://github.com/orgs/pydata/projects/2
   </a>
  </p>
+ <p>
+  http will get shortened:
+ </p>
+ <p>
+  <a class="github reference external" href="http://github.com/pydata/pydata-sphinx-theme/pull/1012">
+   pydata/pydata-sphinx-theme#1012
+  </a>
+ </p>
+ <p>
+  www will not get shortened:
+ </p>
+ <p>
+  <a class="reference external" href="https://www.github.com/pydata/pydata-sphinx-theme/pull/1012">
+   https://www.github.com/pydata/pydata-sphinx-theme/pull/1012
+  </a>
+ </p>
+ <p>
+  will not be linkified:
+ </p>
+ <p>
+  github.com/pydata/pydata-sphinx-theme/pull/1012
+ </p>
 </div>