Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ repos:
hooks:
- id: djlint-jinja
types_or: ["html"]
exclude: ^tests/test_build/.*\.html$
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Linter was complaining about http link in a test fixture so I excluded it here.


- repo: "https://github.com/PyCQA/doc8"
rev: v1.1.2
Expand Down
256 changes: 168 additions & 88 deletions src/pydata_sphinx_theme/short_link.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
"""A custom Transform object to shorten github and gitlab links."""

import re

from typing import ClassVar
from urllib.parse import ParseResult, urlparse, urlunparse
from urllib.parse import unquote, urlparse

from docutils import nodes
from sphinx.transforms.post_transforms import SphinxPostTransform
Expand Down Expand Up @@ -39,7 +41,6 @@ class ShortenLinkTransform(SphinxPostTransform):
"gitlab.com": "gitlab",
"bitbucket.org": "bitbucket",
}
platform = None

@classmethod
def add_platform_mapping(cls, platform, netloc):
Expand All @@ -56,90 +57,169 @@ def run(self, **kwargs):
# only act if the uri and text are the same
# if not the user has already customized the display of the link
if uri is not None and text is not None and text == uri:
uri = urlparse(uri)
parsed_uri = urlparse(uri)
# only do something if the platform is identified
self.platform = self.supported_platform.get(uri.netloc)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% sure about this part of the class refactor.

I got rid of the platform member. I also renamed the parse_url method and moved it out of the class.

If I am reading the code correctly, it seems odd to me that each time this class encounters a node, it changes self.platform to whatever is matched in that moment.

I felt like it would be cleaner and easier to test if I just passed platform as an argument to the shortener function.

if self.platform is not None:
node.attributes["classes"].append(self.platform)
node.children[0] = nodes.Text(self.parse_url(uri))

def parse_url(self, uri: ParseResult) -> str:
"""Parse the content of the url with respect to the selected platform.

Args:
uri: the link to the platform content

Returns:
the reformated url title
"""
path = uri.path
if path == "":
# plain url passed, return platform only
return self.platform

# if the path is not empty it contains a leading "/", which we don't want to
# include in the parsed content
path = path.lstrip("/")

# check the platform name and read the information accordingly
# as "<organisation>/<repository>#<element number>"
# or "<group>/<subgroup 1>/…/<subgroup N>/<repository>#<element number>"
if self.platform == "github":
# split the url content
parts = path.split("/")

if parts[0] == "orgs" and "/projects" in path:
# We have a projects board link
# ref: `orgs/{org}/projects/{project-id}`
text = f"{parts[1]}/projects#{parts[3]}"
else:
# We have an issues, PRs, or repository link
if len(parts) > 0:
text = parts[0] # organisation
if len(parts) > 1:
text += f"/{parts[1]}" # repository
if len(parts) > 2:
if parts[2] in ["issues", "pull", "discussions"]:
text += f"#{parts[-1]}" # element number

elif self.platform == "bitbucket":
# split the url content
parts = path.split("/")

if len(parts) > 0:
text = parts[0] # organisation
if len(parts) > 1 and not (
parts[-2] == "workspace" and parts[-1] == "overview"
):
if len(parts) > 1:
text += f"/{parts[1]}" # repository
if len(parts) > 2:
if parts[2] in ["issues", "pull-requests"]:
itemnumber = parts[-1]
text += f"#{itemnumber}" # element number

elif self.platform == "gitlab":
# cp. https://docs.gitlab.com/ee/user/markdown.html#gitlab-specific-references
if "/-/" in path and any(
map(uri.path.__contains__, ["issues", "merge_requests"])
):
group_and_subgroups, parts, *_ = path.split("/-/")
parts = parts.rstrip("/")
if "/" not in parts:
text = f"{group_and_subgroups}/{parts}"
else:
parts = parts.split("/")
url_type, element_number, *_ = parts
if not element_number:
text = group_and_subgroups
elif url_type == "issues":
text = f"{group_and_subgroups}#{element_number}"
elif url_type == "merge_requests":
text = f"{group_and_subgroups}!{element_number}"
else:
# display the whole uri (after "gitlab.com/") including parameters
# for example "<group>/<subgroup1>/<subgroup2>/<repository>"
text = uri._replace(netloc="", scheme="") # remove platform
text = urlunparse(text)[1:] # combine to string and strip leading "/"

return text
platform = self.supported_platform.get(parsed_uri.netloc)
if platform is not None:
short = shorten_url(platform, uri)
if short != uri:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the code change that prevents adding the platform class (and thereby the icon) unless the link is actually shortened.

node.attributes["classes"].append(platform)
node.children[0] = nodes.Text(short)


def shorten_url(platform: str, url: str) -> str:
"""Parse the content of the path with respect to the selected platform.

Args:
platform: "github", "gitlab", "bitbucket", etc.
url: the full url to the platform content, beginning with https://

Returns:
short form version of the url,
or the full url if it could not shorten it
"""
if platform == "github":
return shorten_github(url)
elif platform == "bitbucket":
return shorten_bitbucket(url)
elif platform == "gitlab":
return shorten_gitlab(url)

return url


def shorten_github(url: str) -> str:
"""
Convert a GitHub URL to a short form like owner/repo#123 or
owner/repo@abc123.
"""
path = urlparse(url).path

# Pull request URL
if match := re.match(r"/([^/]+)/([^/]+)/pull/(\d+)", path):
owner, repo, pr_id = match.groups()
return f"{owner}/{repo}#{pr_id}"

# Issue URL
elif match := re.match(r"/([^/]+)/([^/]+)/issues/(\d+)", path):
owner, repo, issue_id = match.groups()
return f"{owner}/{repo}#{issue_id}"

# Commit URL
elif match := re.match(r"/([^/]+)/([^/]+)/commit/([a-f0-9]{7,40})", path):
owner, repo, commit_hash = match.groups()
return f"{owner}/{repo}@{commit_hash[:7]}"

# Branch URL
elif match := re.match(r"/([^/]+)/([^/]+)/tree/([^/]+)", path):
owner, repo, branch = match.groups()
return f"{owner}/{repo}:{unquote(branch)}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the colon here? Aren't (tips of) branches usually represented with @? For example pip install git+https://git.example.com/MyProject.git@master

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used AI to help write this and I thought that it would get the short patterns correct.

Looking into this some more, it looks like this also goes above and beyond what GitHub supports. I'm not sure we should do that.

Maybe for GitHub, I should change this code to only support URLs to PRs, issues and commits.


# Tag URL
elif match := re.match(r"/([^/]+)/([^/]+)/releases/tag/([^/]+)", path):
owner, repo, tag = match.groups()
return f"{owner}/{repo}@{unquote(tag)}"

# File URL
elif match := re.match(r"/([^/]+)/([^/]+)/blob/([^/]+)/(.*)", path):
owner, repo, ref, filepath = match.groups()
return f"{owner}/{repo}@{ref}/{unquote(filepath)}"

# No match — return the original URL
return url


def shorten_gitlab(url: str) -> str:
"""
Convert a GitLab URL to a short form like group/project!123 or
group/project@abcdef7.

Supports both canonical ('/-/') and non-canonical path formats.
"""
path = urlparse(url).path

# Merge requests
if (m := re.match(r"^/(.+)/([^/]+)/-/merge_requests/(\d+)$", path)) or (
m := re.match(r"^/(.+)/([^/]+)/merge_requests/(\d+)$", path)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the second pattern can be omitted if you insert a (?:/-)? (optional non-capturing group containing literal /-) in place of the /- in the first pattern.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would the full regex look? Like this?

r"^/(.+)/([^/]+)(?:/-)?/merge_requests/(\d+)$"

I don't think that works because what happens is if you try to match that regex on a URL path like:

/gitlab-org/gitlab/-/merge_requests/123

You get:

>>> m = re.match(r"^/(.+)/([^/]+)(?:/-)?/merge_requests/(\d+)$", "/gitlab-org/gitlab/-/merge_requests/123")
>>> m.groups()
('gitlab-org/gitlab', '-', '123')

It also doesn't work if you try to make the patterns in the capturing groups lazy:

>>> m = re.match(r"^/(.+?)/([^/]+?)/merge_requests/(\d+)$", "/gitlab-org/gitlab/-/merge_requests/123")
>>> m.groups()
('gitlab-org/gitlab', '-', '123')

I fiddled with this several times until I just gave up and decided it's probably easier to write two separate regexes (one for canonical, one for non-canonical GitLab URLs) than to try to combine them into one.

):
namespace, project, mr_id = m.groups()
return f"{namespace}/{project}!{mr_id}"

# Issues
if (m := re.match(r"^/(.+)/([^/]+)/-/issues/(\d+)$", path)) or (
m := re.match(r"^/(.+)/([^/]+)/issues/(\d+)$", path)
):
namespace, project, issue_id = m.groups()
return f"{namespace}/{project}#{issue_id}"

# Commits
if (m := re.match(r"^/(.+)/([^/]+)/-/commit/([a-fA-F0-9]+)$", path)) or (
m := re.match(r"^/(.+)/([^/]+)/commit/([a-fA-F0-9]+)$", path)
):
namespace, project, commit_hash = m.groups()
return f"{namespace}/{project}@{commit_hash[:7]}"

# Branches (tree)
if (m := re.match(r"^https://gitlab\.com/(.+)/([^/]+)/-/tree/(.+)$", path)) or (
m := re.match(r"^https://gitlab\.com/(.+)/([^/]+)/tree/(.+)$", path)
):
namespace, project, branch = m.groups()
return f"{namespace}/{project}:{unquote(branch)}"

# Tags
if (m := re.match(r"^/(.+)/([^/]+)/-/tags/(.+)$", path)) or (
m := re.match(r"^/(.+)/([^/]+)/tags/(.+)$", path)
):
namespace, project, tag = m.groups()
return f"{namespace}/{project}@{unquote(tag)}"

# Blob (files)
if (m := re.match(r"^/(.+)/([^/]+)/-/blob/([^/]+)/(.+)$", path)) or (
m := re.match(r"^/(.+)/([^/]+)/blob/([^/]+)/(.+)$", path)
):
namespace, project, ref, path = m.groups()
return f"{namespace}/{project}@{ref}/{unquote(path)}"

# No match — return the original URL
return url


def shorten_bitbucket(url: str) -> str:
"""
Convert a Bitbucket URL to a short form like team/repo#123 or
team/repo@main.
"""
path = urlparse(url).path

# Pull request URL
if match := re.match(r"/([^/]+)/([^/]+)/pull-requests/(\d+)", path):
workspace, repo, pr_id = match.groups()
return f"{workspace}/{repo}#{pr_id}"

# Issue URL
elif match := re.match(r"/([^/]+)/([^/]+)/issues/(\d+)", path):
workspace, repo, issue_id = match.groups()
return f"{workspace}/{repo}!{issue_id}"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is the ! standard for bitbucket issues? above we use # for both PRs and issues in the GitHub/GitLab shorteners.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Honestly, I have no clue about Bitbucket. Never used it.


# Commit URL
elif match := re.match(r"/([^/]+)/([^/]+)/commits/([a-f0-9]+)", path):
workspace, repo, commit_hash = match.groups()
return f"{workspace}/{repo}@{commit_hash[:7]}"

# Branch URL
elif match := re.match(r"/([^/]+)/([^/]+)/branch/(.+)", path):
workspace, repo, branch = match.groups()
return f"{workspace}/{repo}:{unquote(branch)}"

# Tag URL
elif match := re.match(r"/([^/]+)/([^/]+)/commits/tag/(.+)", path):
workspace, repo, tag = match.groups()
return f"{workspace}/{repo}@{unquote(tag)}"

# File URL
elif match := re.match(r"/([^/]+)/([^/]+)/src/([^/]+)/(.*)", path):
workspace, repo, ref, path = match.groups()
return f"{workspace}/{repo}@{ref}/{unquote(path)}"

# No match — return the original URL
return url
12 changes: 12 additions & 0 deletions tests/sites/base/page1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@ Page 1
https://github.com/pydata/pydata-sphinx-theme/pull/1012
https://github.com/orgs/pydata/projects/2

http will get shortened:

http://github.com/pydata/pydata-sphinx-theme/pull/1012

www will not get shortened:

https://www.github.com/pydata/pydata-sphinx-theme/pull/1012

will not be linkified:

github.com/pydata/pydata-sphinx-theme/pull/1012

**GitLab**

.. container:: gitlab-container
Expand Down
12 changes: 6 additions & 6 deletions tests/test_build/bitbucket_links.html
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
<div class="bitbucket-container docutils container">
<p>
<a class="bitbucket reference external" href="https://bitbucket.org">
bitbucket
<a class="reference external" href="https://bitbucket.org">
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we still add the bitbucket class, which adds the BitBucket icon before the link, even if we do not shorten the link?

My sense is that only shortened links should get the icon.

Copy link
Member

@drammock drammock Jun 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is could be slightly shortened, as it omits by omitting the https:// part. If we did so and I had to pick I would include the logo but I don't really care.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that could be nice follow-on work

https://bitbucket.org
</a>
<a class="bitbucket reference external" href="https://bitbucket.org/atlassian/workspace/overview">
atlassian
<a class="reference external" href="https://bitbucket.org/atlassian/workspace/overview">
https://bitbucket.org/atlassian/workspace/overview
</a>
<a class="bitbucket reference external" href="https://bitbucket.org/atlassian/aui">
atlassian/aui
<a class="reference external" href="https://bitbucket.org/atlassian/aui">
https://bitbucket.org/atlassian/aui
</a>
<a class="bitbucket reference external" href="https://bitbucket.org/atlassian/aui/pull-requests/4758">
atlassian/aui#4758
Expand Down
38 changes: 30 additions & 8 deletions tests/test_build/github_links.html
Original file line number Diff line number Diff line change
@@ -1,19 +1,41 @@
<div class="github-container docutils container">
<p>
<a class="github reference external" href="https://github.com">
github
<a class="reference external" href="https://github.com">
https://github.com
</a>
<a class="github reference external" href="https://github.com/pydata">
pydata
<a class="reference external" href="https://github.com/pydata">
https://github.com/pydata
</a>
<a class="github reference external" href="https://github.com/pydata/pydata-sphinx-theme">
pydata/pydata-sphinx-theme
<a class="reference external" href="https://github.com/pydata/pydata-sphinx-theme">
https://github.com/pydata/pydata-sphinx-theme
</a>
<a class="github reference external" href="https://github.com/pydata/pydata-sphinx-theme/pull/1012">
pydata/pydata-sphinx-theme#1012
</a>
<a class="github reference external" href="https://github.com/orgs/pydata/projects/2">
pydata/projects#2
<a class="reference external" href="https://github.com/orgs/pydata/projects/2">
https://github.com/orgs/pydata/projects/2
</a>
</p>
<p>
http will get shortened:
</p>
<p>
<a class="github reference external" href="http://github.com/pydata/pydata-sphinx-theme/pull/1012">
pydata/pydata-sphinx-theme#1012
</a>
</p>
<p>
www will not get shortened:
</p>
<p>
<a class="reference external" href="https://www.github.com/pydata/pydata-sphinx-theme/pull/1012">
https://www.github.com/pydata/pydata-sphinx-theme/pull/1012
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if we should support www-prefixed URLs or not.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the argument for not supporting it? Is it hard to make it work?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my head, the argument against is just to nudge the documentation writers to use canonical, rather than non-canonical URLs. By that reasoning, I should remove support for the non-canonical GitLab URLs too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add support for non-canonical forms; in the long run I think it will save us time. If we don't, it seems almost certain that user complaints will arise. Can be a follow-up PR

</a>
</p>
<p>
will not be linkified:
</p>
<p>
github.com/pydata/pydata-sphinx-theme/pull/1012
</p>
</div>
Loading