Skip to content

Commit b81d913

Browse files
authored
Merge pull request #1135 from betatim/fix-unescaped-ref
[MRG] Fix escaping of non SHA1 refs
2 parents 030b435 + a9fe7d2 commit b81d913

File tree

3 files changed

+84
-59
lines changed

3 files changed

+84
-59
lines changed

binderhub/builder.py

Lines changed: 57 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,61 @@
5454
LAUNCHES_INPROGRESS = Gauge('binderhub_inprogress_launches', 'Launches currently in progress')
5555

5656

57+
def _generate_build_name(build_slug, ref, prefix='', limit=63, ref_length=6):
58+
"""Generate a unique build name with a limited character length.
59+
60+
Guaranteed (to acceptable level) to be unique for a given user, repo,
61+
and ref.
62+
63+
We really, *really* care that we always end up with the same
64+
'build_name' for a particular repo + ref, but the default max
65+
character limit for build names is 63. To meet this constraint, we
66+
include a prefixed hash of the user / repo in all build names and do
67+
some length limiting :)
68+
69+
Note that ``build`` names only need to be unique over a shorter period
70+
of time, while ``image`` names need to be unique for longer. Hence,
71+
different strategies are used.
72+
73+
We also ensure that the returned value is DNS safe, by only using
74+
ascii lowercase + digits. everything else is escaped
75+
"""
76+
# escape parts that came from providers (build slug, ref)
77+
# build names are case-insensitive `.lower()` is called at the end
78+
build_slug = _safe_build_slug(build_slug, limit=limit - len(prefix) - ref_length - 1)
79+
ref = _safe_build_slug(ref, limit=ref_length, hash_length=2)
80+
81+
return '{prefix}{safe_slug}-{ref}'.format(
82+
prefix=prefix,
83+
safe_slug=build_slug,
84+
ref=ref[:ref_length],
85+
).lower()
86+
87+
88+
def _safe_build_slug(build_slug, limit, hash_length=6):
89+
"""Create a unique-ish name from a slug.
90+
91+
This function catches a bug where a build slug may not produce a valid
92+
image name (e.g. arepo name ending with _, which results in image name
93+
ending with '-' which is invalid). This ensures that the image name is
94+
always safe, regardless of build slugs returned by providers
95+
(rather than requiring all providers to return image-safe build slugs
96+
below a certain length).
97+
98+
Since this changes the image name generation scheme, all existing cached
99+
images will be invalidated.
100+
"""
101+
build_slug_hash = hashlib.sha256(build_slug.encode('utf-8')).hexdigest()
102+
safe_chars = set(string.ascii_letters + string.digits)
103+
def escape(s):
104+
return escapism.escape(s, safe=safe_chars, escape_char='-')
105+
build_slug = escape(build_slug)
106+
return '{name}-{hash}'.format(
107+
name=build_slug[:limit - hash_length - 1],
108+
hash=build_slug_hash[:hash_length],
109+
).lower()
110+
111+
57112
class BuildHandler(BaseHandler):
58113
"""A handler for working with GitHub."""
59114

@@ -125,63 +180,6 @@ def initialize(self):
125180

126181
self.event_log = self.settings['event_log']
127182

128-
def _generate_build_name(self, build_slug, ref, prefix='', limit=63, ref_length=6):
129-
"""
130-
Generate a unique build name with a limited character length..
131-
132-
Guaranteed (to acceptable level) to be unique for a given user, repo,
133-
and ref.
134-
135-
We really, *really* care that we always end up with the same
136-
'build_name' for a particular repo + ref, but the default max
137-
character limit for build names is 63. To meet this constraint, we
138-
include a prefixed hash of the user / repo in all build names and do
139-
some length limiting :)
140-
141-
Note that ``build`` names only need to be unique over a shorter period
142-
of time, while ``image`` names need to be unique for longer. Hence,
143-
different strategies are used.
144-
145-
We also ensure that the returned value is DNS safe, by only using
146-
ascii lowercase + digits. everything else is escaped
147-
"""
148-
149-
# escape parts that came from providers (build slug, ref)
150-
# only build_slug *really* needs this (refs should be sha1 hashes)
151-
# build names are case-insensitive because ascii_letters are allowed,
152-
# and `.lower()` is called at the end
153-
safe_chars = set(string.ascii_letters + string.digits)
154-
def escape(s):
155-
return escapism.escape(s, safe=safe_chars, escape_char='-')
156-
157-
build_slug = self._safe_build_slug(build_slug, limit=limit - len(prefix) - ref_length - 1)
158-
ref = escape(ref)
159-
160-
return '{prefix}{safe_slug}-{ref}'.format(
161-
prefix=prefix,
162-
safe_slug=build_slug,
163-
ref=ref[:ref_length],
164-
).lower()
165-
166-
def _safe_build_slug(self, build_slug, limit, hash_length=6):
167-
"""
168-
This function catches a bug where build slug may not produce a valid image name
169-
(e.g. repo name ending with _, which results in image name ending with '-' which is invalid).
170-
This ensures that the image name is always safe, regardless of build slugs returned by providers
171-
(rather than requiring all providers to return image-safe build slugs below a certain length).
172-
Since this changes the image name generation scheme, all existing cached images will be invalidated.
173-
"""
174-
build_slug_hash = hashlib.sha256(build_slug.encode('utf-8')).hexdigest()
175-
safe_chars = set(string.ascii_letters + string.digits)
176-
def escape(s):
177-
return escapism.escape(s, safe=safe_chars, escape_char='-')
178-
build_slug = escape(build_slug)
179-
return '{name}-{hash}'.format(
180-
name=build_slug[:limit - hash_length - 1],
181-
hash=build_slug_hash[:hash_length],
182-
).lower()
183-
184-
185183
async def fail(self, message):
186184
await self.emit({
187185
'phase': 'failed',
@@ -280,9 +278,9 @@ async def get(self, provider_prefix, _unescaped_spec):
280278
image_prefix = self.settings['image_prefix']
281279

282280
# Enforces max 255 characters before image
283-
safe_build_slug = self._safe_build_slug(provider.get_build_slug(), limit=255 - len(image_prefix))
281+
safe_build_slug = _safe_build_slug(provider.get_build_slug(), limit=255 - len(image_prefix))
284282

285-
build_name = self._generate_build_name(provider.get_build_slug(), ref, prefix='build-')
283+
build_name = _generate_build_name(provider.get_build_slug(), ref, prefix='build-')
286284

287285
image_name = self.image_name = '{prefix}{build_slug}:{ref}'.format(
288286
prefix=image_prefix,

binderhub/tests/test_builder.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import pytest
2+
3+
from binderhub.builder import _generate_build_name
4+
5+
6+
@pytest.mark.parametrize('ref,build_slug', [
7+
# a long ref, no special characters at critical positions
8+
('3035124.v3.0', 'dataverse-dvn-2ftjclkp'),
9+
# with ref_length=6 this has a full stop that gets escaped to a -
10+
# as the last character, this used to cause an error
11+
('20460.v1.0', 'dataverse-s6-2fde95rt'),
12+
# short ref, should just work and need no special processing
13+
('123456', 'dataverse-s6-2fde95rt')
14+
])
15+
def test_build_name(build_slug, ref):
16+
# Build names have to be usable as pod names, which means they have to
17+
# be usable as hostnames as well.
18+
build_name = _generate_build_name(build_slug, ref)
19+
20+
last_char = build_name[-1]
21+
assert last_char not in ("-", "_", ".")

binderhub/tests/test_repoproviders.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,18 @@ async def test_hydroshare_doi():
134134
resolved_spec = await provider.get_resolved_spec()
135135
assert resolved_spec == repo_url
136136

137+
137138
@pytest.mark.parametrize('spec,resolved_spec,resolved_ref,resolved_ref_url,build_slug', [
138139
['10.7910/DVN/TJCLKP',
139140
'10.7910/DVN/TJCLKP',
140141
'3035124.v3.0',
141142
'https://doi.org/10.7910/DVN/TJCLKP',
142143
'dataverse-dvn-2ftjclkp'],
144+
['10.25346/S6/DE95RT',
145+
'10.25346/S6/DE95RT',
146+
'20460.v1.0',
147+
'https://doi.org/10.25346/S6/DE95RT',
148+
'dataverse-s6-2fde95rt']
143149
])
144150
async def test_dataverse(spec, resolved_spec, resolved_ref, resolved_ref_url, build_slug):
145151
provider = DataverseProvider(spec=spec)

0 commit comments

Comments
 (0)