Skip to content

Commit a211198

Browse files
authored
Merge pull request #1393 from minrk/launchmessages
Emit launch events
2 parents a8e3444 + 3af0092 commit a211198

File tree

5 files changed

+136
-28
lines changed

5 files changed

+136
-28
lines changed

binderhub/builder.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -582,17 +582,30 @@ async def launch(self, provider):
582582
username = launcher.unique_name_from_repo(self.repo_url)
583583
server_name = ''
584584
try:
585+
586+
async def handle_progress_event(event):
587+
message = event["message"]
588+
await self.emit(
589+
{
590+
"phase": "launching",
591+
"message": message + "\n",
592+
}
593+
)
594+
585595
extra_args = {
586596
'binder_ref_url': self.ref_url,
587597
'binder_launch_host': self.binder_launch_host,
588598
'binder_request': self.binder_request,
589599
'binder_persistent_request': self.binder_persistent_request,
590600
}
591-
server_info = await launcher.launch(image=self.image_name,
592-
username=username,
593-
server_name=server_name,
594-
repo_url=self.repo_url,
595-
extra_args=extra_args)
601+
server_info = await launcher.launch(
602+
image=self.image_name,
603+
username=username,
604+
server_name=server_name,
605+
repo_url=self.repo_url,
606+
extra_args=extra_args,
607+
event_callback=handle_progress_event,
608+
)
596609
except Exception as e:
597610
duration = time.perf_counter() - launch_starttime
598611
if i + 1 == launcher.retries:

binderhub/launcher.py

Lines changed: 101 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Launch an image with a temporary user via JupyterHub
33
"""
4+
import asyncio
45
import base64
56
import json
67
import random
@@ -9,6 +10,7 @@
910
from urllib.parse import urlparse, quote
1011
import uuid
1112
import os
13+
from datetime import timedelta
1214

1315
from tornado.log import app_log
1416
from tornado import web, gen
@@ -18,6 +20,8 @@
1820
from jupyterhub.traitlets import Callable
1921
from jupyterhub.utils import maybe_future
2022

23+
from .utils import url_path_join
24+
2125
# pattern for checking if it's an ssh repo and not a URL
2226
# used only after verifying that `://` is not present
2327
_ssh_repo_pat = re.compile(r'.*@.*\:')
@@ -80,6 +84,13 @@ def _default_hub_url_local(self):
8084
Receives 5 parameters: launcher, image, username, server_name, repo_url
8185
"""
8286
)
87+
launch_timeout = Integer(
88+
600,
89+
config=True,
90+
help="""
91+
Wait this many seconds until server is ready, raise TimeoutError otherwise.
92+
""",
93+
)
8394

8495
async def api_request(self, url, *args, **kwargs):
8596
"""Make an API request to JupyterHub"""
@@ -148,7 +159,15 @@ def unique_name_from_repo(self, repo_url):
148159
# add a random suffix to avoid collisions for users on the same image
149160
return '{}-{}'.format(prefix, ''.join(random.choices(SUFFIX_CHARS, k=SUFFIX_LENGTH)))
150161

151-
async def launch(self, image, username, server_name='', repo_url='', extra_args=None):
162+
async def launch(
163+
self,
164+
image,
165+
username,
166+
server_name="",
167+
repo_url="",
168+
extra_args=None,
169+
event_callback=None,
170+
):
152171
"""Launch a server for a given image
153172
154173
- creates a temporary user on the Hub if authentication is not enabled
@@ -215,38 +234,100 @@ async def launch(self, image, username, server_name='', repo_url='', extra_args=
215234

216235
# start server
217236
app_log.info(f"Starting server{_server_name} for user {username} with image {image}")
237+
ready_event_future = asyncio.Future()
238+
239+
def _cancel_ready_event(f=None):
240+
if not ready_event_future.done():
241+
if f and f.exception():
242+
ready_event_future.set_exception(f.exception())
243+
else:
244+
ready_event_future.cancel()
218245
try:
219246
resp = await self.api_request(
220247
'users/{}/servers/{}'.format(escaped_username, server_name),
221248
method='POST',
222249
body=json.dumps(data).encode('utf8'),
223250
)
224-
if resp.code == 202:
225-
# Server hasn't actually started yet
226-
# We wait for it!
227-
# NOTE: This ends up being about ten minutes
228-
for i in range(64):
229-
user_data = await self.get_user_data(escaped_username)
230-
if user_data['servers'][server_name]['ready']:
231-
break
232-
if not user_data['servers'][server_name]['pending']:
233-
raise web.HTTPError(500, "Image %s for user %s failed to launch" % (image, username))
234-
# FIXME: make this configurable
235-
# FIXME: Measure how long it takes for servers to start
236-
# and tune this appropriately
237-
await gen.sleep(min(1.4 ** i, 10))
238-
else:
239-
raise web.HTTPError(500, f"Image {image} for user {username} took too long to launch")
251+
# listen for pending spawn (launch) events until server is ready
252+
# do this even if previous request finished!
253+
buffer_list = []
254+
255+
async def handle_chunk(chunk):
256+
lines = b"".join(buffer_list + [chunk]).split(b"\n\n")
257+
# the last item in the list is usually an empty line ('')
258+
# but it can be the partial line after the last `\n\n`,
259+
# so put it back on the buffer to handle with the next chunk
260+
buffer_list[:] = [lines[-1]]
261+
for line in lines[:-1]:
262+
if line:
263+
line = line.decode("utf8", "replace")
264+
if line and line.startswith("data:"):
265+
event = json.loads(line.split(":", 1)[1])
266+
if event_callback:
267+
await event_callback(event)
268+
269+
# stream ends when server is ready or fails
270+
if event.get("ready", False):
271+
if not ready_event_future.done():
272+
ready_event_future.set_result(event)
273+
elif event.get("failed", False):
274+
if not ready_event_future.done():
275+
ready_event_future.set_exception(
276+
web.HTTPError(
277+
500, event.get("message", "unknown error")
278+
)
279+
)
280+
281+
url_parts = ["users", username]
282+
if server_name:
283+
url_parts.extend(["servers", server_name, "progress"])
284+
else:
285+
url_parts.extend(["server/progress"])
286+
progress_api_url = url_path_join(*url_parts)
287+
self.log.debug("Requesting progress for {username}: {progress_api_url}")
288+
resp_future = self.api_request(
289+
progress_api_url,
290+
streaming_callback=lambda chunk: asyncio.ensure_future(
291+
handle_chunk(chunk)
292+
),
293+
request_timeout=self.launch_timeout,
294+
)
295+
try:
296+
await gen.with_timeout(
297+
timedelta(seconds=self.launch_timeout), resp_future
298+
)
299+
except (gen.TimeoutError, TimeoutError):
300+
_cancel_ready_event()
301+
raise web.HTTPError(
302+
500,
303+
f"Image {image} for user {username} took too long to launch",
304+
)
240305

241306
except HTTPError as e:
307+
_cancel_ready_event()
242308
if e.response:
243309
body = e.response.body
244310
else:
245311
body = ''
246312

247-
app_log.error("Error starting server{} for user {}: {}\n{}".
248-
format(_server_name, username, e, body))
313+
app_log.error(
314+
f"Error starting server{_server_name} for user {username}: {e}\n{body}"
315+
)
249316
raise web.HTTPError(500, f"Failed to launch image {image}")
317+
except Exception:
318+
_cancel_ready_event()
319+
raise
320+
321+
# verify that the server is running!
322+
try:
323+
# this should already be done, but it's async so wait a finite time
324+
ready_event = await gen.with_timeout(
325+
timedelta(seconds=5), ready_event_future
326+
)
327+
except (gen.TimeoutError, TimeoutError):
328+
raise web.HTTPError(
329+
500, f"Image {image} for user {username} failed to launch"
330+
)
250331

251-
data['url'] = self.hub_url + 'user/%s/%s' % (escaped_username, server_name)
332+
data["url"] = url_path_join(self.hub_url, ready_event["url"])
252333
return data

binderhub/tests/test_build.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,21 +48,29 @@ async def test_build(app, needs_build, needs_launch, always_build, slug, pytestc
4848
r = await async_requests.get(build_url, stream=True)
4949
r.raise_for_status()
5050
events = []
51+
launch_events = 0
5152
async for line in async_requests.iter_lines(r):
5253
line = line.decode('utf8', 'replace')
5354
if line.startswith('data:'):
5455
event = json.loads(line.split(':', 1)[1])
5556
events.append(event)
5657
assert 'message' in event
57-
sys.stdout.write(event['message'])
58+
sys.stdout.write(f"{event.get('phase', '')}: {event['message']}")
5859
# this is the signal that everything is ready, pod is launched
5960
# and server is up inside the pod. Break out of the loop now
6061
# because BinderHub keeps the connection open for many seconds
6162
# after to avoid "reconnects" from slow clients
6263
if event.get('phase') == 'ready':
6364
r.close()
6465
break
65-
66+
if event.get("phase") == "launching" and not event["message"].startswith(
67+
("Launching server...", "Launch attempt ")
68+
):
69+
# skip standard launching events of builder
70+
# we are interested in launching events from spawner
71+
launch_events += 1
72+
73+
assert launch_events > 0
6674
final = events[-1]
6775
assert 'phase' in final
6876
assert final['phase'] == 'ready'

helm-chart/binderhub/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ jupyterhub:
211211
212212
# launch the notebook server
213213
os.execvp("jupyter-notebook", sys.argv)
214-
events: false
214+
events: true
215215
storage:
216216
type: none
217217
memory:

testing/local-binder-local-hub/jupyterhub_config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,9 @@ class LocalContainerSpawner(BinderSpawnerMixin, DockerSpawner):
4444
"environment": {"JUPYTERHUB_EXTERNAL_URL": os.getenv("JUPYTERHUB_EXTERNAL_URL", "")}
4545
}]
4646
c.JupyterHub.default_url = f"/services/{binderhub_service_name}/"
47+
48+
c.JupyterHub.tornado_settings = {
49+
"slow_spawn_timeout": 0,
50+
}
51+
52+
c.KubeSpawner.events_enabled = True

0 commit comments

Comments
 (0)