Skip to content

Commit 91b1177

Browse files
committed
Rework internal navigation to prevent deadlocking
The mix of sync and async HTTP requests requires care to avoid deadlocks. Previously, it was possible for async requests to use up all available HTTP state objects duration a navigation flow (either directly, or via an internal redirect (e.g. click, submit, ...)). This would block the navigation, which, because everything is single thread, would block the I/O loop, resulting in a deadlock. The correct solution seems to be to remove all synchronous I/O. And I tried to do that, but I ran into a wall with module-loading, which is initiated from V8. V8 says "give me the source for this module", and I don't see a great way to tell it: wait a bit. So I went back to trying to make this work with the hybrid model, despite last weeks failures to get it to work. I changed two things: 1 - The http client will only directly initiate an async request if there's at least 2 free state objects available (1 for the request, and leaving 1 free for any synchronous requests) 2 - Delayed navigation retries until there's at least 1 free http state object available. Commits from last week did help with this. First, we're now guaranteed to have a single sync-request at a time (previously, we could have had 2). Secondly, the async connection is now async end-to-end (previously, it could have blocked on an empty state pool). We could probably make this a bit more obviously by reserving 1 state object for synchronous requests. But, since the long term solution is probably having no synchronous requests, I'm happy with anything that lets me move past this issue.
1 parent f12e9b6 commit 91b1177

File tree

6 files changed

+134
-50
lines changed

6 files changed

+134
-50
lines changed

src/browser/page.zig

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,12 @@ pub const Page = struct {
190190
const session = self.session;
191191
const notification = session.browser.notification;
192192

193-
log.debug(.http, "navigate", .{ .url = request_url, .reason = opts.reason });
193+
log.debug(.http, "navigate", .{
194+
.url = request_url,
195+
.method = opts.method,
196+
.reason = opts.reason,
197+
.body = opts.body != null,
198+
});
194199

195200
// if the url is about:blank, nothing to do.
196201
if (std.mem.eql(u8, "about:blank", request_url.raw)) {
@@ -593,6 +598,8 @@ pub const Page = struct {
593598
}
594599
}
595600

601+
// We cannot navigate immediately as navigating will delete the DOM tree,
602+
// which holds this event's node.
596603
// As such we schedule the function to be called as soon as possible.
597604
// The page.arena is safe to use here, but the transfer_arena exists
598605
// specifically for this type of lifetime.
@@ -603,7 +610,7 @@ pub const Page = struct {
603610
navi.* = .{
604611
.opts = opts,
605612
.session = self.session,
606-
.url = try arena.dupe(u8, url),
613+
.url = try self.url.resolve(arena, url),
607614
};
608615
_ = try self.loop.timeout(0, &navi.navigate_node);
609616
}
@@ -692,15 +699,77 @@ pub const Page = struct {
692699
};
693700

694701
const DelayedNavigation = struct {
695-
url: []const u8,
702+
url: URL,
696703
session: *Session,
697704
opts: NavigateOpts,
705+
initial: bool = true,
698706
navigate_node: Loop.CallbackNode = .{ .func = delayNavigate },
699707

708+
// Navigation is blocking, which is problem because it can seize up
709+
// the loop and deadlock. We can only safely try to navigate to a
710+
// new page when we're sure there's at least 1 free slot in the
711+
// http client. We handle this in two phases:
712+
//
713+
// In the first phase, when self.initial == true, we'll shutdown the page
714+
// and create a new one. The shutdown is important, because it resets the
715+
// loop ctx_id and closes the scope. Closing the scope calls our XHR
716+
// destructors which aborts requests. This is necessary to make sure our
717+
// [blocking] navigate won't block.
718+
//
719+
// In the 2nd phase, we wait until there's a free http slot so that our
720+
// navigate definetly won't block (which could deadlock the system if there
721+
// are still pending async requests, which we've seen happen, even after
722+
// an abort).
700723
fn delayNavigate(node: *Loop.CallbackNode, repeat_delay: *?u63) void {
701-
_ = repeat_delay;
702724
const self: *DelayedNavigation = @fieldParentPtr("navigate_node", node);
703-
self.session.pageNavigate(self.url, self.opts) catch |err| {
725+
726+
const session = self.session;
727+
const initial = self.initial;
728+
729+
if (initial) {
730+
session.removePage();
731+
_ = session.createPage() catch |err| {
732+
log.err(.browser, "delayed navigation page error", .{
733+
.err = err,
734+
.url = self.url,
735+
});
736+
return;
737+
};
738+
self.initial = false;
739+
}
740+
741+
if (session.browser.http_client.freeSlotCount() == 0) {
742+
log.debug(.browser, "delayed navigate waiting", .{});
743+
const delay = 0 * std.time.ns_per_ms;
744+
745+
// If this isn't the initial check, we can safely re-use the timer
746+
// to check again.
747+
if (initial == false) {
748+
repeat_delay.* = delay;
749+
return;
750+
}
751+
752+
// However, if this _is_ the initial check, we called
753+
// session.removePage above, and that reset the loop ctx_id.
754+
// We can't re-use this timer, because it has the previous ctx_id.
755+
// We can create a new timeout though, and that'll get the new ctx_id.
756+
//
757+
// Page has to be not-null here because we called createPage above.
758+
_ = session.page.?.loop.timeout(delay, &self.navigate_node) catch |err| {
759+
log.err(.browser, "delayed navigation loop err", .{ .err = err });
760+
};
761+
return;
762+
}
763+
764+
const page = session.currentPage() orelse return;
765+
defer if (!page.delayed_navigation) {
766+
// If, while loading the page, we intend to navigate to another
767+
// page, then we need to keep the transfer_arena around, as this
768+
// sub-navigation is probably using it.
769+
_ = session.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
770+
};
771+
772+
return page.navigate(self.url, self.opts) catch |err| {
704773
log.err(.browser, "delayed navigation error", .{ .err = err, .url = self.url });
705774
};
706775
}

src/browser/session.zig

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const Allocator = std.mem.Allocator;
2222

2323
const Env = @import("env.zig").Env;
2424
const Page = @import("page.zig").Page;
25+
const URL = @import("../url.zig").URL;
2526
const Browser = @import("browser.zig").Browser;
2627
const NavigateOpts = @import("page.zig").NavigateOpts;
2728

@@ -72,7 +73,7 @@ pub const Session = struct {
7273

7374
pub fn deinit(self: *Session) void {
7475
if (self.page != null) {
75-
self.removePage() catch {};
76+
self.removePage();
7677
}
7778
self.cookie_jar.deinit();
7879
self.storage_shed.deinit();
@@ -104,7 +105,7 @@ pub const Session = struct {
104105
return page;
105106
}
106107

107-
pub fn removePage(self: *Session) !void {
108+
pub fn removePage(self: *Session) void {
108109
// Inform CDP the page is going to be removed, allowing other worlds to remove themselves before the main one
109110
self.browser.notification.dispatch(.page_remove, .{});
110111

@@ -127,12 +128,6 @@ pub const Session = struct {
127128
// window.setTimeout and running microtasks should be ignored
128129
self.browser.app.loop.reset();
129130

130-
// Finally, we run the loop. Because of the reset just above, this will
131-
// ignore any timeouts. And, because of the endScope about this, it
132-
// should ensure that the http requests detect the shutdown socket and
133-
// release their resources.
134-
try self.browser.app.loop.run();
135-
136131
self.page = null;
137132

138133
// clear netsurf memory arena.
@@ -144,28 +139,4 @@ pub const Session = struct {
144139
pub fn currentPage(self: *Session) ?*Page {
145140
return &(self.page orelse return null);
146141
}
147-
148-
pub fn pageNavigate(self: *Session, url_string: []const u8, opts: NavigateOpts) !void {
149-
// currently, this is only called from the page, so let's hope
150-
// it isn't null!
151-
std.debug.assert(self.page != null);
152-
153-
defer if (self.page) |*p| {
154-
if (!p.delayed_navigation) {
155-
// If, while loading the page, we intend to navigate to another
156-
// page, then we need to keep the transfer_arena around, as this
157-
// sub-navigation is probably using it.
158-
_ = self.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
159-
}
160-
};
161-
162-
// it's safe to use the transfer arena here, because the page will
163-
// eventually clone the URL using its own page_arena (after it gets
164-
// the final URL, possibly following redirects)
165-
const url = try self.page.?.url.resolve(self.transfer_arena, url_string);
166-
167-
try self.removePage();
168-
var page = try self.createPage();
169-
return page.navigate(url, opts);
170-
}
171142
};

src/browser/xhr/xhr.zig

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -338,9 +338,20 @@ pub const XMLHttpRequest = struct {
338338
// dispatch request event.
339339
// errors are logged only.
340340
fn dispatchEvt(self: *XMLHttpRequest, typ: []const u8) void {
341-
log.debug(.script_event, "dispatch event", .{ .type = typ, .source = "xhr" });
341+
log.debug(.script_event, "dispatch event", .{
342+
.type = typ,
343+
.source = "xhr",
344+
.method = self.method,
345+
.url = self.url,
346+
});
342347
self._dispatchEvt(typ) catch |err| {
343-
log.err(.app, "dispatch event error", .{ .err = err, .type = typ, .source = "xhr" });
348+
log.err(.app, "dispatch event error", .{
349+
.err = err,
350+
.type = typ,
351+
.source = "xhr",
352+
.method = self.method,
353+
.url = self.url,
354+
});
344355
};
345356
}
346357

@@ -358,9 +369,20 @@ pub const XMLHttpRequest = struct {
358369
typ: []const u8,
359370
opts: ProgressEvent.EventInit,
360371
) void {
361-
log.debug(.script_event, "dispatch progress event", .{ .type = typ, .source = "xhr" });
372+
log.debug(.script_event, "dispatch progress event", .{
373+
.type = typ,
374+
.source = "xhr",
375+
.method = self.method,
376+
.url = self.url,
377+
});
362378
self._dispatchProgressEvent(typ, opts) catch |err| {
363-
log.err(.app, "dispatch progress event error", .{ .err = err, .type = typ, .source = "xhr" });
379+
log.err(.app, "dispatch progress event error", .{
380+
.err = err,
381+
.type = typ,
382+
.source = "xhr",
383+
.method = self.method,
384+
.url = self.url,
385+
});
364386
};
365387
}
366388

src/cdp/domains/target.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ fn closeTarget(cmd: anytype) !void {
220220
bc.session_id = null;
221221
}
222222

223-
try bc.session.removePage();
223+
bc.session.removePage();
224224
if (bc.isolated_world) |*world| {
225225
world.deinit();
226226
bc.isolated_world = null;

src/http/client.zig

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,18 @@ pub const Client = struct {
113113
loop: *Loop,
114114
opts: RequestOpts,
115115
) !void {
116-
if (self.state_pool.acquireOrNull()) |state| {
117-
// if we have state ready, we can skip the loop and immediately
118-
// kick this request off.
119-
return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
116+
117+
// See the page's DelayedNavitation for why we're doing this. TL;DR -
118+
// we need to keep 1 slot available for the blocking page navigation flow
119+
// (Almost worth keeping a dedicate State just for that flow, but keep
120+
// thinking we need a more permanent solution (i.e. making everything
121+
// non-blocking).
122+
if (self.freeSlotCount() > 1) {
123+
if (self.state_pool.acquireOrNull()) |state| {
124+
// if we have state ready, we can skip the loop and immediately
125+
// kick this request off.
126+
return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
127+
}
120128
}
121129

122130
// This cannot be a client-owned MemoryPool. The page can end before
@@ -174,6 +182,10 @@ pub const Client = struct {
174182
.client = self,
175183
};
176184
}
185+
186+
pub fn freeSlotCount(self: *Client) usize {
187+
return self.state_pool.freeSlotCount();
188+
}
177189
};
178190

179191
const RequestOpts = struct {
@@ -2509,6 +2521,12 @@ const StatePool = struct {
25092521
allocator.free(self.states);
25102522
}
25112523

2524+
pub fn freeSlotCount(self: *StatePool) usize {
2525+
self.mutex.lock();
2526+
defer self.mutex.unlock();
2527+
return self.available;
2528+
}
2529+
25122530
pub fn acquireWait(self: *StatePool) *State {
25132531
const states = self.states;
25142532

@@ -3000,8 +3018,14 @@ test "HttpClient: async connect error" {
30003018
.{},
30013019
);
30023020

3003-
try loop.io.run_for_ns(std.time.ns_per_ms);
3004-
try reset.timedWait(std.time.ns_per_s);
3021+
for (0..10) |_| {
3022+
try loop.io.run_for_ns(std.time.ns_per_ms * 10);
3023+
if (reset.isSet()) {
3024+
break;
3025+
}
3026+
} else {
3027+
return error.Timeout;
3028+
}
30053029
}
30063030

30073031
test "HttpClient: async no body" {

src/runtime/loop.zig

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ pub const Loop = struct {
127127
}
128128
}
129129

130-
131130
// JS callbacks APIs
132131
// -----------------
133132

@@ -255,7 +254,6 @@ pub const Loop = struct {
255254
}
256255
}.onConnect;
257256

258-
259257
const callback = try self.event_callback_pool.create();
260258
errdefer self.event_callback_pool.destroy(callback);
261259
callback.* = .{ .loop = self, .ctx = ctx };

0 commit comments

Comments
 (0)