Skip to content

Commit 97c769e

Browse files
committed
Rework internal navigation to prevent deadlocking
The mix of sync and async HTTP requests requires care to avoid deadlocks. Previously, it was possible for async requests to use up all available HTTP state objects duration a navigation flow (either directly, or via an internal redirect (e.g. click, submit, ...)). This would block the navigation, which, because everything is single thread, would block the I/O loop, resulting in a deadlock. The correct solution seems to be to remove all synchronous I/O. And I tried to do that, but I ran into a wall with module-loading, which is initiated from V8. V8 says "give me the source for this module", and I don't see a great way to tell it: wait a bit. So I went back to trying to make this work with the hybrid model, despite last weeks failures to get it to work. I changed two things: 1 - The http client will only directly initiate an async request if there's at least 2 free state objects available (1 for the request, and leaving 1 free for any synchronous requests) 2 - Delayed navigation retries until there's at least 1 free http state object available. Commits from last week did help with this. First, we're now guaranteed to have a single sync-request at a time (previously, we could have had 2). Secondly, the async connection is now async end-to-end (previously, it could have blocked on an empty state pool). We could probably make this a bit more obviously by reserving 1 state object for synchronous requests. But, since the long term solution is probably having no synchronous requests, I'm happy with anything that lets me move past this issue.
1 parent 0de33b3 commit 97c769e

File tree

5 files changed

+124
-46
lines changed

5 files changed

+124
-46
lines changed

src/browser/page.zig

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,12 @@ pub const Page = struct {
192192
const session = self.session;
193193
const notification = session.browser.notification;
194194

195-
log.debug(.http, "navigate", .{ .url = request_url, .reason = opts.reason });
195+
log.debug(.http, "navigate", .{
196+
.url = request_url,
197+
.method = opts.method,
198+
.reason = opts.reason,
199+
.body = opts.body != null,
200+
});
196201

197202
// if the url is about:blank, nothing to do.
198203
if (std.mem.eql(u8, "about:blank", request_url.raw)) {
@@ -685,6 +690,8 @@ pub const Page = struct {
685690
}
686691
}
687692

693+
// We cannot navigate immediately as navigating will delete the DOM tree,
694+
// which holds this event's node.
688695
// As such we schedule the function to be called as soon as possible.
689696
// The page.arena is safe to use here, but the transfer_arena exists
690697
// specifically for this type of lifetime.
@@ -699,7 +706,7 @@ pub const Page = struct {
699706
navi.* = .{
700707
.opts = opts,
701708
.session = self.session,
702-
.url = try arena.dupe(u8, url),
709+
.url = try self.url.resolve(arena, url),
703710
};
704711
_ = try self.loop.timeout(0, &navi.navigate_node);
705712
}
@@ -787,15 +794,77 @@ pub const Page = struct {
787794
};
788795

789796
const DelayedNavigation = struct {
790-
url: []const u8,
797+
url: URL,
791798
session: *Session,
792799
opts: NavigateOpts,
800+
initial: bool = true,
793801
navigate_node: Loop.CallbackNode = .{ .func = delayNavigate },
794802

803+
// Navigation is blocking, which is problem because it can seize up
804+
// the loop and deadlock. We can only safely try to navigate to a
805+
// new page when we're sure there's at least 1 free slot in the
806+
// http client. We handle this in two phases:
807+
//
808+
// In the first phase, when self.initial == true, we'll shutdown the page
809+
// and create a new one. The shutdown is important, because it resets the
810+
// loop ctx_id and closes the scope. Closing the scope calls our XHR
811+
// destructors which aborts requests. This is necessary to make sure our
812+
// [blocking] navigate won't block.
813+
//
814+
// In the 2nd phase, we wait until there's a free http slot so that our
815+
// navigate definetly won't block (which could deadlock the system if there
816+
// are still pending async requests, which we've seen happen, even after
817+
// an abort).
795818
fn delayNavigate(node: *Loop.CallbackNode, repeat_delay: *?u63) void {
796-
_ = repeat_delay;
797819
const self: *DelayedNavigation = @fieldParentPtr("navigate_node", node);
798-
self.session.pageNavigate(self.url, self.opts) catch |err| {
820+
821+
const session = self.session;
822+
const initial = self.initial;
823+
824+
if (initial) {
825+
session.removePage();
826+
_ = session.createPage() catch |err| {
827+
log.err(.browser, "delayed navigation page error", .{
828+
.err = err,
829+
.url = self.url,
830+
});
831+
return;
832+
};
833+
self.initial = false;
834+
}
835+
836+
if (session.browser.http_client.freeSlotCount() == 0) {
837+
log.debug(.browser, "delayed navigate waiting", .{});
838+
const delay = 0 * std.time.ns_per_ms;
839+
840+
// If this isn't the initial check, we can safely re-use the timer
841+
// to check again.
842+
if (initial == false) {
843+
repeat_delay.* = delay;
844+
return;
845+
}
846+
847+
// However, if this _is_ the initial check, we called
848+
// session.removePage above, and that reset the loop ctx_id.
849+
// We can't re-use this timer, because it has the previous ctx_id.
850+
// We can create a new timeout though, and that'll get the new ctx_id.
851+
//
852+
// Page has to be not-null here because we called createPage above.
853+
_ = session.page.?.loop.timeout(delay, &self.navigate_node) catch |err| {
854+
log.err(.browser, "delayed navigation loop err", .{ .err = err });
855+
};
856+
return;
857+
}
858+
859+
const page = session.currentPage() orelse return;
860+
defer if (!page.delayed_navigation) {
861+
// If, while loading the page, we intend to navigate to another
862+
// page, then we need to keep the transfer_arena around, as this
863+
// sub-navigation is probably using it.
864+
_ = session.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
865+
};
866+
867+
return page.navigate(self.url, self.opts) catch |err| {
799868
log.err(.browser, "delayed navigation error", .{ .err = err, .url = self.url });
800869
};
801870
}

src/browser/session.zig

Lines changed: 3 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const Allocator = std.mem.Allocator;
2222

2323
const Env = @import("env.zig").Env;
2424
const Page = @import("page.zig").Page;
25+
const URL = @import("../url.zig").URL;
2526
const Browser = @import("browser.zig").Browser;
2627
const NavigateOpts = @import("page.zig").NavigateOpts;
2728

@@ -72,7 +73,7 @@ pub const Session = struct {
7273

7374
pub fn deinit(self: *Session) void {
7475
if (self.page != null) {
75-
self.removePage() catch {};
76+
self.removePage();
7677
}
7778
self.cookie_jar.deinit();
7879
self.storage_shed.deinit();
@@ -104,7 +105,7 @@ pub const Session = struct {
104105
return page;
105106
}
106107

107-
pub fn removePage(self: *Session) !void {
108+
pub fn removePage(self: *Session) void {
108109
// Inform CDP the page is going to be removed, allowing other worlds to remove themselves before the main one
109110
self.browser.notification.dispatch(.page_remove, .{});
110111

@@ -127,12 +128,6 @@ pub const Session = struct {
127128
// window.setTimeout and running microtasks should be ignored
128129
self.browser.app.loop.reset();
129130

130-
// Finally, we run the loop. Because of the reset just above, this will
131-
// ignore any timeouts. And, because of the endScope about this, it
132-
// should ensure that the http requests detect the shutdown socket and
133-
// release their resources.
134-
try self.browser.app.loop.run();
135-
136131
self.page = null;
137132

138133
// clear netsurf memory arena.
@@ -144,28 +139,4 @@ pub const Session = struct {
144139
pub fn currentPage(self: *Session) ?*Page {
145140
return &(self.page orelse return null);
146141
}
147-
148-
pub fn pageNavigate(self: *Session, url_string: []const u8, opts: NavigateOpts) !void {
149-
// currently, this is only called from the page, so let's hope
150-
// it isn't null!
151-
std.debug.assert(self.page != null);
152-
153-
defer if (self.page) |*p| {
154-
if (!p.delayed_navigation) {
155-
// If, while loading the page, we intend to navigate to another
156-
// page, then we need to keep the transfer_arena around, as this
157-
// sub-navigation is probably using it.
158-
_ = self.browser.transfer_arena.reset(.{ .retain_with_limit = 64 * 1024 });
159-
}
160-
};
161-
162-
// it's safe to use the transfer arena here, because the page will
163-
// eventually clone the URL using its own page_arena (after it gets
164-
// the final URL, possibly following redirects)
165-
const url = try self.page.?.url.resolve(self.transfer_arena, url_string);
166-
167-
try self.removePage();
168-
var page = try self.createPage();
169-
return page.navigate(url, opts);
170-
}
171142
};

src/browser/xhr/xhr.zig

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,10 +341,17 @@ pub const XMLHttpRequest = struct {
341341
log.debug(.script_event, "dispatch event", .{
342342
.type = typ,
343343
.source = "xhr",
344+
.method = self.method,
344345
.url = self.url,
345346
});
346347
self._dispatchEvt(typ) catch |err| {
347-
log.err(.app, "dispatch event error", .{ .err = err, .type = typ, .source = "xhr" });
348+
log.err(.app, "dispatch event error", .{
349+
.err = err,
350+
.type = typ,
351+
.source = "xhr",
352+
.method = self.method,
353+
.url = self.url,
354+
});
348355
};
349356
}
350357

@@ -365,10 +372,17 @@ pub const XMLHttpRequest = struct {
365372
log.debug(.script_event, "dispatch progress event", .{
366373
.type = typ,
367374
.source = "xhr",
375+
.method = self.method,
368376
.url = self.url,
369377
});
370378
self._dispatchProgressEvent(typ, opts) catch |err| {
371-
log.err(.app, "dispatch progress event error", .{ .err = err, .type = typ, .source = "xhr" });
379+
log.err(.app, "dispatch progress event error", .{
380+
.err = err,
381+
.type = typ,
382+
.source = "xhr",
383+
.method = self.method,
384+
.url = self.url,
385+
});
372386
};
373387
}
374388

src/cdp/domains/target.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ fn closeTarget(cmd: anytype) !void {
220220
bc.session_id = null;
221221
}
222222

223-
try bc.session.removePage();
223+
bc.session.removePage();
224224
if (bc.isolated_world) |*world| {
225225
world.deinit();
226226
bc.isolated_world = null;

src/http/client.zig

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,18 @@ pub const Client = struct {
113113
loop: *Loop,
114114
opts: RequestOpts,
115115
) !void {
116-
if (self.state_pool.acquireOrNull()) |state| {
117-
// if we have state ready, we can skip the loop and immediately
118-
// kick this request off.
119-
return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
116+
117+
// See the page's DelayedNavitation for why we're doing this. TL;DR -
118+
// we need to keep 1 slot available for the blocking page navigation flow
119+
// (Almost worth keeping a dedicate State just for that flow, but keep
120+
// thinking we need a more permanent solution (i.e. making everything
121+
// non-blocking).
122+
if (self.freeSlotCount() > 1) {
123+
if (self.state_pool.acquireOrNull()) |state| {
124+
// if we have state ready, we can skip the loop and immediately
125+
// kick this request off.
126+
return self.asyncRequestReady(method, uri, ctx, callback, state, opts);
127+
}
120128
}
121129

122130
// This cannot be a client-owned MemoryPool. The page can end before
@@ -174,6 +182,10 @@ pub const Client = struct {
174182
.client = self,
175183
};
176184
}
185+
186+
pub fn freeSlotCount(self: *Client) usize {
187+
return self.state_pool.freeSlotCount();
188+
}
177189
};
178190

179191
const RequestOpts = struct {
@@ -2531,6 +2543,12 @@ const StatePool = struct {
25312543
allocator.free(self.states);
25322544
}
25332545

2546+
pub fn freeSlotCount(self: *StatePool) usize {
2547+
self.mutex.lock();
2548+
defer self.mutex.unlock();
2549+
return self.available;
2550+
}
2551+
25342552
pub fn acquireWait(self: *StatePool) *State {
25352553
const states = self.states;
25362554

@@ -3022,8 +3040,14 @@ test "HttpClient: async connect error" {
30223040
.{},
30233041
);
30243042

3025-
try loop.io.run_for_ns(std.time.ns_per_ms);
3026-
try reset.timedWait(std.time.ns_per_s);
3043+
for (0..10) |_| {
3044+
try loop.io.run_for_ns(std.time.ns_per_ms * 10);
3045+
if (reset.isSet()) {
3046+
break;
3047+
}
3048+
} else {
3049+
return error.Timeout;
3050+
}
30273051
}
30283052

30293053
test "HttpClient: async no body" {

0 commit comments

Comments
 (0)