Skip to content

Commit 326851e

Browse files
markdown: first implementation
1 parent 5dcc3db commit 326851e

File tree

3 files changed

+305
-1
lines changed

3 files changed

+305
-1
lines changed

src/browser/markdown.zig

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
// Copyright (C) 2023-2025 Lightpanda (Selecy SAS)
2+
//
3+
// Francis Bouvier <[email protected]>
4+
// Pierre Tachoire <[email protected]>
5+
//
6+
// This program is free software: you can redistribute it and/or modify
7+
// it under the terms of the GNU Affero General Public License as
8+
// published by the Free Software Foundation, either version 3 of the
9+
// License, or (at your option) any later version.
10+
//
11+
// This program is distributed in the hope that it will be useful,
12+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
// GNU Affero General Public License for more details.
15+
//
16+
// You should have received a copy of the GNU Affero General Public License
17+
// along with this program. If not, see <https://www.gnu.org/licenses/>.
18+
19+
const std = @import("std");
20+
21+
const parser = @import("netsurf.zig");
22+
const Walker = @import("dom/walker.zig").WalkerChildren;
23+
24+
const NP = "\n\n";
25+
26+
// Converts the whole document to Markdown and writes it to `writer`,
// followed by a final line feed.
//
// `writer` only needs to expose `writeAll([]const u8)`: a std.io.Writer
// works, and so does a std.fs.File.
pub fn writeMarkdown(doc: *parser.Document, writer: anytype) !void {
    const root = parser.documentToNode(doc);
    // The output starts at the beginning of a paragraph; the final paragraph
    // state is of no use to the caller.
    _ = try writeChildren(root, true, writer);
    try writer.writeAll("\n");
}
31+
32+
// Writes every node yielded by the children walker for `root`, in order,
// threading the paragraph state from one node to the next.
//
// Returns the paragraph state left by the last node written, or `new_para`
// unchanged when the walker yields nothing.
fn writeChildren(root: *parser.Node, new_para: bool, writer: anytype) !bool {
    const walker = Walker{};
    var para_state = new_para;
    var current: ?*parser.Node = null;
    while (try walker.get_next(root, current)) |child| {
        para_state = try writeNode(child, para_state, writer);
        current = child;
    }
    return para_state;
}
42+
43+
// Returns the first child of `root` when that child is a text node,
// otherwise returns `root` itself (including when `root` has no child).
fn skipTextChild(root: *parser.Node) !*parser.Node {
    if (parser.nodeFirstChild(root)) |first| {
        if ((try parser.nodeType(first)) == .text) return first;
    }
    return root;
}
49+
50+
// Writes the Markdown rendering of `node` to `writer`, recursing into its
// children where the element calls for it.
//
// `new_para` tells whether the output currently sits right after a paragraph
// break (NP), so that block elements do not emit a second one.
//
// the returned boolean can be either:
// - true if a new paragraph has been written at the end
// - false if an inline text (ie. without new paragraph) has been written at the end
// - the value of the writeChildren function if it has been called recursively at the end
// - the new_para received as argument otherwise
//
// NOTE(review): the error set is spelled `anyerror`, presumably because
// writeNode and writeChildren call each other and their error sets could not
// both be inferred.
fn writeNode(node: *parser.Node, new_para: bool, writer: anytype) anyerror!bool {
    switch (try parser.nodeType(node)) {
        .element => {
            const html_element: *parser.ElementHTML = @ptrCast(node);
            const tag = try parser.elementHTMLGetTagType(html_element);

            switch (tag) {
                // skip element, go to children
                .html, .head, .header, .footer, .meta, .link, .body, .span => {
                    return try writeChildren(node, new_para, writer);
                },

                // skip element and children (eg. text)
                .title, .i, .script, .noscript, .undef, .style => return new_para,

                // headings: "#" prefix, inline content, then a paragraph break
                .h1, .h2, .h3, .h4 => {
                    if (!new_para) try writer.writeAll(NP);
                    const prefix: []const u8 = switch (tag) {
                        .h1 => "# ",
                        .h2 => "## ",
                        .h3 => "### ",
                        .h4 => "#### ",
                        else => @panic("only headers tags are supported here"),
                    };
                    try writer.writeAll(prefix);
                    const np = try writeChildren(node, false, writer);
                    if (!np) try writer.writeAll(NP);
                    return true;
                },

                // containers, dividers and lists: content is surrounded by
                // paragraph breaks, without doubling an existing one
                .nav, .section, .article, .p, .div, .button, .form, .ol, .ul => {
                    if (!new_para) try writer.writeAll(NP);
                    const np = try writeChildren(node, true, writer);
                    if (!np) try writer.writeAll(NP);
                    return true;
                },
                .b => {
                    try writer.writeAll("**");
                    _ = try writeChildren(node, false, writer);
                    try writer.writeAll("**");
                    return false;
                },
                .br => {
                    if (!new_para) try writer.writeAll(NP);
                    return try writeChildren(node, true, writer);
                },
                .hr => {
                    if (!new_para) try writer.writeAll(NP);
                    try writer.writeAll("---" ++ NP);
                    return true;
                },

                // specific elements
                .a => {
                    const element = parser.nodeToElement(node);
                    // without a usable href, the anchor is rendered as its children only
                    const href = try getAttributeValue(element, "href") orelse {
                        return try writeChildren(node, new_para, writer);
                    };
                    // TODO: absolute path?
                    try writer.writeAll("[");
                    _ = try writeChildren(node, false, writer);
                    try writer.writeAll("](");
                    try writer.writeAll(href);
                    try writer.writeAll(")");
                    return false;
                },
                .img => {
                    const element = parser.nodeToElement(node);
                    // an image without src writes nothing
                    const src = try getAttributeValue(element, "src") orelse return new_para;
                    // the image source doubles as label when there is no alt text
                    const alt = try getAttributeValue(element, "alt") orelse src;
                    // TODO: absolute path?
                    try writer.writeAll("![");
                    try writer.writeAll(alt);
                    try writer.writeAll("](");
                    try writer.writeAll(src);
                    try writer.writeAll(")");
                    return false;
                },
                .li => {
                    if (!new_para) try writer.writeAll("\n");
                    try writer.writeAll("- ");
                    return try writeChildren(node, false, writer);
                },
                .input => {
                    const element = parser.nodeToElement(node);
                    // Nothing is written for an input without value, so the
                    // paragraph state must be left untouched (same as an img
                    // without src); reporting inline text here would make the
                    // next block element emit a spurious paragraph break.
                    const value = try getAttributeValue(element, "value") orelse return new_para;
                    try writer.writeAll(value);
                    try writer.writeAll(" ");
                    return false;
                },
                // Any other tag is reported in the output; control then falls
                // through to the final `return new_para` below.
                else => {
                    try writer.writeAll("\n");
                    try writer.writeAll(@tagName(tag));
                    try writer.writeAll(" not supported\n");
                },
            }
        },
        .text => {
            const v = try parser.nodeValue(node) orelse return new_para;
            const printed = try writeText(v, writer);
            // text made only of skipped characters leaves the state unchanged
            return if (printed) false else new_para;
        },
        .cdata_section, .comment => return new_para,
        // TODO handle processing instruction dump
        .processing_instruction => return new_para,
        // document fragment is outside of the main document DOM, so we
        // don't output it.
        .document_fragment => return new_para,
        // document will never be called, but required for completeness.
        .document => return new_para,
        // done globally instead, but required for completeness. Only the outer DOCTYPE should be written
        .document_type => return new_para,
        // deprecated
        .attribute, .entity_reference, .entity, .notation => return new_para,
    }
    return new_para;
}
214+
215+
// TODO: not sure about + - . ! as they are very common characters
216+
// I fear that we add too much escape strings
217+
// TODO: | (pipe)
218+
// Characters that get a leading backslash when found in a text node.
const escape = [_]u8{ '\\', '`', '*', '_', '{', '}', '[', ']', '<', '>', '(', ')', '#' };

// Writes the content of a text node to `writer`:
// - line feeds, carriage returns and tabs are treated as spaces, so words
//   separated only by a line break are not glued together,
// - runs of whitespace are collapsed into a single space and leading
//   whitespace is dropped,
// - characters listed in `escape` are prefixed with a backslash.
//
// Returns true if at least one character has been written, false otherwise
// (empty or whitespace-only text).
fn writeText(value: []const u8, writer: anytype) !bool {
    // Starting from a space drops the leading whitespace of the node.
    var previous: u8 = ' ';
    var wrote_any = false;
    for (value) |raw| {
        const c: u8 = switch (raw) {
            '\n', '\r', '\t' => ' ',
            else => raw,
        };
        if (c == ' ' and previous == ' ') continue;

        if (std.mem.indexOfScalar(u8, &escape, c) != null) try writer.writeAll("\\");
        // writeAll is the only method relied upon: the writer may be a std.fs.File.
        try writer.writeAll(&[_]u8{c});

        previous = c;
        wrote_any = true;
    }
    return wrote_any;
}
247+
248+
// Returns the value of attribute `attr` on `elem`, or null when the
// attribute is missing or its value is empty.
fn getAttributeValue(elem: *parser.Element, attr: []const u8) !?[]const u8 {
    const value = try parser.elementGetAttribute(elem, attr) orelse return null;
    return if (value.len == 0) null else value;
}
254+
255+
// Writes `value` to `writer`, replacing '&', '<' and '>' by their HTML
// entities ("&amp;", "&lt;", "&gt;"). Every other byte is written as is.
fn writeEscapedTextNode(writer: anytype, value: []const u8) !void {
    var rest = value;
    while (std.mem.indexOfAny(u8, rest, "&<>")) |index| {
        // plain text preceding the special character
        try writer.writeAll(rest[0..index]);
        const entity: []const u8 = switch (rest[index]) {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            else => unreachable, // indexOfAny only matches the three bytes above
        };
        try writer.writeAll(entity);
        rest = rest[index + 1 ..];
    }
    // remaining text without any special character
    if (rest.len > 0) try writer.writeAll(rest);
}

src/browser/page.zig

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ const builtin = @import("builtin");
2222
const Allocator = std.mem.Allocator;
2323

2424
const Dump = @import("dump.zig");
25+
const Markdown = @import("markdown.zig");
2526
const State = @import("State.zig");
2627
const Env = @import("env.zig").Env;
2728
const Mime = @import("mime.zig").Mime;
@@ -147,6 +148,18 @@ pub const Page = struct {
147148
try Dump.writeHTML(doc, out);
148149
}
149150

151+
// markdown converts the page's HTML document to Markdown and writes the
// result into the given file.
// Returns error.HTMLDocument when raw_data is set.
pub fn markdown(self: *const Page, out: std.fs.File) !void {
    // raw_data was set if the document was not HTML: we can not convert it to Markdown.
    if (self.raw_data != null) return error.HTMLDocument;

    // the page has a pointer to a document: convert the HTML to Markdown and dump it.
    const doc = parser.documentHTMLToDocument(self.window.document);
    try Markdown.writeMarkdown(doc, out);
}
162+
150163
pub fn fetchModuleSource(ctx: *anyopaque, specifier: []const u8) !?[]const u8 {
151164
const self: *Page = @ptrCast(@alignCast(ctx));
152165
const base = if (self.current_script) |s| s.src else null;

src/main.zig

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ fn run(alloc: Allocator) !void {
103103
};
104104
},
105105
.fetch => |opts| {
106-
log.debug(.app, "startup", .{ .mode = "fetch", .dump = opts.dump, .url = opts.url });
106+
log.debug(.app, "startup", .{ .mode = "fetch", .dump = opts.dump, .markdown = opts.markdown, .url = opts.url });
107107
const url = try @import("url.zig").URL.parse(opts.url, null);
108108

109109
// browser
@@ -128,6 +128,13 @@ fn run(alloc: Allocator) !void {
128128

129129
try page.wait();
130130

131+
// markdown
132+
if (opts.markdown) {
133+
try page.markdown(std.io.getStdOut());
134+
// do not dump HTML if both options are provided
135+
return;
136+
}
137+
131138
// dump
132139
if (opts.dump) {
133140
try page.dump(std.io.getStdOut());
@@ -193,6 +200,7 @@ const Command = struct {
193200
const Fetch = struct {
194201
url: []const u8,
195202
dump: bool = false,
203+
markdown: bool = false,
196204
common: Common,
197205
};
198206

@@ -241,6 +249,9 @@ const Command = struct {
241249
\\--dump Dumps document to stdout.
242250
\\ Defaults to false.
243251
\\
252+
\\--markdown Converts document in Markdown format and dumps it to stdout.
253+
\\ Defaults to false.
254+
\\
244255
++ common_options ++
245256
\\
246257
\\serve command
@@ -317,6 +328,9 @@ fn inferMode(opt: []const u8) ?App.RunMode {
317328
if (std.mem.eql(u8, opt, "--dump")) {
318329
return .fetch;
319330
}
331+
if (std.mem.eql(u8, opt, "--markdown")) {
332+
return .fetch;
333+
}
320334
if (std.mem.startsWith(u8, opt, "--") == false) {
321335
return .fetch;
322336
}
@@ -402,6 +416,7 @@ fn parseFetchArgs(
402416
args: *std.process.ArgIterator,
403417
) !Command.Fetch {
404418
var dump: bool = false;
419+
var markdown: bool = false;
405420
var url: ?[]const u8 = null;
406421
var common: Command.Common = .{};
407422

@@ -410,6 +425,10 @@ fn parseFetchArgs(
410425
dump = true;
411426
continue;
412427
}
428+
if (std.mem.eql(u8, "--markdown", opt)) {
429+
markdown = true;
430+
continue;
431+
}
413432

414433
if (try parseCommonArg(allocator, opt, args, &common)) {
415434
continue;
@@ -435,6 +454,7 @@ fn parseFetchArgs(
435454
return .{
436455
.url = url.?,
437456
.dump = dump,
457+
.markdown = markdown,
438458
.common = common,
439459
};
440460
}

0 commit comments

Comments
 (0)