Skip to content

Commit 521458b

Browse files
html: implement language code validation (#93)
* html: lang: implement language code validation * html: lang: add completion for language and script * html: lang: implement iso 639 and iso 15924 check * html: lang: parse iana registry registry: https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry * html: lang: add check for grandfathered tags * html: lang: rewrite parser and use static maps for lookup * html: lang: add completion for region * html: lang: implement rfc 5646 * html: lang: add registry fetch step to build.zig * html: lang: propose region completion after any number of dashes it can be prepended by any number of extlang tags and a script tag * html: lang: check extlang and variant prefix * html: lang: use rule for global lang attribute * html: lang: move validate and completions to module * html: lang: fix multi-subtag prefixes * html: lang: support subtags with multiple prefixes * html: lang: add test for deprecation * fix import --------- Co-authored-by: Loris Cro <kappaloris@gmail.com>
1 parent 2bf8f4d commit 521458b

File tree

10 files changed

+49662
-5
lines changed

10 files changed

+49662
-5
lines changed

build.zig

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@ pub fn build(b: *std.Build) !void {
2626
superhtml.addImport("scripty", scripty.module("scripty"));
2727
superhtml.addImport("tracy", tracy.module("tracy"));
2828

29+
const language_tag_parser = b.addExecutable(.{
30+
.name = "language-tag-parser",
31+
.root_module = b.createModule(.{
32+
.root_source_file = b.path("src/html/language_tag/parse.zig"),
33+
.target = target,
34+
}),
35+
});
36+
const language_tag_parse = b.addRunArtifact(language_tag_parser);
37+
superhtml.addImport("language-tag-registry", b.createModule(.{
38+
.root_source_file = language_tag_parse.addOutputFileArg("registry.zon"),
39+
}));
40+
2941
if (target.result.os.tag == .windows) {
3042
superhtml.linkSystemLibrary("advapi32", .{});
3143

@@ -61,6 +73,7 @@ pub fn build(b: *std.Build) !void {
6173
setupTestStep(b, superhtml, check);
6274
setupCliTool(b, target, optimize, options, superhtml, folders, lsp);
6375
setupWasmStep(b, optimize, options, superhtml, lsp);
76+
setupFetchLanguageSubtagRegistryStep(b, target);
6477
if (version == .tag) {
6578
setupReleaseStep(b, options, superhtml, folders, lsp);
6679
}
@@ -181,6 +194,27 @@ fn setupWasmStep(
181194
wasm.dependOn(&target_output.step);
182195
}
183196

197+
/// Registers the `fetch-language-subtag-registry` top-level step.
///
/// The step builds `src/html/language_tag/fetch.zig` for `target` and runs it
/// with the path of `src/html/language_tag/registry.txt` as its only argument.
fn setupFetchLanguageSubtagRegistryStep(
    b: *std.Build,
    target: std.Build.ResolvedTarget,
) void {
    const fetch_step = b.step(
        "fetch-language-subtag-registry",
        "Fetch the IANA language subtag registry",
    );

    const fetcher_module = b.createModule(.{
        .root_source_file = b.path("src/html/language_tag/fetch.zig"),
        .target = target,
    });
    const fetcher_exe = b.addExecutable(.{
        .name = "language-subtag-fetcher",
        .root_module = fetcher_module,
    });

    const run_fetcher = b.addRunArtifact(fetcher_exe);
    // Marked as having side effects so the build system never treats the run
    // as cached.
    run_fetcher.has_side_effects = true;
    run_fetcher.addFileArg(b.path("src/html/language_tag/registry.txt"));

    fetch_step.dependOn(&run_fetcher.step);
}
217+
184218
fn setupReleaseStep(
185219
b: *std.Build,
186220
options: *std.Build.Step.Options,

src/cli/lsp.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ pub fn initialize(
125125
.triggerCharacters = &.{
126126
"<", "/", " ",
127127
"\n", "'", "\"",
128-
"=", ",",
128+
"=", ",", "-",
129129
},
130130
},
131131

src/html/Attribute.zig

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ const root = @import("../root.zig");
99
const Language = root.Language;
1010
const Span = root.Span;
1111
const log = std.log.scoped(.attribute);
12+
const language_tag = @import("language_tag.zig");
1213

1314
rule: Rule,
1415
desc: []const u8,
@@ -74,6 +75,9 @@ pub const Rule = union(enum) {
7475
/// MIME
7576
mime,
7677

78+
/// BCP 47 language tag
79+
lang,
80+
7781
/// Not negative integer
7882
non_neg_int: struct {
7983
min: usize = 0,
@@ -257,6 +261,20 @@ pub const Rule = union(enum) {
257261
},
258262
.mime => return validateMime(gpa, errors, src, node_idx, attr),
259263
.cors => continue :rule .{ .list = cors_list },
264+
.lang => {
265+
const value = attr.value orelse return;
266+
const value_slice = value.span.slice(src);
267+
if (language_tag.validate(value_slice)) |rejection| return errors.append(gpa, .{
268+
.tag = .{
269+
.invalid_attr_value = .{ .reason = rejection.reason },
270+
},
271+
.main_location = .{
272+
.start = value.span.start + rejection.offset,
273+
.end = value.span.start + rejection.offset + rejection.length,
274+
},
275+
.node_idx = node_idx,
276+
});
277+
},
260278
.not_empty => {
261279
const value = attr.value orelse return errors.append(gpa, .{
262280
.tag = .missing_attr_value,
@@ -944,6 +962,9 @@ pub fn completions(
944962
.cors => if (value_content.len == 0) {
945963
return Rule.cors_list.completions;
946964
},
965+
.lang => {
966+
return language_tag.completions(value_content);
967+
},
947968
.list => |l| {
948969
if (value_content.len == 0) {
949970
return l.completions;
@@ -1625,7 +1646,7 @@ pub const global: AttributeSet = .init(&.{
16251646
.{
16261647
.name = "lang",
16271648
.model = .{
1628-
.rule = .not_empty,
1649+
.rule = .lang,
16291650
.desc = "The `lang` global attribute helps define the language of an element: the language that non-editable elements are written in, or the language that the editable elements should be written in by the user. The attribute contains a single BCP 47 language tag.",
16301651
},
16311652
},

src/html/elements/a.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ pub const attributes: AttributeSet = .init(&.{
9898
.{
9999
.name = "hreflang",
100100
.model = .{
101-
.rule = .any,
101+
.rule = .lang,
102102
.desc =
103103
\\Hints at the human language of the linked URL. No built-in
104104
\\functionality. Allowed values are the same as the global lang

src/html/elements/link.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ pub const attributes: AttributeSet = .init(&.{
187187
.{
188188
.name = "hreflang",
189189
.model = .{
190-
.rule = .not_empty, // TODO validate
190+
.rule = .lang,
191191
.desc = "This attribute indicates the language of the linked resource. It is purely advisory.",
192192
},
193193
},

src/html/elements/track.zig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ pub const attributes: AttributeSet = .init(&.{
7979
.{
8080
.name = "srclang",
8181
.model = .{
82-
.rule = .not_empty,
82+
.rule = .lang,
8383
.desc = "Language of the track text data. It must be a valid BCP 47 language tag. If the `kind` attribute is set to 'subtitles', then `srclang` must be defined.",
8484
},
8585
},

src/html/language_tag.zig

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
const std = @import("std");
2+
const Completion = @import("Ast.zig").Completion;
3+
const Registry = @import("language_tag/parse.zig").Registry;
4+
const registry: Registry = @import("language-tag-registry");
5+
6+
/// Describes why a language tag was refused and which part of it is at fault.
pub const Rejection = struct {
    /// Human-readable explanation of the problem.
    reason: []const u8,
    /// Byte offset of the offending span from the start of the tag.
    offset: u32,
    /// Byte length of the offending span.
    length: u32,

    /// Builds a rejection for `subtag`, which must be a sub-slice of `bytes`:
    /// the offset is derived from the distance between the two pointers.
    pub fn init(bytes: []const u8, subtag: []const u8, reason: []const u8) Rejection {
        const tag_address = @intFromPtr(bytes.ptr);
        const subtag_address = @intFromPtr(subtag.ptr);
        const offset: u32 = @intCast(subtag_address - tag_address);
        const length: u32 = @intCast(subtag.len);
        return .{ .reason = reason, .offset = offset, .length = length };
    }
};
19+
20+
/// Validates `bytes` as a BCP 47 (RFC 5646) language tag, looking subtags up
/// in the maps built from the language subtag registry.
///
/// Returns `null` when the tag is accepted. Otherwise returns a `Rejection`
/// whose offset and length locate the offending span inside `bytes`.
pub fn validate(bytes: []const u8) ?Rejection {
    // Grandfathered tags only match as a whole. A deprecated one is reported
    // as such instead of being fed to the subtag parser, which would produce
    // an unrelated message (e.g. "too short" for the leading "i").
    if (maps.grandfathered.get(bytes)) |data| {
        if (!data.is_deprecated) return null;
        return .init(bytes, bytes, "deprecated language tag");
    }

    const ParseState = enum {
        language,
        extlang,
        script,
        region,
        variant,
        singleton,
        extension,
        extension_extra,
        privateuse,
        privateuse_extra,
    };
    var parse_state: ParseState = .language;
    // Remembered so that a dangling singleton can be reported after the loop.
    var last_subtag: []const u8 = bytes[0..0];

    var subtags = std.mem.splitScalar(u8, bytes, '-');
    while (subtags.next()) |subtag| {
        last_subtag = subtag;
        // `continue :state .x` re-dispatches the same subtag to a later state
        // without touching `parse_state`; assignments to `parse_state` pick
        // the state used for the next subtag.
        state: switch (parse_state) {
            .language => switch (subtag.len) {
                0 => return .init(bytes, subtag, "cannot be empty"),
                1 => {
                    // A tag may consist solely of private use subtags
                    // ("x-..."); any other single character is not a language.
                    if (std.ascii.toLower(subtag[0]) != 'x') {
                        return .init(bytes, subtag, "too short");
                    }
                    parse_state = .privateuse;
                },
                2...8 => {
                    if (maps.language.get(subtag)) |data| {
                        if (data.is_deprecated) return .init(bytes, subtag, "deprecated language");
                    } else {
                        return .init(bytes, subtag, "unknown language");
                    }
                    parse_state = .extlang;
                },
                else => return .init(bytes, subtag, "too long"),
            },
            .extlang => switch (subtag.len) {
                3 => {
                    // Three characters starting with a digit cannot be an
                    // extlang; try it as a region instead.
                    if (std.ascii.isDigit(subtag[0])) continue :state .region;
                    if (maps.extlang.get(subtag)) |data| {
                        if (data.is_deprecated) return .init(bytes, subtag, "deprecated language extension");
                        for (data.prefixes) |prefix| {
                            if (startsWithSubtags(bytes, prefix)) break;
                        } else {
                            return .init(bytes, subtag, "incompatible language extension");
                        }
                    } else {
                        return .init(bytes, subtag, "unknown language extension");
                    }
                    parse_state = .script;
                },
                else => continue :state .script,
            },
            .script => switch (subtag.len) {
                4 => {
                    // Four characters starting with a digit form a variant.
                    if (std.ascii.isDigit(subtag[0])) continue :state .variant;
                    if (!maps.script.has(subtag)) {
                        return .init(bytes, subtag, "unknown language script");
                    }
                    parse_state = .region;
                },
                else => continue :state .region,
            },
            .region => switch (subtag.len) {
                2...3 => {
                    if (maps.region.get(subtag)) |data| {
                        if (data.is_deprecated) return .init(bytes, subtag, "deprecated language region");
                    } else {
                        return .init(bytes, subtag, "unknown language region");
                    }
                    parse_state = .variant;
                },
                else => continue :state .variant,
            },
            .variant => switch (subtag.len) {
                4...8 => {
                    if (maps.variant.get(subtag)) |data| {
                        if (data.is_deprecated) return .init(bytes, subtag, "deprecated language variant");
                        // A variant registered without any prefix is not tied
                        // to particular languages, so only enforce the prefix
                        // list when there is one.
                        if (data.prefixes.len != 0) {
                            for (data.prefixes) |prefix| {
                                if (startsWithSubtags(bytes, prefix)) break;
                            } else {
                                return .init(bytes, subtag, "incompatible language variant");
                            }
                        }
                    } else {
                        return .init(bytes, subtag, "unknown language variant");
                    }
                    parse_state = .variant;
                },
                else => continue :state .singleton,
            },
            .singleton => {
                if (subtag.len != 1) {
                    return .init(bytes, subtag, "extension prefix must be a single character");
                }
                parse_state = switch (std.ascii.toLower(subtag[0])) {
                    'x' => .privateuse,
                    'a'...'w', 'y'...'z', '0'...'9' => .extension,
                    else => return .init(bytes, subtag, "extension prefix must be alphanumeric"),
                };
            },
            .extension => switch (subtag.len) {
                2...8 => {
                    for (subtag) |char| if (!std.ascii.isAlphanumeric(char)) {
                        return .init(bytes, subtag, "extension must be alphanumeric");
                    };
                    parse_state = .extension_extra;
                },
                else => return .init(bytes, subtag, "wrong extension length"),
            },
            .extension_extra => switch (subtag.len) {
                2...8 => continue :state .extension,
                else => continue :state .singleton,
            },
            .privateuse => switch (subtag.len) {
                1...8 => {
                    for (subtag) |char| if (!std.ascii.isAlphanumeric(char)) {
                        return .init(bytes, subtag, "private use extension must be alphanumeric");
                    };
                    parse_state = .privateuse_extra;
                },
                else => return .init(bytes, subtag, "wrong private use extension length"),
            },
            .privateuse_extra => switch (subtag.len) {
                1...8 => continue :state .privateuse,
                else => return .init(bytes, subtag, "subtag after private use extension"),
            },
        }
    }

    // `.extension` / `.privateuse` are only the *pending* state right after a
    // singleton was consumed; ending there means the singleton has no subtag.
    if (parse_state == .extension) {
        return .init(bytes, last_subtag, "extension prefix must be followed by a subtag");
    }
    if (parse_state == .privateuse) {
        return .init(bytes, last_subtag, "private use prefix must be followed by a subtag");
    }
    return null;
}

/// Reports whether `tag` begins with `prefix` (ASCII case-insensitively) and
/// the match ends on a subtag boundary, so that prefix "ar" matches "ar-arb"
/// but not "arc-arb".
fn startsWithSubtags(tag: []const u8, prefix: []const u8) bool {
    if (!std.ascii.startsWithIgnoreCase(tag, prefix)) return false;
    return tag.len == prefix.len or tag[prefix.len] == '-';
}
147+
148+
/// Returns completion candidates for a partially typed language tag.
///
/// An empty value offers every language subtag, a value ending in `-` offers
/// every region subtag, and anything else offers nothing.
pub fn completions(value: []const u8) []const Completion {
    if (value.len == 0) return &language_completions;
    if (value[value.len - 1] != '-') return &.{};
    return &region_completions;
}
159+
160+
/// Static string map from subtag name to its registry data; keys are compared
/// ASCII case-insensitively.
const Map = std.StaticStringMapWithEql(
    Registry.Subtag.Data,
    std.ascii.eqlIgnoreCase,
);

/// One lookup table per subtag kind found in the registry.
const maps = struct {
    pub const language: Map = makeMap("language");
    pub const extlang: Map = makeMap("extlang");
    pub const script: Map = makeMap("script");
    pub const region: Map = makeMap("region");
    pub const variant: Map = makeMap("variant");
    pub const grandfathered: Map = makeMap("grandfathered");
};
170+
171+
/// Builds the lookup table for the registry field called `kind`, keyed by
/// subtag name.
fn makeMap(comptime kind: []const u8) Map {
    const entries = @field(registry, kind);
    // The loop below runs once per registry entry, so raise the branch quota
    // in proportion to the entry count.
    @setEvalBranchQuota(entries.len * 2);

    const Pair = struct { []const u8, Registry.Subtag.Data };
    var pairs: [entries.len]Pair = undefined;
    for (&pairs, entries) |*pair, entry| {
        pair.* = .{ entry.name, entry.data };
    }
    return .initComptime(pairs);
}
181+
182+
/// Completion entries for every language subtag in the registry.
const language_completions = makeCompletions("language");
/// Completion entries for every region subtag in the registry.
const region_completions = makeCompletions("region");

/// Turns the registry field called `kind` into an array of completions, one
/// per subtag. The subtag name is the label; the description falls back to the
/// name when the registry entry has none.
fn makeCompletions(comptime kind: []const u8) [@field(registry, kind).len]Completion {
    const entries = @field(registry, kind);
    @setEvalBranchQuota(entries.len * 2);

    var result: [entries.len]Completion = undefined;
    for (entries, 0..) |entry, index| {
        result[index] = .{
            .label = entry.name,
            .desc = entry.data.description orelse entry.name,
        };
    }
    return result;
}
197+
198+
// A tag exercising language, extlang, script, region, variant, an extension
// and a private use section must be accepted.
test "validate: all subtags" {
    try std.testing.expectEqual(null, validate("sgn-ase-Latn-US-blasl-a-abcd-x-1234"));
}

// "in" is expected to be rejected as a deprecated language subtag.
test "validate: deprecated language" {
    try std.testing.expect(validate("in") != null);
}

// "blasl" must be accepted after either of its prefixes and rejected after an
// unrelated language.
test "validate: multiple prefixes" {
    const accepted = [_][]const u8{ "sgn-ase-blasl", "ase-blasl" };
    for (accepted) |tag| {
        try std.testing.expectEqual(null, validate(tag));
    }

    try std.testing.expect(validate("it-blasl") != null);
}

src/html/language_tag/fetch.zig

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
const std = @import("std");
2+
3+
/// Downloads the IANA language subtag registry.
///
/// The first command-line argument, when present, is the path of the file to
/// (re)create with the downloaded content; without it the content is written
/// to stdout. Fails with `error.UnexpectedHttpStatus` when the server answers
/// with anything other than 200 OK.
pub fn main() !void {
    var arena_instance: std.heap.ArenaAllocator = .init(std.heap.page_allocator);
    defer arena_instance.deinit();
    const arena = arena_instance.allocator();

    var args = try std.process.argsWithAllocator(arena);
    std.debug.assert(args.skip());

    const output_path = args.next();
    const file = if (output_path) |path|
        try std.fs.cwd().createFile(path, .{})
    else
        std.fs.File.stdout();
    // Only close what this function opened; stdout is left alone.
    defer if (output_path != null) file.close();

    var buffer: [1024]u8 = undefined;
    var writer = file.writer(&buffer);
    const output = &writer.interface;

    var client: std.http.Client = .{ .allocator = arena };
    defer client.deinit();

    const result = try client.fetch(.{
        .location = .{ .url = "https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry" },
        .response_writer = output,
    });
    // Without this check an error page would silently be stored as the
    // registry. The output may already hold part of that response, but the
    // run now fails visibly instead of reporting success.
    if (result.status != .ok) return error.UnexpectedHttpStatus;

    try output.flush();
}

0 commit comments

Comments
 (0)