Skip to content

Commit 6f6e04c

Browse files
authored
Merge pull request #39 from crwsolutions/copilot/add-html-tokenizer
Add streaming HTML tokenizer with CSS and JavaScript delegation
2 parents e2ff0c0 + ba4941f commit 6f6e04c

39 files changed

+1727
-153
lines changed

NTokenizers.sln

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
33
# Visual Studio Version 18
4-
VisualStudioVersion = 18.1.11312.151 d18.0
4+
VisualStudioVersion = 18.1.11312.151
55
MinimumVisualStudioVersion = 10.0.40219.1
66
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NTokenizers.Tests", "tests\NTokenizers.Tests\NTokenizers.Tests.csproj", "{6F939D11-A6EB-E490-AFCF-37CFEA2DAB5D}"
77
EndProject
@@ -28,6 +28,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NTokenizers.ShowCase.Markdo
2828
EndProject
2929
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NTokenizers.ShowCase.Css", "tests\NTokenizers.ShowCase.Css\NTokenizers.ShowCase.Css.csproj", "{B38D1D9B-1805-BD4D-91F6-890276FF5368}"
3030
EndProject
31+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "NTokenizers.ShowCase.Html", "tests\NTokenizers.ShowCase.Html\NTokenizers.ShowCase.Html.csproj", "{61882FA7-6EF1-3632-7A2C-695AE9D9095F}"
32+
EndProject
3133
Global
3234
GlobalSection(SolutionConfigurationPlatforms) = preSolution
3335
Debug|Any CPU = Debug|Any CPU
@@ -158,6 +160,18 @@ Global
158160
{B38D1D9B-1805-BD4D-91F6-890276FF5368}.Release|x64.Build.0 = Release|Any CPU
159161
{B38D1D9B-1805-BD4D-91F6-890276FF5368}.Release|x86.ActiveCfg = Release|Any CPU
160162
{B38D1D9B-1805-BD4D-91F6-890276FF5368}.Release|x86.Build.0 = Release|Any CPU
163+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
164+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Debug|Any CPU.Build.0 = Debug|Any CPU
165+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Debug|x64.ActiveCfg = Debug|Any CPU
166+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Debug|x64.Build.0 = Debug|Any CPU
167+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Debug|x86.ActiveCfg = Debug|Any CPU
168+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Debug|x86.Build.0 = Debug|Any CPU
169+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Release|Any CPU.ActiveCfg = Release|Any CPU
170+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Release|Any CPU.Build.0 = Release|Any CPU
171+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Release|x64.ActiveCfg = Release|Any CPU
172+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Release|x64.Build.0 = Release|Any CPU
173+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Release|x86.ActiveCfg = Release|Any CPU
174+
{61882FA7-6EF1-3632-7A2C-695AE9D9095F}.Release|x86.Build.0 = Release|Any CPU
161175
EndGlobalSection
162176
GlobalSection(SolutionProperties) = preSolution
163177
HideSolutionNode = FALSE

README.md

Lines changed: 148 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# NTokenizers
2-
Collection of **stream-capable** tokenizers for Markdown, JSON, XML, YAML, SQL, Typescript and CSharp processing.
2+
Collection of **stream-capable** tokenizers for Markdown, JSON, XML, HTML, YAML, SQL, TypeScript, CSS and C# processing.
33

44
### Kickoff token processing
55

@@ -25,13 +25,18 @@ await CssTokenizer.Create().ParseAsync(stream, onToken: token => { /* handle css
2525
// kickoff xml tokenizer
2626
await XmlTokenizer.Create().ParseAsync(stream, onToken: token => { /* handle xml-tokens here */ });
2727

28+
// kickoff html tokenizer*
29+
await HtmlTokenizer.Create().ParseAsync(stream, onToken: token => { /* handle html-tokens here */ });
30+
2831
// kickoff yaml tokenizer
2932
await YamlTokenizer.Create().ParseAsync(stream, onToken: token => { /* handle yaml-tokens here */ });
3033
```
3134

35+
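\* The HTML tokenizer delegates the contents of `<script>` and `<style>` elements to the script and style tokenizers, so you also have to handle the tokens they produce. Check out the [docs](https://crwsolutions.github.io/ntokenizers/html) for more information.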
36+
3237
## Overview
3338

34-
NTokenizers is a .NET library written in C# that provides tokenizers for processing structured text formats like Markdown, JSON, XML, YAML, SQL, Typescript, CSS and CSharp. The `Tokenize` method is the core functionality that breaks down structured text into meaningful components (tokens) for processing. Its key feature is **stream processing capability** - it can handle data as it arrives in real-time, making it ideal for processing large files or streaming data without loading everything into memory at once.
39+
NTokenizers is a .NET library written in C# that provides tokenizers for processing structured text formats like Markdown, JSON, XML, HTML, YAML, SQL, TypeScript, CSS and C#. The `Tokenize` method is the core functionality that breaks down structured text into meaningful components (tokens) for processing. Its key feature is **stream processing capability** - it can handle data as it arrives in real-time, making it ideal for processing large files or streaming data without loading everything into memory at once.
3540

3641
> [!WARNING]
3742
>
@@ -75,6 +80,17 @@ The same principle applies to inline tokenizers such as Heading, Blockquote, Lis
7580
│ └─────────┘
7681
7782
│ ┌─────────┐
83+
├──────►│ html │ ───► fire html tokens
84+
│ └─────────┘
85+
│ │
86+
│ ▼ ┌─────────┐
87+
│ ├──────►│ css │ ───► fire css tokens
88+
│ │ └─────────┘
89+
│ │
90+
│ │ ┌─────────┐
91+
│ └──────►│ script │ ───► fire typescript tokens
92+
│ └─────────┘
93+
│ ┌─────────┐
7894
└──────►│ etc.. │ ───► etc
7995
└─────────┘
8096
```
@@ -84,12 +100,16 @@ The same principle applies to inline tokenizers such as Heading, Blockquote, Lis
84100
Here's a simple example showing how to use the `MarkdownTokenizer`:
85101

86102
```csharp
103+
using NTokenizers.Core;
104+
using NTokenizers.Css;
105+
using NTokenizers.Html;
87106
using NTokenizers.Json;
88107
using NTokenizers.Markdown;
89108
using NTokenizers.Markdown.Metadata;
90109
using NTokenizers.Typescript;
91110
using NTokenizers.Xml;
92111
using Spectre.Console;
112+
using System.Diagnostics;
93113
using System.IO.Pipes;
94114
using System.Text;
95115

@@ -101,6 +121,14 @@ class Program
101121
Here is some **bold** text and some *italic* text.
102122
103123
# NTokenizers Showcase
124+
125+
## Css example
126+
```css
127+
.user {
128+
color: #FFFFFF;
129+
active: true;
130+
}
131+
```
104132
105133
## XML example
106134
```xml
@@ -109,6 +137,28 @@ class Program
109137
</user>
110138
```
111139
140+
## HTML example
141+
```html
142+
<html>
143+
<head>
144+
<style>
145+
body { font-family: Arial, sans-serif; background-color: #f0f8ff; }
146+
.header { color: #4682b4; text-align: center; }
147+
.content { margin: 20px; padding: 15px; background-color: white; border-radius: 5px; }
148+
</style>
149+
</head>
150+
<body>
151+
<p>Hello world!</p>
152+
<script>
153+
console.log("Hello from the sample script!");
154+
document.addEventListener('DOMContentLoaded', function() {
155+
console.log("DOM is fully loaded");
156+
});
157+
</script>
158+
</body>
159+
</html>
160+
```
161+
112162
## JSON example
113163
```json
114164
{
@@ -128,15 +178,33 @@ class Program
128178

129179
// Create connected streams
130180
using var pipe = new AnonymousPipeServerStream(PipeDirection.Out);
131-
using var stream = new AnonymousPipeClientStream(PipeDirection.In, pipe.ClientSafePipeHandle);
181+
using var reader = new AnonymousPipeClientStream(PipeDirection.In, pipe.ClientSafePipeHandle);
132182

133183
// Start slow writer
134184
var writerTask = EmitSlowlyAsync(markdown, pipe);
135185

136-
// Parse markdown
137-
await MarkdownTokenizer.Create().ParseAsync(stream, onToken: async token =>
186+
// Parse markup
187+
await MarkdownTokenizer.Create().ParseAsync(reader, onToken: async token =>
138188
{
139-
if (token.Metadata is HeadingMetadata headingMetadata)
189+
if (token.Metadata is ICodeBlockMetadata codeBlock)
190+
{
191+
AnsiConsole.WriteLine();
192+
AnsiConsole.Write(new Markup($"[bold lime]{codeBlock.Language}:[/]"));
193+
AnsiConsole.WriteLine();
194+
}
195+
196+
if (token.Metadata is ListItemMetadata listMetadata)
197+
{
198+
AnsiConsole.Write(new Markup($"[bold lime]{listMetadata.Marker} [/]"));
199+
await listMetadata.RegisterInlineTokenHandler(inlineToken =>
200+
{
201+
var value = Markup.Escape(inlineToken.Value);
202+
AnsiConsole.Write(new Markup($"[bold red]{value}[/]"));
203+
});
204+
Debug.WriteLine("Written listItem inlines");
205+
206+
}
207+
else if (token.Metadata is HeadingMetadata headingMetadata)
140208
{
141209
await headingMetadata.RegisterInlineTokenHandler(inlineToken =>
142210
{
@@ -146,6 +214,7 @@ class Program
146214
new Markup($"[bold yellow]** {value} **[/]");
147215
AnsiConsole.Write(colored);
148216
});
217+
Debug.WriteLine("Written Heading inlines");
149218
}
150219
else if (token.Metadata is XmlCodeBlockMetadata xmlMetadata)
151220
{
@@ -195,28 +264,50 @@ class Program
195264
AnsiConsole.Write(colored);
196265
});
197266
}
198-
else if (token.Metadata is TypeScriptCodeBlockMetadata tsMetadata)
267+
else if (token.Metadata is HtmlCodeBlockMetadata htmlMetadata)
199268
{
200-
await tsMetadata.RegisterInlineTokenHandler(inlineToken =>
269+
await htmlMetadata.RegisterInlineTokenHandler(async inlineToken =>
201270
{
271+
if (inlineToken.Metadata is TypeScriptCodeBlockMetadata tsMeta)
272+
{
273+
await HandleScript(tsMeta);
274+
}
275+
else if (inlineToken.Metadata is CssCodeBlockMetadata cssMeta)
276+
{
277+
await HandleCss(cssMeta);
278+
}
279+
else
280+
{
202281
var value = Markup.Escape(inlineToken.Value);
203282
var colored = inlineToken.TokenType switch
204283
{
205-
TypescriptTokenType.Identifier => new Markup($"[cyan]{value}[/]"),
206-
TypescriptTokenType.Keyword => new Markup($"[blue]{value}[/]"),
207-
TypescriptTokenType.StringValue => new Markup($"[green]{value}[/]"),
208-
TypescriptTokenType.Number => new Markup($"[magenta]{value}[/]"),
209-
TypescriptTokenType.Operator => new Markup($"[yellow]{value}[/]"),
210-
TypescriptTokenType.Comment => new Markup($"[grey]{value}[/]"),
211-
TypescriptTokenType.Whitespace => new Markup($"[grey]{value}[/]"),
284+
HtmlTokenType.OpeningAngleBracket => new Markup($"[yellow]{value}[/]"),
285+
HtmlTokenType.ClosingAngleBracket => new Markup($"[yellow]{value}[/]"),
286+
HtmlTokenType.SelfClosingSlash => new Markup($"[yellow]{value}[/]"),
287+
HtmlTokenType.AttributeName => new Markup($"[cyan]{value}[/]"),
288+
HtmlTokenType.AttributeEquals => new Markup($"[yellow]{value}[/]"),
289+
HtmlTokenType.AttributeQuote => new Markup($"[grey]{value}[/]"),
290+
HtmlTokenType.AttributeValue => new Markup($"[green]{value}[/]"),
291+
HtmlTokenType.Text => new Markup($"[white]{value}[/]"),
292+
HtmlTokenType.Comment => new Markup($"[grey]{value}[/]"),
293+
HtmlTokenType.Whitespace => new Markup($"[grey]{value}[/]"),
212294
_ => new Markup(value)
213295
};
214296
AnsiConsole.Write(colored);
297+
}
215298
});
216299
}
300+
else if (token.Metadata is TypeScriptCodeBlockMetadata tsMetadata)
301+
{
302+
await HandleScript(tsMetadata);
303+
}
304+
else if (token.Metadata is CssCodeBlockMetadata cssMetadata)
305+
{
306+
await HandleCss(cssMetadata);
307+
}
217308
else
218309
{
219-
// Handle regular markdown tokens
310+
// Handle regular markup tokens
220311
var value = Markup.Escape(token.Value);
221312
var colored = token.TokenType switch
222313
{
@@ -229,7 +320,7 @@ class Program
229320
AnsiConsole.Write(colored);
230321
}
231322

232-
if (token.Metadata is InlineMarkdownMetadata)
323+
if (token.Metadata is InlineMetadata)
233324
{
234325
AnsiConsole.WriteLine();
235326
}
@@ -241,6 +332,45 @@ class Program
241332
Console.WriteLine("Done.");
242333
}
243334

335+
private static async Task HandleScript(TypeScriptCodeBlockMetadata tsMetadata)
336+
{
337+
await tsMetadata.RegisterInlineTokenHandler(inlineToken =>
338+
{
339+
var value = Markup.Escape(inlineToken.Value);
340+
var colored = inlineToken.TokenType switch
341+
{
342+
TypescriptTokenType.Identifier => new Markup($"[cyan]{value}[/]"),
343+
TypescriptTokenType.Keyword => new Markup($"[blue]{value}[/]"),
344+
TypescriptTokenType.StringValue => new Markup($"[green]{value}[/]"),
345+
TypescriptTokenType.Number => new Markup($"[magenta]{value}[/]"),
346+
TypescriptTokenType.Operator => new Markup($"[yellow]{value}[/]"),
347+
TypescriptTokenType.Comment => new Markup($"[grey]{value}[/]"),
348+
TypescriptTokenType.Whitespace => new Markup($"[grey]{value}[/]"),
349+
_ => new Markup(value)
350+
};
351+
AnsiConsole.Write(colored);
352+
});
353+
}
354+
355+
private static async Task HandleCss(CssCodeBlockMetadata cssMetadata)
356+
{
357+
await cssMetadata.RegisterInlineTokenHandler(inlineToken =>
358+
{
359+
var value = Markup.Escape(inlineToken.Value);
360+
var colored = inlineToken.TokenType switch
361+
{
362+
CssTokenType.Identifier => new Markup($"[white]{value}[/]"),
363+
CssTokenType.Number => new Markup($"[magenta]{value}[/]"),
364+
CssTokenType.Operator => new Markup($"[yellow]{value}[/]"),
365+
CssTokenType.Selector => new Markup($"[yellow]{value}[/]"),
366+
CssTokenType.Comment => new Markup($"[green]{value}[/]"),
367+
CssTokenType.Whitespace => new Markup($"[grey]{value}[/]"),
368+
_ => new Markup(value)
369+
};
370+
AnsiConsole.Write(colored);
371+
});
372+
}
373+
244374
static async Task EmitSlowlyAsync(string markdown, Stream output)
245375
{
246376
var rng = new Random();
@@ -250,7 +380,7 @@ class Program
250380
{
251381
await output.WriteAsync(new[] { b }.AsMemory(0, 1));
252382
await output.FlushAsync();
253-
await Task.Delay(rng.Next(2, 8));
383+
await Task.Delay(rng.Next(0, 2));
254384
}
255385

256386
output.Close(); // EOF

docs/_config.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ sidebar:
2121
url: "css"
2222
- title: "Xml"
2323
url: "xml"
24+
- title: "Html"
25+
url: "html"
2426
- title: "Yaml"
2527
url: "yaml"
2628
- title: "Encoding"

0 commit comments

Comments
 (0)