Skip to content

Commit aae35bb

Browse files
committed
fix: recursively include linked markdown pages
1 parent d0f76a0 commit aae35bb

10 files changed

Lines changed: 665 additions & 73 deletions
Lines changed: 45 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,71 +1,81 @@
11
using System.Runtime.CompilerServices;
22
using System.Text.RegularExpressions;
3-
using OwlCore.Diagnostics;
43
using OwlCore.Storage;
54

65
namespace WindowsAppCommunity.Blog.Assets;
76

87
/// <summary>
/// Detects relative asset links in markdown and HTML text.
/// </summary>
/// <remarks>
/// Link targets are taken from markdown links/images and from HTML <c>href</c>/<c>src</c>
/// attributes. Anchors, rooted paths, URLs with a scheme and a few non-file schemes are skipped.
/// Each distinct target (compared case-insensitively) is reported once, in normalized form.
/// </remarks>
public sealed partial class RegexAssetLinkDetector : IAssetLinkDetector
{
    /// <summary>
    /// Regex pattern for markdown links and images. The link target is captured in the
    /// <c>path</c> group; an optional title after whitespace is matched but not captured.
    /// </summary>
    // RegexOptions.Compiled is intentionally not passed: the regex source generator ignores it.
    [GeneratedRegex("""!?\[[^\]]*\]\((?<path>[^)\s]+)(?:\s+[^)]*)?\)""")]
    private static partial Regex MarkdownLinkPattern();

    /// <summary>
    /// Regex pattern for HTML href/src attributes. The quoted attribute value is captured in the
    /// <c>path</c> group.
    /// </summary>
    [GeneratedRegex("""(?:href|src)\s*=\s*["'](?<path>[^"']+)["']""", RegexOptions.IgnoreCase)]
    private static partial Regex HtmlAttributePattern();

    /// <inheritdoc/>
    public async IAsyncEnumerable<string> DetectAsync(IFile source, [EnumeratorCancellation] CancellationToken ct = default)
    {
        var text = await source.ReadTextAsync(ct);

        // Shared across both passes so a target referenced from markdown and HTML is reported once.
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Markdown matches are reported first, then HTML attribute matches.
        foreach (var pattern in new[] { MarkdownLinkPattern(), HtmlAttributePattern() })
        {
            foreach (Match match in pattern.Matches(text))
            {
                // Cancellation ends the sequence quietly instead of throwing.
                if (ct.IsCancellationRequested)
                    yield break;

                // Yield the normalized target (not the raw capture) so that what the caller
                // receives is exactly the value used for de-duplication.
                if (TryGetNewRelativePath(match.Groups["path"].Value, seen, out var path))
                    yield return path;
            }
        }
    }

    /// <summary>
    /// Normalizes a captured link target and decides whether it should be reported.
    /// </summary>
    /// <param name="rawPath">The link target exactly as captured by one of the patterns.</param>
    /// <param name="seen">Targets already reported; the normalized target is added when accepted.</param>
    /// <param name="path">
    /// When this method returns <see langword="true"/>, the target with surrounding whitespace and
    /// enclosing angle brackets removed; otherwise an empty string.
    /// </param>
    /// <returns>
    /// <see langword="true"/> if the target is a not-yet-seen relative link; <see langword="false"/>
    /// if it is blank, an anchor, rooted, carries a scheme, or was already reported.
    /// </returns>
    private static bool TryGetNewRelativePath(string rawPath, HashSet<string> seen, out string path)
    {
        path = string.Empty;

        if (string.IsNullOrWhiteSpace(rawPath))
            return false;

        // Markdown allows the destination to be wrapped in angle brackets: [text](<target>).
        var candidate = rawPath.Trim().Trim('<', '>');

        if (string.IsNullOrWhiteSpace(candidate))
            return false;

        // In-page anchors and rooted paths are not relative asset links.
        // The '/' check also covers protocol-relative "//host/..." URLs.
        if (candidate.StartsWith('#') || candidate.StartsWith('/') || candidate.StartsWith('\\'))
            return false;

        // Anything carrying a "scheme://" is treated as an absolute URL.
        if (candidate.Contains("://", StringComparison.Ordinal))
            return false;

        // Schemes that do not use "//" after the colon.
        if (candidate.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("data:", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("tel:", StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        // HashSet.Add returns false for duplicates, which filters repeated targets.
        if (!seen.Add(candidate))
            return false;

        path = candidate;
        return true;
    }
}

0 commit comments

Comments
 (0)