11using System . Runtime . CompilerServices ;
22using System . Text . RegularExpressions ;
3- using OwlCore . Diagnostics ;
43using OwlCore . Storage ;
54
65namespace WindowsAppCommunity . Blog . Assets ;
76
/// <summary>
/// Detects relative asset links in markdown and HTML text.
/// </summary>
/// <remarks>
/// Detection is purely regex-based: markdown link/image destinations and HTML
/// <c>href</c>/<c>src</c> attribute values are extracted, normalized, filtered down to
/// relative references, and de-duplicated case-insensitively.
/// </remarks>
public sealed partial class RegexAssetLinkDetector : IAssetLinkDetector
{
    /// <summary>
    /// Regex pattern for markdown links and images. The destination is captured in the
    /// <c>path</c> group; an optional title following whitespace is matched but not captured.
    /// </summary>
    [GeneratedRegex("""!?\[[^\]]*\]\((?<path>[^)\s]+)(?:\s+[^)]*)?\)""", RegexOptions.Compiled)]
    private static partial Regex MarkdownLinkPattern();

    /// <summary>
    /// Regex pattern for HTML href/src attributes. The quoted attribute value is captured
    /// in the <c>path</c> group.
    /// </summary>
    [GeneratedRegex("""(?:href|src)\s*=\s*["'](?<path>[^"']+)["']""", RegexOptions.IgnoreCase | RegexOptions.Compiled)]
    private static partial Regex HtmlAttributePattern();

    /// <inheritdoc/>
    public async IAsyncEnumerable<string> DetectAsync(IFile source, [EnumeratorCancellation] CancellationToken ct = default)
    {
        var text = await source.ReadTextAsync(ct);

        // Shared across both patterns so a path referenced from markdown and HTML is yielded once.
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

        // Markdown matches are reported first, then HTML attribute matches.
        Regex[] patterns = { MarkdownLinkPattern(), HtmlAttributePattern() };

        foreach (var pattern in patterns)
        {
            foreach (Match match in pattern.Matches(text))
            {
                if (ct.IsCancellationRequested)
                {
                    yield break;
                }

                // Yield the normalized path (trimmed, angle brackets stripped), i.e. the same
                // value that was used for filtering and de-duplication, not the raw capture.
                if (TryAcceptPath(match.Groups["path"].Value, seen, out var path))
                {
                    yield return path;
                }
            }
        }
    }

    /// <summary>
    /// Normalizes a captured link target and decides whether it should be reported.
    /// </summary>
    /// <param name="rawPath">The text captured by the <c>path</c> group of one of the patterns.</param>
    /// <param name="seen">Normalized paths that were already reported; updated when a path is accepted.</param>
    /// <param name="path">
    /// The normalized path (surrounding whitespace and angle brackets removed) when the method
    /// returns <see langword="true"/>; otherwise an empty string.
    /// </param>
    /// <returns>
    /// <see langword="true"/> when the normalized path is a relative reference that has not been
    /// reported before; otherwise <see langword="false"/>.
    /// </returns>
    private static bool TryAcceptPath(string rawPath, HashSet<string> seen, out string path)
    {
        path = string.Empty;

        if (string.IsNullOrWhiteSpace(rawPath))
        {
            return false;
        }

        var candidate = rawPath.Trim().Trim('<', '>');

        if (string.IsNullOrWhiteSpace(candidate))
        {
            return false;
        }

        // Fragment-only links and rooted paths are not relative assets. The '/' check also
        // covers protocol-relative URLs that begin with two slashes.
        if (candidate.StartsWith('#') || candidate.StartsWith('/') || candidate.StartsWith('\\'))
        {
            return false;
        }

        // Anything carrying an explicit scheme separator is an absolute URL.
        if (candidate.Contains("://", StringComparison.Ordinal))
        {
            return false;
        }

        // Schemes that do not use the "://" separator.
        if (candidate.StartsWith("mailto:", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("data:", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("javascript:", StringComparison.OrdinalIgnoreCase) ||
            candidate.StartsWith("tel:", StringComparison.OrdinalIgnoreCase))
        {
            return false;
        }

        // HashSet.Add returns false for a path that was already reported.
        if (!seen.Add(candidate))
        {
            return false;
        }

        path = candidate;
        return true;
    }
}
0 commit comments