diff --git a/RobotsTxt/Extensions.cs b/RobotsTxt/Extensions.cs index b874a79..1edfdec 100644 --- a/RobotsTxt/Extensions.cs +++ b/RobotsTxt/Extensions.cs @@ -1,21 +1,7 @@ namespace RobotsTxt; -public static class MyExtensions +internal static class MyExtensions { -#if !NETCOREAPP - public static bool Contains(this ReadOnlySpan self, byte other) - { - foreach (var c in self) - { - if (c == other) - { - return true; - } - } - return false; - } -#endif - public static bool EqualsIgnoreCase(this ReadOnlySpan self, ReadOnlySpan other) { if (self.Length != other.Length) @@ -27,6 +13,7 @@ public static bool EqualsIgnoreCase(this ReadOnlySpan self, ReadOnlySpan span, ReadOnlySp { var c1 = span[i]; var c2 = value[i]; + if (c1 == c2) continue; if ('A' <= c1 && c1 <= 'Z') c1 += 32; if ('A' <= c2 && c2 <= 'Z') diff --git a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs index b1942b6..3cc1770 100644 --- a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs +++ b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs @@ -71,28 +71,19 @@ internal static bool MatchesSlow(ReadOnlySpan path, ReadOnlySpan pat } [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int MatchAllowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + internal static int MatchFast(ReadOnlySpan path, ReadOnlySpan pattern, bool isSimplePattern) { - return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int MatchDisallowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) - { - return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static bool MatchesFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) - { - if (pattern.Length == 0) return true; - if (path.Length == 0) return pattern.Length == 0; - - if (!haveWildcards) + if (pattern.Length == 0) return 0; + if (path.Length == 0) return -1; + if (isSimplePattern) { - return path.IndexOf(pattern) != -1; + return path.StartsWith(pattern) ? pattern.Length : -1; } + return Matches(path, pattern) ? pattern.Length : -1; + } + private static bool Matches(ReadOnlySpan path, ReadOnlySpan pattern) + { Span pos = stackalloc int[path.Length + 1]; var numpos = 1; diff --git a/RobotsTxt/RobotsMachine.cs b/RobotsTxt/RobotsMachine.cs index a23be98..a733b68 100644 --- a/RobotsTxt/RobotsMachine.cs +++ b/RobotsTxt/RobotsMachine.cs @@ -10,16 +10,16 @@ private class State; private class UserAgentState : State; - private class AllowState(byte[] pattern, bool haveWildcards) : State + private class AllowState(ReadOnlyMemory pattern, bool isSimplePattern) : State { - public byte[] Pattern { get; } = pattern; - public bool HaveWildcards { get; } = haveWildcards; + public ReadOnlyMemory Pattern { get; } = pattern; + public bool IsSimplePattern { get; } = isSimplePattern; } - private class DisallowState(byte[] pattern, bool haveWildcards) : State + private class DisallowState(ReadOnlyMemory pattern, bool isSimplePattern) : State { - public byte[] Pattern { get; } = pattern; - public bool HaveWildcards { get; } = haveWildcards; + public ReadOnlyMemory Pattern { get; } = pattern; + public bool IsSimplePattern { get; } = isSimplePattern; } private readonly List _userAgents; @@ -90,7 +90,18 @@ public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) userAgent = ExtractUserAgent(userAgent); foreach (var ua in _userAgents) { - if (!userAgent.EqualsIgnoreCase(ua)) continue; + if (userAgent.Length != ua.Length) continue; + bool match = true; + for (int i = 0; i < ua.Length; i++) + { + byte a = userAgent[i]; + byte b = ua[i]; + if (a == b || (a >= 'A' && a <= 'Z' && a + 32 == b) || (b >= 'A' && b <= 'Z' && b + 32 == a)) + continue; + match = false; + break; + } + if (!match) continue; _specificStates.Add(new UserAgentState()); _everSeenSpecificAgent = _seenSpecificAgent = true; return; @@ -102,20 +113,49 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) if (!CurrentAgentIsSignificant) return; _seenSeparator = true; - var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); - var state = new AllowState(value.ToArray(), haveWildcards); + + var isSimplePattern = !value.ContainsAny("*$"u8); + + AllowState? rootState = null; + // Google-specific optimization: 'index.htm' and 'index.html' are normalized + // to '/'. + var slashPos = value.LastIndexOf((byte)'/'); + if (slashPos != -1 && value[slashPos..].StartsWith(IndexHtmBytes)) + { + var len = slashPos + 1; + var newValue = new byte[len + 1]; + value[..len].CopyTo(newValue); + newValue[len] = (byte)'$'; + rootState = new AllowState(newValue, false); + } + + var state = new AllowState(value.ToArray(), isSimplePattern); if (_seenSpecificAgent) + { _specificStates.Add(state); + if (rootState != null) + { + _specificStates.Add(rootState); + } + } if (_seenGlobalAgent) + { _globalStates.Add(state); + if (rootState != null) + { + _globalStates.Add(rootState); + } + } } + public void HandleDisallow(int lineNum, ReadOnlySpan value) { if (!CurrentAgentIsSignificant) return; _seenSeparator = true; - var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); - var state = new DisallowState(value.ToArray(), haveWildcards); + + var isSimplePattern = !value.ContainsAny("*$"u8); + var state = new DisallowState(value.ToArray(), isSimplePattern); if (_seenSpecificAgent) _specificStates.Add(state); if (_seenGlobalAgent) @@ -132,51 +172,54 @@ public void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnly public bool PathAllowedByRobots(byte[] path) { - return !Disallow(path); - } - - private bool Disallow(byte[] path) - { - if (!SeenAnyAgent) - return false; + return !Disallow(); - var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); - if (allowHierarchy > 0 || disallowHierarchy > 0) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + bool Disallow() { - return disallowHierarchy > allowHierarchy; - } + if (!SeenAnyAgent) + return false; - if (_everSeenSpecificAgent) - { - // Matching group for user-agent but either without disallow or empty one, - // i.e. priority == 0. - return false; - } + var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); + if (allowHierarchy > 0 || disallowHierarchy > 0) + { + return disallowHierarchy > allowHierarchy; + } - (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates); + if (_everSeenSpecificAgent) + { + // Matching group for user-agent but either without disallow or empty one, + // i.e. priority == 0. + return false; + } - if (disallowHierarchy > 0 || allowHierarchy > 0) - { - return disallowHierarchy > allowHierarchy; - } + (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates); + + if (disallowHierarchy > 0 || allowHierarchy > 0) + { + return disallowHierarchy > allowHierarchy; + } - return false; + return false; + } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static (int, int) AssessAccessRules(byte[] path, List states) { var allowHierarchy = NoMatchPriority; // Characters of 'url' matching Allow. var disallowHierarchy = NoMatchPriority; // Characters of 'url' matching Disallow. - foreach (var state in states) + for (int i = 0; i < states.Count; i++) { + var state = states[i]; switch (state) { case AllowState allow: - allowHierarchy = CheckAllow(path, allow.Pattern, allow.HaveWildcards, allowHierarchy); + allowHierarchy = Check(path, allow.Pattern.Span, allow.IsSimplePattern, allowHierarchy); break; case DisallowState disallow: - disallowHierarchy = CheckDisallow(path, disallow.Pattern, disallow.HaveWildcards, disallowHierarchy); + disallowHierarchy = Check(path, disallow.Pattern.Span, disallow.IsSimplePattern, disallowHierarchy); break; } } @@ -186,49 +229,14 @@ private static (int, int) AssessAccessRules(byte[] path, List states) private static readonly byte[] IndexHtmBytes = "/index.htm"u8.ToArray(); [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int CheckAllow(byte[] path, ReadOnlySpan pattern, bool haveWildcards, int allow) - { - while (true) - { - var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards); - if (priority >= 0) - { - if (allow < priority) - { - allow = priority; - } - } - else - { - // Google-specific optimization: 'index.htm' and 'index.html' are normalized - // to '/'. - var slashPos = pattern.LastIndexOf((byte)'/'); - - if (slashPos != -1 && pattern[slashPos..].StartsWith(IndexHtmBytes)) - { - var len = slashPos + 1; - var newpattern = new byte[len + 1]; - pattern[..len].CopyTo(newpattern); - newpattern[len] = (byte)'$'; - pattern = newpattern; - haveWildcards = true; - continue; - } - } - break; - } - return allow; - } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int CheckDisallow(byte[] path, ReadOnlySpan value, bool haveWildcards, int disallow) + private static int Check(byte[] path, ReadOnlySpan pattern, bool isSimplePattern, int currentPriority) { - var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards); - if (priority < 0) return disallow; - if (disallow < priority) + var priority = LongestMatchRobotsMatchStrategy.MatchFast(path, pattern, isSimplePattern); + if (priority < 0) return currentPriority; + if (currentPriority < priority) { - disallow = priority; + currentPriority = priority; } - return disallow; + return currentPriority; } } diff --git a/RobotsTxt/RobotsTxt.csproj b/RobotsTxt/RobotsTxt.csproj index f8f3d1d..2d589f8 100644 --- a/RobotsTxt/RobotsTxt.csproj +++ b/RobotsTxt/RobotsTxt.csproj @@ -15,7 +15,6 @@ - diff --git a/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs b/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs index 7c40643..74439ee 100644 --- a/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs +++ b/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs @@ -9,19 +9,20 @@ namespace TestRobotsTxt public class TestsLongestMatchRobotsMatchStrategy { [Theory] - [InlineData("/", "/", true)] - [InlineData("/", "/$", true)] - [InlineData("a", "b", false)] - [InlineData("abcd", "a", true)] - [InlineData("abcd", "a$", false)] - [InlineData("abcd", "a*", true)] - [InlineData("abcd", "a*b", true)] - [InlineData("abcd", "a*c", true)] - [InlineData("abcd", "a*d", true)] - [InlineData("abcd", "a*d$", true)] - [InlineData("abcd", "a*c$", false)] - [InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true)] - public void TestMatch(string path, string pattern, bool expected) + [InlineData("/", "/", true, 1)] + [InlineData("/", "/$", true, 2)] + [InlineData("a", "b", false, -1)] + [InlineData("/foo/bar", "/bar", false, -1)] + [InlineData("abcd", "a", true, 1)] + [InlineData("abcd", "a$", false, -1)] + [InlineData("abcd", "a*", true, 2)] + [InlineData("abcd", "a*b", true, 3)] + [InlineData("abcd", "a*c", true, 3)] + [InlineData("abcd", "a*d", true, 3)] + [InlineData("abcd", "a*d$", true, 4)] + [InlineData("abcd", "a*c$", false, -1)] + [InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true, 22)] + public void TestMatch(string path, string pattern, bool expected, int len) { var actual = LongestMatchRobotsMatchStrategy.MatchesSlow( @@ -29,14 +30,13 @@ public void TestMatch(string path, string pattern, bool expected) Encoding.UTF8.GetBytes(pattern) ); Assert.Equal(expected, actual); - var haveWildcards = pattern.Length >= 1 && (pattern.Contains('*') || pattern[^1] == '$'); - actual = - LongestMatchRobotsMatchStrategy.MatchesFast( - Encoding.UTF8.GetBytes(path), - Encoding.UTF8.GetBytes(pattern), - haveWildcards + var isSimplePattern = !pattern.AsSpan().ContainsAny("*$"); + var actualLen = + LongestMatchRobotsMatchStrategy.MatchFast(Encoding.UTF8.GetBytes(path), + Encoding.UTF8.GetBytes(pattern), isSimplePattern ); - Assert.Equal(expected, actual); + Assert.Equal(len, actualLen); + Assert.Equal(expected, actualLen >= 0); } } }