-
Notifications
You must be signed in to change notification settings - Fork 0
Refactor and optimize a bit robots parsing module #4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9990025
42b19e8
6a4272a
4c1584d
b6af189
75ad0e4
ab65c88
813ddf4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,28 +71,19 @@ internal static bool MatchesSlow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pat | |
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| internal static int MatchAllowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards) | ||
| internal static int MatchFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool isSimplePattern) | ||
| { | ||
| return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| internal static int MatchDisallowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards) | ||
| { | ||
| return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| internal static bool MatchesFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards) | ||
| { | ||
| if (pattern.Length == 0) return true; | ||
| if (path.Length == 0) return pattern.Length == 0; | ||
|
|
||
| if (!haveWildcards) | ||
| if (pattern.Length == 0) return 0; | ||
| if (path.Length == 0) return -1; | ||
| if (isSimplePattern) | ||
| { | ||
| return path.IndexOf(pattern) != -1; | ||
| return path.StartsWith(pattern) ? pattern.Length : -1; | ||
| } | ||
| return Matches(path, pattern) ? pattern.Length : -1; | ||
ybastide marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| private static bool Matches(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern) | ||
| { | ||
| Span<int> pos = stackalloc int[path.Length + 1]; | ||
|
Comment on lines
+85
to
87
|
||
| var numpos = 1; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,16 +10,16 @@ private class State; | |
|
|
||
| private class UserAgentState : State; | ||
|
|
||
| private class AllowState(byte[] pattern, bool haveWildcards) : State | ||
| private class AllowState(ReadOnlyMemory<byte> pattern, bool isSimplePattern) : State | ||
| { | ||
| public byte[] Pattern { get; } = pattern; | ||
| public bool HaveWildcards { get; } = haveWildcards; | ||
| public ReadOnlyMemory<byte> Pattern { get; } = pattern; | ||
| public bool IsSimplePattern { get; } = isSimplePattern; | ||
| } | ||
|
|
||
| private class DisallowState(byte[] pattern, bool haveWildcards) : State | ||
| private class DisallowState(ReadOnlyMemory<byte> pattern, bool isSimplePattern) : State | ||
| { | ||
| public byte[] Pattern { get; } = pattern; | ||
| public bool HaveWildcards { get; } = haveWildcards; | ||
| public ReadOnlyMemory<byte> Pattern { get; } = pattern; | ||
| public bool IsSimplePattern { get; } = isSimplePattern; | ||
| } | ||
|
|
||
| private readonly List<byte[]> _userAgents; | ||
|
|
@@ -90,7 +90,18 @@ public void HandleUserAgent(int lineNum, ReadOnlySpan<byte> userAgent) | |
| userAgent = ExtractUserAgent(userAgent); | ||
| foreach (var ua in _userAgents) | ||
| { | ||
| if (!userAgent.EqualsIgnoreCase(ua)) continue; | ||
| if (userAgent.Length != ua.Length) continue; | ||
| bool match = true; | ||
| for (int i = 0; i < ua.Length; i++) | ||
| { | ||
| byte a = userAgent[i]; | ||
| byte b = ua[i]; | ||
| if (a == b || (a >= 'A' && a <= 'Z' && a + 32 == b) || (b >= 'A' && b <= 'Z' && b + 32 == a)) | ||
| continue; | ||
| match = false; | ||
| break; | ||
| } | ||
|
Comment on lines
+93
to
+103
|
||
| if (!match) continue; | ||
| _specificStates.Add(new UserAgentState()); | ||
| _everSeenSpecificAgent = _seenSpecificAgent = true; | ||
| return; | ||
|
|
@@ -102,20 +113,49 @@ public void HandleAllow(int lineNum, ReadOnlySpan<byte> value) | |
| if (!CurrentAgentIsSignificant) | ||
| return; | ||
| _seenSeparator = true; | ||
| var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); | ||
| var state = new AllowState(value.ToArray(), haveWildcards); | ||
|
|
||
| var isSimplePattern = !value.ContainsAny("*$"u8); | ||
|
|
||
| AllowState? rootState = null; | ||
| // Google-specific optimization: 'index.htm' and 'index.html' are normalized | ||
| // to '/'. | ||
| var slashPos = value.LastIndexOf((byte)'/'); | ||
| if (slashPos != -1 && value[slashPos..].StartsWith(IndexHtmBytes)) | ||
| { | ||
| var len = slashPos + 1; | ||
| var newValue = new byte[len + 1]; | ||
| value[..len].CopyTo(newValue); | ||
| newValue[len] = (byte)'$'; | ||
| rootState = new AllowState(newValue, false); | ||
| } | ||
|
|
||
| var state = new AllowState(value.ToArray(), isSimplePattern); | ||
| if (_seenSpecificAgent) | ||
| { | ||
| _specificStates.Add(state); | ||
| if (rootState != null) | ||
| { | ||
| _specificStates.Add(rootState); | ||
| } | ||
| } | ||
| if (_seenGlobalAgent) | ||
| { | ||
| _globalStates.Add(state); | ||
| if (rootState != null) | ||
| { | ||
| _globalStates.Add(rootState); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| public void HandleDisallow(int lineNum, ReadOnlySpan<byte> value) | ||
| { | ||
| if (!CurrentAgentIsSignificant) | ||
| return; | ||
| _seenSeparator = true; | ||
| var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); | ||
| var state = new DisallowState(value.ToArray(), haveWildcards); | ||
|
|
||
| var isSimplePattern = !value.ContainsAny("*$"u8); | ||
| var state = new DisallowState(value.ToArray(), isSimplePattern); | ||
| if (_seenSpecificAgent) | ||
| _specificStates.Add(state); | ||
| if (_seenGlobalAgent) | ||
|
|
@@ -132,51 +172,54 @@ public void HandleUnknownAction(int lineNum, ReadOnlySpan<byte> action, ReadOnly | |
|
|
||
| public bool PathAllowedByRobots(byte[] path) | ||
| { | ||
| return !Disallow(path); | ||
| } | ||
|
|
||
| private bool Disallow(byte[] path) | ||
| { | ||
| if (!SeenAnyAgent) | ||
| return false; | ||
| return !Disallow(); | ||
|
|
||
| var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); | ||
| if (allowHierarchy > 0 || disallowHierarchy > 0) | ||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| bool Disallow() | ||
| { | ||
| return disallowHierarchy > allowHierarchy; | ||
| } | ||
| if (!SeenAnyAgent) | ||
| return false; | ||
|
|
||
| if (_everSeenSpecificAgent) | ||
| { | ||
| // Matching group for user-agent but either without disallow or empty one, | ||
| // i.e. priority == 0. | ||
| return false; | ||
| } | ||
| var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); | ||
| if (allowHierarchy > 0 || disallowHierarchy > 0) | ||
| { | ||
| return disallowHierarchy > allowHierarchy; | ||
| } | ||
|
|
||
| (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates); | ||
| if (_everSeenSpecificAgent) | ||
| { | ||
| // Matching group for user-agent but either without disallow or empty one, | ||
| // i.e. priority == 0. | ||
| return false; | ||
| } | ||
|
|
||
| if (disallowHierarchy > 0 || allowHierarchy > 0) | ||
| { | ||
| return disallowHierarchy > allowHierarchy; | ||
| } | ||
| (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates); | ||
|
|
||
| if (disallowHierarchy > 0 || allowHierarchy > 0) | ||
| { | ||
| return disallowHierarchy > allowHierarchy; | ||
| } | ||
|
|
||
| return false; | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| private static (int, int) AssessAccessRules(byte[] path, List<State> states) | ||
| { | ||
| var allowHierarchy = NoMatchPriority; // Characters of 'url' matching Allow. | ||
| var disallowHierarchy = NoMatchPriority; // Characters of 'url' matching Disallow. | ||
|
|
||
| foreach (var state in states) | ||
| for (int i = 0; i < states.Count; i++) | ||
| { | ||
| var state = states[i]; | ||
| switch (state) | ||
| { | ||
| case AllowState allow: | ||
| allowHierarchy = CheckAllow(path, allow.Pattern, allow.HaveWildcards, allowHierarchy); | ||
| allowHierarchy = Check(path, allow.Pattern.Span, allow.IsSimplePattern, allowHierarchy); | ||
| break; | ||
| case DisallowState disallow: | ||
| disallowHierarchy = CheckDisallow(path, disallow.Pattern, disallow.HaveWildcards, disallowHierarchy); | ||
| disallowHierarchy = Check(path, disallow.Pattern.Span, disallow.IsSimplePattern, disallowHierarchy); | ||
| break; | ||
| } | ||
| } | ||
|
|
@@ -186,49 +229,14 @@ private static (int, int) AssessAccessRules(byte[] path, List<State> states) | |
| private static readonly byte[] IndexHtmBytes = "/index.htm"u8.ToArray(); | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| private static int CheckAllow(byte[] path, ReadOnlySpan<byte> pattern, bool haveWildcards, int allow) | ||
| { | ||
| while (true) | ||
| { | ||
| var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards); | ||
| if (priority >= 0) | ||
| { | ||
| if (allow < priority) | ||
| { | ||
| allow = priority; | ||
| } | ||
| } | ||
| else | ||
| { | ||
| // Google-specific optimization: 'index.htm' and 'index.html' are normalized | ||
| // to '/'. | ||
| var slashPos = pattern.LastIndexOf((byte)'/'); | ||
|
|
||
| if (slashPos != -1 && pattern[slashPos..].StartsWith(IndexHtmBytes)) | ||
| { | ||
| var len = slashPos + 1; | ||
| var newpattern = new byte[len + 1]; | ||
| pattern[..len].CopyTo(newpattern); | ||
| newpattern[len] = (byte)'$'; | ||
| pattern = newpattern; | ||
| haveWildcards = true; | ||
| continue; | ||
| } | ||
| } | ||
| break; | ||
| } | ||
| return allow; | ||
| } | ||
|
|
||
| [MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
| private static int CheckDisallow(byte[] path, ReadOnlySpan<byte> value, bool haveWildcards, int disallow) | ||
| private static int Check(byte[] path, ReadOnlySpan<byte> pattern, bool isSimplePattern, int currentPriority) | ||
| { | ||
| var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards); | ||
| if (priority < 0) return disallow; | ||
| if (disallow < priority) | ||
| var priority = LongestMatchRobotsMatchStrategy.MatchFast(path, pattern, isSimplePattern); | ||
| if (priority < 0) return currentPriority; | ||
| if (currentPriority < priority) | ||
| { | ||
| disallow = priority; | ||
| currentPriority = priority; | ||
| } | ||
| return disallow; | ||
| return currentPriority; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,34 +9,34 @@ namespace TestRobotsTxt | |
| public class TestsLongestMatchRobotsMatchStrategy | ||
| { | ||
| [Theory] | ||
| [InlineData("/", "/", true)] | ||
| [InlineData("/", "/$", true)] | ||
| [InlineData("a", "b", false)] | ||
| [InlineData("abcd", "a", true)] | ||
| [InlineData("abcd", "a$", false)] | ||
| [InlineData("abcd", "a*", true)] | ||
| [InlineData("abcd", "a*b", true)] | ||
| [InlineData("abcd", "a*c", true)] | ||
| [InlineData("abcd", "a*d", true)] | ||
| [InlineData("abcd", "a*d$", true)] | ||
| [InlineData("abcd", "a*c$", false)] | ||
| [InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true)] | ||
| public void TestMatch(string path, string pattern, bool expected) | ||
| [InlineData("/", "/", true, 1)] | ||
| [InlineData("/", "/$", true, 2)] | ||
| [InlineData("a", "b", false, -1)] | ||
| [InlineData("/foo/bar", "/bar", false, -1)] | ||
|
||
| [InlineData("abcd", "a", true, 1)] | ||
| [InlineData("abcd", "a$", false, -1)] | ||
| [InlineData("abcd", "a*", true, 2)] | ||
| [InlineData("abcd", "a*b", true, 3)] | ||
| [InlineData("abcd", "a*c", true, 3)] | ||
| [InlineData("abcd", "a*d", true, 3)] | ||
| [InlineData("abcd", "a*d$", true, 4)] | ||
| [InlineData("abcd", "a*c$", false, -1)] | ||
| [InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true, 22)] | ||
| public void TestMatch(string path, string pattern, bool expected, int len) | ||
| { | ||
| var actual = | ||
| LongestMatchRobotsMatchStrategy.MatchesSlow( | ||
| Encoding.UTF8.GetBytes(path), | ||
| Encoding.UTF8.GetBytes(pattern) | ||
| ); | ||
| Assert.Equal(expected, actual); | ||
| var haveWildcards = pattern.Length >= 1 && (pattern.Contains('*') || pattern[^1] == '$'); | ||
| actual = | ||
| LongestMatchRobotsMatchStrategy.MatchesFast( | ||
| Encoding.UTF8.GetBytes(path), | ||
| Encoding.UTF8.GetBytes(pattern), | ||
| haveWildcards | ||
| var isSimplePattern = !pattern.AsSpan().ContainsAny("*$"); | ||
| var actualLen = | ||
| LongestMatchRobotsMatchStrategy.MatchFast(Encoding.UTF8.GetBytes(path), | ||
| Encoding.UTF8.GetBytes(pattern), isSimplePattern | ||
| ); | ||
| Assert.Equal(expected, actual); | ||
| Assert.Equal(len, actualLen); | ||
| Assert.Equal(expected, actualLen >= 0); | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For simple patterns (no wildcards), the original logic checked if the pattern appeared anywhere in the path using IndexOf. The new logic only checks if the path starts with the pattern. This changes the behavior for non-wildcard patterns and may break existing functionality where patterns like '/bar' should match paths like '/foo/bar'.