Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions RobotsTxt/Extensions.cs
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
namespace RobotsTxt;

public static class MyExtensions
internal static class MyExtensions
{
#if !NETCOREAPP
public static bool Contains(this ReadOnlySpan<byte> self, byte other)
{
foreach (var c in self)
{
if (c == other)
{
return true;
}
}
return false;
}
#endif

public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<byte> other)
{
if (self.Length != other.Length)
Expand All @@ -27,6 +13,7 @@ public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<b
{
var c1 = self[i];
var c2 = other[i];
if (c1 == c2) continue;
if ('A' <= c1 && c1 <= 'Z')
c1 += 32;
if ('A' <= c2 && c2 <= 'Z')
Expand All @@ -51,6 +38,7 @@ public static bool StartsWithIgnoreCase(this ReadOnlySpan<byte> span, ReadOnlySp
{
var c1 = span[i];
var c2 = value[i];
if (c1 == c2) continue;
if ('A' <= c1 && c1 <= 'Z')
c1 += 32;
if ('A' <= c2 && c2 <= 'Z')
Expand Down
27 changes: 9 additions & 18 deletions RobotsTxt/LongestMatchRobotsMatchStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,28 +71,19 @@ internal static bool MatchesSlow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pat
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int MatchAllowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
internal static int MatchFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool isSimplePattern)
{
return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int MatchDisallowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
{
return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool MatchesFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
{
if (pattern.Length == 0) return true;
if (path.Length == 0) return pattern.Length == 0;

if (!haveWildcards)
if (pattern.Length == 0) return 0;
if (path.Length == 0) return -1;
if (isSimplePattern)
{
return path.IndexOf(pattern) != -1;
return path.StartsWith(pattern) ? pattern.Length : -1;
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For simple patterns (no wildcards), the original logic checked if the pattern appeared anywhere in the path using IndexOf. The new logic only checks if the path starts with the pattern. This changes the behavior for non-wildcard patterns and may break existing functionality where patterns like '/bar' should match paths like '/foo/bar'.

Suggested change
return path.StartsWith(pattern) ? pattern.Length : -1;
return path.IndexOf(pattern) >= 0 ? pattern.Length : -1;

Copilot uses AI. Check for mistakes.
}
return Matches(path, pattern) ? pattern.Length : -1;
}

private static bool Matches(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
{
Span<int> pos = stackalloc int[path.Length + 1];
Comment on lines +85 to 87
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Matches method is now private but appears to be incomplete in the diff. The method allocates a stack buffer but the implementation details are cut off. Ensure that this method body is complete and correctly implements the wildcard matching logic that was previously in MatchesFast.

Copilot uses AI. Check for mistakes.
var numpos = 1;

Expand Down
164 changes: 86 additions & 78 deletions RobotsTxt/RobotsMachine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ private class State;

private class UserAgentState : State;

private class AllowState(byte[] pattern, bool haveWildcards) : State
private class AllowState(ReadOnlyMemory<byte> pattern, bool isSimplePattern) : State
{
public byte[] Pattern { get; } = pattern;
public bool HaveWildcards { get; } = haveWildcards;
public ReadOnlyMemory<byte> Pattern { get; } = pattern;
public bool IsSimplePattern { get; } = isSimplePattern;
}

private class DisallowState(byte[] pattern, bool haveWildcards) : State
private class DisallowState(ReadOnlyMemory<byte> pattern, bool isSimplePattern) : State
{
public byte[] Pattern { get; } = pattern;
public bool HaveWildcards { get; } = haveWildcards;
public ReadOnlyMemory<byte> Pattern { get; } = pattern;
public bool IsSimplePattern { get; } = isSimplePattern;
}

private readonly List<byte[]> _userAgents;
Expand Down Expand Up @@ -90,7 +90,18 @@ public void HandleUserAgent(int lineNum, ReadOnlySpan<byte> userAgent)
userAgent = ExtractUserAgent(userAgent);
foreach (var ua in _userAgents)
{
if (!userAgent.EqualsIgnoreCase(ua)) continue;
if (userAgent.Length != ua.Length) continue;
bool match = true;
for (int i = 0; i < ua.Length; i++)
{
byte a = userAgent[i];
byte b = ua[i];
if (a == b || (a >= 'A' && a <= 'Z' && a + 32 == b) || (b >= 'A' && b <= 'Z' && b + 32 == a))
continue;
match = false;
break;
}
Comment on lines +93 to +103
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This manual case-insensitive comparison duplicates the logic already implemented in MyExtensions.EqualsIgnoreCase(). Consider using the existing extension method instead to reduce code duplication and improve maintainability.

Copilot uses AI. Check for mistakes.
if (!match) continue;
_specificStates.Add(new UserAgentState());
_everSeenSpecificAgent = _seenSpecificAgent = true;
return;
Expand All @@ -102,20 +113,49 @@ public void HandleAllow(int lineNum, ReadOnlySpan<byte> value)
if (!CurrentAgentIsSignificant)
return;
_seenSeparator = true;
var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$');
var state = new AllowState(value.ToArray(), haveWildcards);

var isSimplePattern = !value.ContainsAny("*$"u8);

AllowState? rootState = null;
// Google-specific optimization: 'index.htm' and 'index.html' are normalized
// to '/'.
var slashPos = value.LastIndexOf((byte)'/');
if (slashPos != -1 && value[slashPos..].StartsWith(IndexHtmBytes))
{
var len = slashPos + 1;
var newValue = new byte[len + 1];
value[..len].CopyTo(newValue);
newValue[len] = (byte)'$';
rootState = new AllowState(newValue, false);
}

var state = new AllowState(value.ToArray(), isSimplePattern);
if (_seenSpecificAgent)
{
_specificStates.Add(state);
if (rootState != null)
{
_specificStates.Add(rootState);
}
}
if (_seenGlobalAgent)
{
_globalStates.Add(state);
if (rootState != null)
{
_globalStates.Add(rootState);
}
}
}

public void HandleDisallow(int lineNum, ReadOnlySpan<byte> value)
{
if (!CurrentAgentIsSignificant)
return;
_seenSeparator = true;
var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$');
var state = new DisallowState(value.ToArray(), haveWildcards);

var isSimplePattern = !value.ContainsAny("*$"u8);
var state = new DisallowState(value.ToArray(), isSimplePattern);
if (_seenSpecificAgent)
_specificStates.Add(state);
if (_seenGlobalAgent)
Expand All @@ -132,51 +172,54 @@ public void HandleUnknownAction(int lineNum, ReadOnlySpan<byte> action, ReadOnly

public bool PathAllowedByRobots(byte[] path)
{
return !Disallow(path);
}

private bool Disallow(byte[] path)
{
if (!SeenAnyAgent)
return false;
return !Disallow();

var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates);
if (allowHierarchy > 0 || disallowHierarchy > 0)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
bool Disallow()
{
return disallowHierarchy > allowHierarchy;
}
if (!SeenAnyAgent)
return false;

if (_everSeenSpecificAgent)
{
// Matching group for user-agent but either without disallow or empty one,
// i.e. priority == 0.
return false;
}
var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates);
if (allowHierarchy > 0 || disallowHierarchy > 0)
{
return disallowHierarchy > allowHierarchy;
}

(allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates);
if (_everSeenSpecificAgent)
{
// Matching group for user-agent but either without disallow or empty one,
// i.e. priority == 0.
return false;
}

if (disallowHierarchy > 0 || allowHierarchy > 0)
{
return disallowHierarchy > allowHierarchy;
}
(allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates);

if (disallowHierarchy > 0 || allowHierarchy > 0)
{
return disallowHierarchy > allowHierarchy;
}

return false;
return false;
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (int, int) AssessAccessRules(byte[] path, List<State> states)
{
var allowHierarchy = NoMatchPriority; // Characters of 'url' matching Allow.
var disallowHierarchy = NoMatchPriority; // Characters of 'url' matching Disallow.

foreach (var state in states)
for (int i = 0; i < states.Count; i++)
{
var state = states[i];
switch (state)
{
case AllowState allow:
allowHierarchy = CheckAllow(path, allow.Pattern, allow.HaveWildcards, allowHierarchy);
allowHierarchy = Check(path, allow.Pattern.Span, allow.IsSimplePattern, allowHierarchy);
break;
case DisallowState disallow:
disallowHierarchy = CheckDisallow(path, disallow.Pattern, disallow.HaveWildcards, disallowHierarchy);
disallowHierarchy = Check(path, disallow.Pattern.Span, disallow.IsSimplePattern, disallowHierarchy);
break;
}
}
Expand All @@ -186,49 +229,14 @@ private static (int, int) AssessAccessRules(byte[] path, List<State> states)
private static readonly byte[] IndexHtmBytes = "/index.htm"u8.ToArray();

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CheckAllow(byte[] path, ReadOnlySpan<byte> pattern, bool haveWildcards, int allow)
{
while (true)
{
var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards);
if (priority >= 0)
{
if (allow < priority)
{
allow = priority;
}
}
else
{
// Google-specific optimization: 'index.htm' and 'index.html' are normalized
// to '/'.
var slashPos = pattern.LastIndexOf((byte)'/');

if (slashPos != -1 && pattern[slashPos..].StartsWith(IndexHtmBytes))
{
var len = slashPos + 1;
var newpattern = new byte[len + 1];
pattern[..len].CopyTo(newpattern);
newpattern[len] = (byte)'$';
pattern = newpattern;
haveWildcards = true;
continue;
}
}
break;
}
return allow;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CheckDisallow(byte[] path, ReadOnlySpan<byte> value, bool haveWildcards, int disallow)
private static int Check(byte[] path, ReadOnlySpan<byte> pattern, bool isSimplePattern, int currentPriority)
{
var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards);
if (priority < 0) return disallow;
if (disallow < priority)
var priority = LongestMatchRobotsMatchStrategy.MatchFast(path, pattern, isSimplePattern);
if (priority < 0) return currentPriority;
if (currentPriority < priority)
{
disallow = priority;
currentPriority = priority;
}
return disallow;
return currentPriority;
}
}
1 change: 0 additions & 1 deletion RobotsTxt/RobotsTxt.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

<ItemGroup>
<PackageReference Include="Microsoft.Bcl.Memory" Version="9.0.9" />
<PackageReference Include="System.Memory" Version="4.6.3" />
</ItemGroup>

</Project>
40 changes: 20 additions & 20 deletions TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,34 @@ namespace TestRobotsTxt
public class TestsLongestMatchRobotsMatchStrategy
{
[Theory]
[InlineData("/", "/", true)]
[InlineData("/", "/$", true)]
[InlineData("a", "b", false)]
[InlineData("abcd", "a", true)]
[InlineData("abcd", "a$", false)]
[InlineData("abcd", "a*", true)]
[InlineData("abcd", "a*b", true)]
[InlineData("abcd", "a*c", true)]
[InlineData("abcd", "a*d", true)]
[InlineData("abcd", "a*d$", true)]
[InlineData("abcd", "a*c$", false)]
[InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true)]
public void TestMatch(string path, string pattern, bool expected)
[InlineData("/", "/", true, 1)]
[InlineData("/", "/$", true, 2)]
[InlineData("a", "b", false, -1)]
[InlineData("/foo/bar", "/bar", false, -1)]
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test case verifies that substring matching doesn't occur when it shouldn't (e.g., '/bar' should not match '/foo/bar'). However, there's no corresponding test for the opposite scenario where a pattern should match when it appears at the start of the path. Consider adding a test case like [InlineData("/bar/foo", "/bar", true, 4)] to ensure the fast path correctly matches path prefixes.

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new test case validates that patterns must match from the start of the path. Consider adding a corresponding test in the 'Slow' path to ensure both implementations handle this scenario consistently.

Copilot uses AI. Check for mistakes.
[InlineData("abcd", "a", true, 1)]
[InlineData("abcd", "a$", false, -1)]
[InlineData("abcd", "a*", true, 2)]
[InlineData("abcd", "a*b", true, 3)]
[InlineData("abcd", "a*c", true, 3)]
[InlineData("abcd", "a*d", true, 3)]
[InlineData("abcd", "a*d$", true, 4)]
[InlineData("abcd", "a*c$", false, -1)]
[InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true, 22)]
public void TestMatch(string path, string pattern, bool expected, int len)
{
var actual =
LongestMatchRobotsMatchStrategy.MatchesSlow(
Encoding.UTF8.GetBytes(path),
Encoding.UTF8.GetBytes(pattern)
);
Assert.Equal(expected, actual);
var haveWildcards = pattern.Length >= 1 && (pattern.Contains('*') || pattern[^1] == '$');
actual =
LongestMatchRobotsMatchStrategy.MatchesFast(
Encoding.UTF8.GetBytes(path),
Encoding.UTF8.GetBytes(pattern),
haveWildcards
var isSimplePattern = !pattern.AsSpan().ContainsAny("*$");
var actualLen =
LongestMatchRobotsMatchStrategy.MatchFast(Encoding.UTF8.GetBytes(path),
Encoding.UTF8.GetBytes(pattern), isSimplePattern
);
Assert.Equal(expected, actual);
Assert.Equal(len, actualLen);
Assert.Equal(expected, actualLen >= 0);
}
}
}
Loading