Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 3 additions & 15 deletions RobotsTxt/Extensions.cs
Original file line number Diff line number Diff line change
@@ -1,21 +1,7 @@
namespace RobotsTxt;

public static class MyExtensions
internal static class MyExtensions
{
#if !NETCOREAPP
public static bool Contains(this ReadOnlySpan<byte> self, byte other)
{
foreach (var c in self)
{
if (c == other)
{
return true;
}
}
return false;
}
#endif

public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<byte> other)
{
if (self.Length != other.Length)
Expand All @@ -27,6 +13,7 @@ public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<b
{
var c1 = self[i];
var c2 = other[i];
if (c1 == c2) continue;
if ('A' <= c1 && c1 <= 'Z')
c1 += 32;
if ('A' <= c2 && c2 <= 'Z')
Expand All @@ -51,6 +38,7 @@ public static bool StartsWithIgnoreCase(this ReadOnlySpan<byte> span, ReadOnlySp
{
var c1 = span[i];
var c2 = value[i];
if (c1 == c2) continue;
if ('A' <= c1 && c1 <= 'Z')
c1 += 32;
if ('A' <= c2 && c2 <= 'Z')
Expand Down
27 changes: 9 additions & 18 deletions RobotsTxt/LongestMatchRobotsMatchStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -71,28 +71,19 @@ internal static bool MatchesSlow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pat
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int MatchAllowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
internal static int MatchFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool fastPath)
{
return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int MatchDisallowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
{
return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool MatchesFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
{
if (pattern.Length == 0) return true;
if (path.Length == 0) return pattern.Length == 0;

if (!haveWildcards)
if (pattern.Length == 0) return 0;
if (path.Length == 0) return -1;
if (fastPath)
{
return path.IndexOf(pattern) != -1;
return path.StartsWith(pattern) ? pattern.Length : -1;
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

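For simple patterns (no wildcards), the original logic checked if the pattern appeared anywhere in the path using IndexOf. The new logic only checks if the path starts with the pattern. This changes the behavior for non-wildcard patterns and may break existing callers that relied on a pattern like '/bar' matching a path like '/foo/bar'. Note, however, that the test case [InlineData("/foo/bar", "/bar", false, -1)] added in this PR expects exactly that input not to match, so the prefix-only behavior appears to be intentional; confirm the intended semantics before applying the suggestion below.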

Suggested change
return path.StartsWith(pattern) ? pattern.Length : -1;
return path.IndexOf(pattern) >= 0 ? pattern.Length : -1;

Copilot uses AI. Check for mistakes.
}
return Matches(path, pattern) ? pattern.Length : -1;
}

private static bool Matches(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
{
Span<int> pos = stackalloc int[path.Length + 1];
Comment on lines +85 to 87
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Matches method is now private but appears to be incomplete in the diff. The method allocates a stack buffer but the implementation details are cut off. Ensure that this method body is complete and correctly implements the wildcard matching logic that was previously in MatchesFast.

Copilot uses AI. Check for mistakes.
var numpos = 1;

Expand Down
164 changes: 86 additions & 78 deletions RobotsTxt/RobotsMachine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,16 @@ private class State;

private class UserAgentState : State;

private class AllowState(byte[] pattern, bool haveWildcards) : State
private class AllowState(ReadOnlyMemory<byte> pattern, bool fastPath) : State
{
public byte[] Pattern { get; } = pattern;
public bool HaveWildcards { get; } = haveWildcards;
public ReadOnlyMemory<byte> Pattern { get; } = pattern;
public bool FastPath { get; } = fastPath;
}

private class DisallowState(byte[] pattern, bool haveWildcards) : State
private class DisallowState(ReadOnlyMemory<byte> pattern, bool fastPath) : State
{
public byte[] Pattern { get; } = pattern;
public bool HaveWildcards { get; } = haveWildcards;
public ReadOnlyMemory<byte> Pattern { get; } = pattern;
public bool FastPath { get; } = fastPath;
}

private readonly List<byte[]> _userAgents;
Expand Down Expand Up @@ -90,7 +90,18 @@ public void HandleUserAgent(int lineNum, ReadOnlySpan<byte> userAgent)
userAgent = ExtractUserAgent(userAgent);
foreach (var ua in _userAgents)
{
if (!userAgent.EqualsIgnoreCase(ua)) continue;
if (userAgent.Length != ua.Length) continue;
bool match = true;
for (int i = 0; i < ua.Length; i++)
{
byte a = userAgent[i];
byte b = ua[i];
if (a == b || (a >= 'A' && a <= 'Z' && a + 32 == b) || (b >= 'A' && b <= 'Z' && b + 32 == a))
continue;
match = false;
break;
}
Comment on lines +93 to +103
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This manual case-insensitive comparison duplicates the logic already implemented in MyExtensions.EqualsIgnoreCase(). Consider using the existing extension method instead to reduce code duplication and improve maintainability.

Copilot uses AI. Check for mistakes.
if (!match) continue;
_specificStates.Add(new UserAgentState());
_everSeenSpecificAgent = _seenSpecificAgent = true;
return;
Expand All @@ -102,20 +113,49 @@ public void HandleAllow(int lineNum, ReadOnlySpan<byte> value)
if (!CurrentAgentIsSignificant)
return;
_seenSeparator = true;
var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$');
var state = new AllowState(value.ToArray(), haveWildcards);

var fastPath = !value.ContainsAny("*$"u8);
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable name 'fastPath' is ambiguous in this context. It's not immediately clear that this flag indicates whether the pattern lacks wildcards. Consider renaming to 'hasNoWildcards' or 'isSimplePattern' to better convey its meaning.

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

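The variable name 'fastPath' is ambiguous. Consider renaming it to 'isSimplePattern'; note that the AllowState and DisallowState constructors in this diff also name the corresponding parameter 'fastPath', so the rename would need to be applied there as well to keep the naming consistent.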

Copilot uses AI. Check for mistakes.

AllowState? rootState = null;
// Google-specific optimization: 'index.htm' and 'index.html' are normalized
// to '/'.
var slashPos = value.LastIndexOf((byte)'/');
if (slashPos != -1 && value[slashPos..].StartsWith(IndexHtmBytes))
{
var len = slashPos + 1;
var newValue = new byte[len + 1];
value[..len].CopyTo(newValue);
newValue[len] = (byte)'$';
rootState = new AllowState(newValue, false);
}

var state = new AllowState(value.ToArray(), fastPath);
if (_seenSpecificAgent)
{
_specificStates.Add(state);
if (rootState != null)
{
_specificStates.Add(rootState);
}
}
if (_seenGlobalAgent)
{
_globalStates.Add(state);
if (rootState != null)
{
_globalStates.Add(rootState);
}
}
}

public void HandleDisallow(int lineNum, ReadOnlySpan<byte> value)
{
if (!CurrentAgentIsSignificant)
return;
_seenSeparator = true;
var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$');
var state = new DisallowState(value.ToArray(), haveWildcards);

var fastPath = !value.ContainsAny("*$"u8);
var state = new DisallowState(value.ToArray(), fastPath);
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable name 'fastPath' is ambiguous in this context. It's not immediately clear that this flag indicates whether the pattern lacks wildcards. Consider renaming to 'hasNoWildcards' or 'isSimplePattern' to better convey its meaning.

Suggested change
var fastPath = !value.ContainsAny("*$"u8);
var state = new DisallowState(value.ToArray(), fastPath);
var hasNoWildcards = !value.ContainsAny("*$"u8);
var state = new DisallowState(value.ToArray(), hasNoWildcards);

Copilot uses AI. Check for mistakes.
if (_seenSpecificAgent)
_specificStates.Add(state);
if (_seenGlobalAgent)
Expand All @@ -132,51 +172,54 @@ public void HandleUnknownAction(int lineNum, ReadOnlySpan<byte> action, ReadOnly

public bool PathAllowedByRobots(byte[] path)
{
return !Disallow(path);
}

private bool Disallow(byte[] path)
{
if (!SeenAnyAgent)
return false;
return !Disallow();

var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates);
if (allowHierarchy > 0 || disallowHierarchy > 0)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
bool Disallow()
{
return disallowHierarchy > allowHierarchy;
}
if (!SeenAnyAgent)
return false;

if (_everSeenSpecificAgent)
{
// Matching group for user-agent but either without disallow or empty one,
// i.e. priority == 0.
return false;
}
var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates);
if (allowHierarchy > 0 || disallowHierarchy > 0)
{
return disallowHierarchy > allowHierarchy;
}

(allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates);
if (_everSeenSpecificAgent)
{
// Matching group for user-agent but either without disallow or empty one,
// i.e. priority == 0.
return false;
}

if (disallowHierarchy > 0 || allowHierarchy > 0)
{
return disallowHierarchy > allowHierarchy;
}
(allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates);

if (disallowHierarchy > 0 || allowHierarchy > 0)
{
return disallowHierarchy > allowHierarchy;
}

return false;
return false;
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (int, int) AssessAccessRules(byte[] path, List<State> states)
{
var allowHierarchy = NoMatchPriority; // Characters of 'url' matching Allow.
var disallowHierarchy = NoMatchPriority; // Characters of 'url' matching Disallow.

foreach (var state in states)
for (int i = 0; i < states.Count; i++)
{
var state = states[i];
switch (state)
{
case AllowState allow:
allowHierarchy = CheckAllow(path, allow.Pattern, allow.HaveWildcards, allowHierarchy);
allowHierarchy = Check(path, allow.Pattern.Span, allow.FastPath, allowHierarchy);
break;
case DisallowState disallow:
disallowHierarchy = CheckDisallow(path, disallow.Pattern, disallow.HaveWildcards, disallowHierarchy);
disallowHierarchy = Check(path, disallow.Pattern.Span, disallow.FastPath, disallowHierarchy);
break;
}
}
Expand All @@ -186,49 +229,14 @@ private static (int, int) AssessAccessRules(byte[] path, List<State> states)
private static readonly byte[] IndexHtmBytes = "/index.htm"u8.ToArray();

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CheckAllow(byte[] path, ReadOnlySpan<byte> pattern, bool haveWildcards, int allow)
{
while (true)
{
var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards);
if (priority >= 0)
{
if (allow < priority)
{
allow = priority;
}
}
else
{
// Google-specific optimization: 'index.htm' and 'index.html' are normalized
// to '/'.
var slashPos = pattern.LastIndexOf((byte)'/');

if (slashPos != -1 && pattern[slashPos..].StartsWith(IndexHtmBytes))
{
var len = slashPos + 1;
var newpattern = new byte[len + 1];
pattern[..len].CopyTo(newpattern);
newpattern[len] = (byte)'$';
pattern = newpattern;
haveWildcards = true;
continue;
}
}
break;
}
return allow;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int CheckDisallow(byte[] path, ReadOnlySpan<byte> value, bool haveWildcards, int disallow)
private static int Check(byte[] path, ReadOnlySpan<byte> pattern, bool fastPath, int currentPriority)
{
var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards);
if (priority < 0) return disallow;
if (disallow < priority)
var priority = LongestMatchRobotsMatchStrategy.MatchFast(path, pattern, fastPath);
if (priority < 0) return currentPriority;
if (currentPriority < priority)
{
disallow = priority;
currentPriority = priority;
}
return disallow;
return currentPriority;
}
}
1 change: 0 additions & 1 deletion RobotsTxt/RobotsTxt.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

<ItemGroup>
<PackageReference Include="Microsoft.Bcl.Memory" Version="9.0.9" />
<PackageReference Include="System.Memory" Version="4.6.3" />
</ItemGroup>

</Project>
40 changes: 20 additions & 20 deletions TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,34 @@ namespace TestRobotsTxt
public class TestsLongestMatchRobotsMatchStrategy
{
[Theory]
[InlineData("/", "/", true)]
[InlineData("/", "/$", true)]
[InlineData("a", "b", false)]
[InlineData("abcd", "a", true)]
[InlineData("abcd", "a$", false)]
[InlineData("abcd", "a*", true)]
[InlineData("abcd", "a*b", true)]
[InlineData("abcd", "a*c", true)]
[InlineData("abcd", "a*d", true)]
[InlineData("abcd", "a*d$", true)]
[InlineData("abcd", "a*c$", false)]
[InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true)]
public void TestMatch(string path, string pattern, bool expected)
[InlineData("/", "/", true, 1)]
[InlineData("/", "/$", true, 2)]
[InlineData("a", "b", false, -1)]
[InlineData("/foo/bar", "/bar", false, -1)]
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test case verifies that substring matching doesn't occur when it shouldn't (e.g., '/bar' should not match '/foo/bar'). The opposite scenario, where a pattern matches because it appears at the start of the path, is covered only by minimal cases such as ("/", "/") and ("abcd", "a"). Consider adding a path-like test case such as [InlineData("/bar/foo", "/bar", true, 4)] to ensure the fast path correctly matches path prefixes.

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Jan 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new test case validates that patterns must match from the start of the path. Consider adding a corresponding test in the 'Slow' path to ensure both implementations handle this scenario consistently.

Copilot uses AI. Check for mistakes.
[InlineData("abcd", "a", true, 1)]
[InlineData("abcd", "a$", false, -1)]
[InlineData("abcd", "a*", true, 2)]
[InlineData("abcd", "a*b", true, 3)]
[InlineData("abcd", "a*c", true, 3)]
[InlineData("abcd", "a*d", true, 3)]
[InlineData("abcd", "a*d$", true, 4)]
[InlineData("abcd", "a*c$", false, -1)]
[InlineData("/abcd/e//fg/hij/k/lm/nop/q/r/", "/*/*/*/*/*/*/*/*/*/*/*", true, 22)]
public void TestMatch(string path, string pattern, bool expected, int len)
{
var actual =
LongestMatchRobotsMatchStrategy.MatchesSlow(
Encoding.UTF8.GetBytes(path),
Encoding.UTF8.GetBytes(pattern)
);
Assert.Equal(expected, actual);
var haveWildcards = pattern.Length >= 1 && (pattern.Contains('*') || pattern[^1] == '$');
actual =
LongestMatchRobotsMatchStrategy.MatchesFast(
Encoding.UTF8.GetBytes(path),
Encoding.UTF8.GetBytes(pattern),
haveWildcards
var fastPath = !pattern.AsSpan().ContainsAny("*$");
var actualLen =
LongestMatchRobotsMatchStrategy.MatchFast(Encoding.UTF8.GetBytes(path),
Encoding.UTF8.GetBytes(pattern), fastPath
);
Assert.Equal(expected, actual);
Assert.Equal(len, actualLen);
Assert.Equal(expected, actualLen >= 0);
}
}
}
Loading