Skip to content

Commit 15e935b

Browse files
Fix identifier parsing and improve FROM clause parsing (#3368)
Co-authored-by: Martin Costello <[email protected]>
1 parent 6e15369 commit 15e935b

File tree

4 files changed

+100
-9
lines changed

4 files changed

+100
-9
lines changed

src/OpenTelemetry.Instrumentation.EntityFrameworkCore/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## Unreleased
44

5+
* Fix diacritic identifier parsing hang and improve FROM clause parsing.
6+
([#3368](https://github.com/open-telemetry/opentelemetry-dotnet-contrib/pull/3368))
7+
58
## 1.13.0-beta.1
69

710
Released 2025-Oct-22

src/OpenTelemetry.Instrumentation.SqlClient/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## Unreleased
44

5+
* Fix diacritic identifier parsing hang and improve FROM clause parsing.
6+
([#3368](https://github.com/open-telemetry/opentelemetry-dotnet-contrib/pull/3368))
7+
58
## 1.13.0-beta.1
69

710
Released 2025-Oct-22

src/Shared/SqlProcessor.cs

Lines changed: 68 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ internal static class SqlProcessor
1515
private const char SanitizationPlaceholder = '?';
1616
private const char SpaceChar = ' ';
1717
private const char CommaChar = ',';
18+
private const char OpenSquareBracketChar = '[';
1819
private const char OpenParenChar = '(';
1920
private const char CloseParenChar = ')';
2021
private const char DashChar = '-';
@@ -35,6 +36,13 @@ internal static class SqlProcessor
3536
private static readonly SearchValues<char> WhitespaceSearchValues = SearchValues.Create(WhitespaceChars);
3637
#endif
3738

39+
// This is not an exhaustive list but covers the majority of common reserved SQL keywords that may follow a FROM clause.
40+
// This is used when determining if the previous token is a keyword in order to identify the end of a comma separated FROM clause.
41+
// NOTE: These are ordered so that more likely keywords appear first to shorten the comparison loop.
42+
private static readonly string[] FromClauseReservedKeywords = [
43+
"WHERE", "BY", "AS", "JOIN", "WITH", "CROSS", "HAVING", "WINDOW", "LIMIT", "OFFSET", "TABLESAMPLE", "PIVOT", "UNPIVOT"
44+
];
45+
3846
// We can extend this in the future to include more keywords if needed.
3947
// The keywords should be ordered by frequency of use to optimize performance.
4048
// This only includes keywords that may be the first keyword in a statement.
@@ -148,9 +156,9 @@ private static bool IsAsciiDigit(char c) =>
148156
[MethodImpl(MethodImplOptions.AggressiveInlining)]
149157
private static bool IsAsciiIdentifierChar(char c) =>
150158
#if NET
151-
char.IsAsciiLetter(c) || char.IsAsciiDigit(c) || c == UnderscoreChar || c == DotChar;
159+
char.IsLetter(c) || char.IsAsciiDigit(c) || c == UnderscoreChar || c == DotChar;
152160
#else
153-
IsAsciiLetter(c) || IsAsciiDigit(c) || c == UnderscoreChar || c == DotChar;
161+
char.IsLetter(c) || IsAsciiDigit(c) || c == UnderscoreChar || c == DotChar;
154162
#endif
155163

156164
private static SqlStatementInfo SanitizeSql(ReadOnlySpan<char> sql)
@@ -371,6 +379,7 @@ private static void ParseNextToken(
371379
state.ParsePosition += keywordLength;
372380
state.PreviousTokenStartPosition = start;
373381
state.PreviousTokenEndPosition = start + keywordLength;
382+
state.InFromClause = potentialKeywordInfo.SqlKeyword == SqlKeyword.From;
374383

375384
// No further parsing needed for this token
376385
return;
@@ -401,14 +410,31 @@ private static void ParseNextToken(
401410
var length = i - start;
402411
if (length > 0)
403412
{
413+
var tokenSpan = sql.Slice(start, length);
414+
415+
// Special handling: if we are in a FROM clause, check if this identifier is a reserved keyword
416+
// that indicates the end of the FROM clause.
417+
if (state.InFromClause)
418+
{
419+
foreach (var reservedKeyword in FromClauseReservedKeywords)
420+
{
421+
if (IsCaseInsensitiveMatch(tokenSpan, reservedKeyword))
422+
{
423+
// We've matched a reserved keyword so are no longer in the FROM clause.
424+
state.InFromClause = false;
425+
break;
426+
}
427+
}
428+
}
429+
404430
// Copy to sanitized buffer.
405-
sql.Slice(start, length).CopyTo(buffer.Slice(state.SanitizedPosition));
431+
tokenSpan.CopyTo(buffer.Slice(state.SanitizedPosition));
406432
state.SanitizedPosition += length;
407433

408434
// Optionally copy to summary buffer.
409435
if (state.CaptureNextTokenInSummary)
410436
{
411-
sql.Slice(start, length).CopyTo(state.SummaryBuffer.Slice(state.SummaryPosition));
437+
tokenSpan.CopyTo(state.SummaryBuffer.Slice(state.SummaryPosition));
412438
state.SummaryPosition += length;
413439

414440
// Add a space after the identifier. The trailing space will be trimmed later.
@@ -423,15 +449,35 @@ private static void ParseNextToken(
423449
}
424450
else
425451
{
426-
// Special case: if the token is a comma and follows a FROM keyword,
427-
// we are in an old style implicit join and we want to capture the next valid identifier in the summary.
428-
var prevKeyword = state.PreviousParsedKeyword?.SqlKeyword ?? SqlKeyword.Unknown;
429-
state.CaptureNextTokenInSummary = prevKeyword == SqlKeyword.From && currentChar == CommaChar;
452+
// If we are in a FROM clause, we want to capture the next identifier following a comma or open square bracket.
453+
// Commas may occur when listing multiple tables in a FROM clause.
454+
// Brackets may occur when using schema-qualified or delimited identifiers.
455+
state.CaptureNextTokenInSummary = state.InFromClause && (currentChar == CommaChar || currentChar == OpenSquareBracketChar);
430456

431457
buffer[state.SanitizedPosition++] = currentChar;
432458
state.ParsePosition++;
433459

434-
// We don't update previous token start/end positions for single-char tokens.
460+
// NOTE: We don't update previous token start/end positions for single-char tokens.
461+
}
462+
463+
static bool IsCaseInsensitiveMatch(ReadOnlySpan<char> tokenSpan, string reservedKeyword)
464+
{
465+
if (tokenSpan.Length != reservedKeyword.Length)
466+
{
467+
return false;
468+
}
469+
470+
var reservedKeywordSpan = reservedKeyword.AsSpan();
471+
472+
for (var charPos = 0; charPos < tokenSpan.Length; charPos++)
473+
{
474+
if ((tokenSpan[charPos] | 0x20) != (reservedKeywordSpan[charPos] | 0x20))
475+
{
476+
return false;
477+
}
478+
}
479+
480+
return true;
435481
}
436482
}
437483

@@ -724,6 +770,10 @@ private ref struct ParseState
724770
// Stored in state to avoid slicing repeatedly.
725771
public Span<char> SummaryBuffer;
726772

773+
/// <summary>
774+
/// Will be set if a keyword has been matched by the parser.
775+
/// Not all keywords are necessarily matched.
776+
/// </summary>
727777
public SqlKeywordInfo? PreviousParsedKeyword; // 8 bytes (reference type)
728778

729779
public SqlKeyword FirstSummaryKeyword; // 4 bytes (enum, underlying int)
@@ -739,7 +789,16 @@ private ref struct ParseState
739789
public int PreviousTokenStartPosition; // 4 bytes
740790
public int PreviousTokenEndPosition; // 4 bytes
741791

792+
// NOTE: If the number of bool fields increases significantly, consider combining into a bitfield.
793+
742794
public bool CaptureNextTokenInSummary; // 1 byte
795+
796+
/// <summary>
797+
/// Used to track if we are in a FROM clause for special handling of comma-separated table lists.
798+
/// When set to <c>true</c>, subsequent unmatched tokens will be compared against reserved keywords.
799+
/// As soon as we match a reserved keyword, we exit the FROM clause state.
800+
/// </summary>
801+
public bool InFromClause; // 1 byte
743802
}
744803

745804
private sealed class SqlKeywordInfo

test/OpenTelemetry.Contrib.Shared.Tests/SqlProcessorAdditionalTestCases.json

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,5 +857,31 @@
857857
],
858858
"db.query.summary": "ALTER TABLE MyTable"
859859
}
860+
},
861+
{
862+
"name": "query_with_diacritic_characters",
863+
"input": {
864+
"db.system.name": "microsoft.sql_server",
865+
"query": "DECLARE @__geräteIds_0 nvarchar(4000) = N'[1,2,3]';\n\nSELECT [t].[Id]\nFROM [Tests] AS [t]\nWHERE [t].[Id] IN (\n\tSELECT [g].[value]\n\tFROM OPENJSON(@__geräteIds_0) WITH ([value] bigint '$') AS [g]\n)"
866+
},
867+
"expected": {
868+
"db.query.text": [
869+
"DECLARE @__geräteIds_0 nvarchar(?) = N?;\n\nSELECT [t].[Id]\nFROM [Tests] AS [t]\nWHERE [t].[Id] IN (\n\tSELECT [g].[value]\n\tFROM OPENJSON(@__geräteIds_0) WITH ([value] bigint ?) AS [g]\n)"
870+
],
871+
"db.query.summary": "SELECT Tests"
872+
}
873+
},
874+
{
875+
"name": "cross_join_comma_separated_syntax_followed_by_composite_group_by",
876+
"input": {
877+
"db.system.name": "other_sql",
878+
"query": "SELECT a, b FROM TableA, TableB GROUP BY a, b"
879+
},
880+
"expected": {
881+
"db.query.text": [
882+
"SELECT a, b FROM TableA, TableB GROUP BY a, b"
883+
],
884+
"db.query.summary": "SELECT TableA TableB"
885+
}
860886
}
861887
]

0 commit comments

Comments
 (0)