Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/orange-tools-train.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'hive': minor
---

Eliminate ClickHouse query timeouts and improve read times for large numbers of traces in the dashboard.
104 changes: 74 additions & 30 deletions packages/services/api/src/modules/operations/providers/traces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,9 @@ export class Traces {
WHERE
"trace_id" IN (${sql.array(traceIds, 'String')})
LIMIT 1 BY "trace_id"
SETTINGS max_threads = 8
`,
timeout: 10_000,
timeout: 30_000,
queryId: 'Traces.findTraceByTraceId',
});

Expand Down Expand Up @@ -99,11 +100,13 @@ export class Traces {
${spanFields}
FROM
"otel_traces"
WHERE
PREWHERE
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. ClickHouse says this should be done automatically, though. Do we know why we have to specify this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ClickHouse's automatic optimization is conservative and skips PREWHERE when it sees complex expressions like map accesses (SpanAttributes['hive.target_id']) or compound pagination conditions. The explicit PREWHERE ensures the deterministic optimization behavior we expect.

"TraceId" = ${traceId}
AND "SpanAttributes"['hive.target_id'] = ${targetId}
WHERE
"SpanAttributes"['hive.target_id'] = ${targetId}
SETTINGS max_threads = 8
`,
timeout: 10_000,
timeout: 30_000,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why increase?
Realistically, we don't want to wait 30s. Most people would refresh by then and we'd just be wasting time fetching.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. The timeout increase was added as a safety margin, but you're right that 30s is too long for a query. It's only there to make sure we render something at some point, regardless of whether the user's focus is lost. I think it's harmless to have it increased.

queryId: 'Traces.findSpansForTraceId',
});

Expand Down Expand Up @@ -174,38 +177,59 @@ export class Traces {
const operator = sort?.direction === 'ASC' ? sql`>` : sql`<`;
paginationSQLFragmentPart = sql`
AND (
(
"timestamp" = ${cursor.timestamp}
AND "trace_id" < ${cursor.traceId}
"timestamp" ${operator} ${cursor.timestamp}
OR (
"timestamp" = ${cursor.timestamp}
AND "trace_id" < ${cursor.traceId}
)
OR "timestamp" ${operator} ${cursor.timestamp}
)
`;
}
}

const sqlConditions = buildTraceFilterSQLConditions(filter, false);

const filterSQLFragment = sqlConditions.length
? sql`AND ${sql.join(sqlConditions, ' AND ')}`
const timestampPrewhereConditions: SqlValue[] = [];
const otherFilterConditions: SqlValue[] = [];

for (const condition of sqlConditions) {
if (condition.sql.includes('"otel_traces_normalized"."timestamp"')) {
timestampPrewhereConditions.push(condition);
} else {
otherFilterConditions.push(condition);
}
}

const filterSQLFragment = otherFilterConditions.length
? sql`AND ${sql.join(otherFilterConditions, ' AND ')}`
: sql``;

const prewhereTimestampFragment = timestampPrewhereConditions.length
? sql`AND ${sql.join(timestampPrewhereConditions, ' AND ')}`
: sql``;

const query = sql`
SELECT
${traceFields}
FROM
"otel_traces_normalized"
PREWHERE
target_id = ${targetId}
${prewhereTimestampFragment}
WHERE
true
${paginationSQLFragmentPart}
${filterSQLFragment}
ORDER BY
${orderByFragment}
LIMIT ${sql`limit + 1`}
SETTINGS max_threads = 8
`;

const tracesQuery = await this.clickHouse.query<unknown>({
query: sql`
SELECT
${traceFields}
FROM
"otel_traces_normalized"
WHERE
target_id = ${targetId}
${paginationSQLFragmentPart}
${filterSQLFragment}
ORDER BY
${orderByFragment}
LIMIT ${sql.raw(String(limit + 1))}
`,
query,
queryId: 'traces',
timeout: 10_000,
timeout: 30_000,
});

let traces = TraceListModel.parse(tracesQuery.data);
Expand Down Expand Up @@ -298,17 +322,18 @@ export class Traces {
, sumIf(1, "graphql_error_count" != 0 ${filterSQLFragment}) AS "error_count_filtered"
FROM
"otel_traces_normalized"
WHERE
PREWHERE
"target_id" = ${targetId}
AND "otel_traces_normalized"."timestamp" >= toDateTime(${formatDate(startDate)}, 'UTC')
AND "otel_traces_normalized"."timestamp" <= toDateTime(${formatDate(endDate)}, 'UTC')
GROUP BY
"time_bucket_start"
) AS "t"
ON "t"."time_bucket_start" = "time_bucket_list"."time_bucket"
SETTINGS max_threads = 8
`,
queryId: `trace_status_breakdown_for_target_id_`,
timeout: 10_000,
timeout: 30_000,
});

return TraceStatusBreakdownBucketList.parse(result.data);
Expand Down Expand Up @@ -344,15 +369,26 @@ export class TraceBreakdownLoader {
const arrJoinColumnAlias = 'arr_join_column_value';

for (const { key, columnExpression, limit, arrayJoinColumn } of inputs) {
const prewhereConditions: SqlValue[] = [];
const whereConditions: SqlValue[] = [];

for (const condition of this.conditions) {
if (condition.sql.includes('target_id') || condition.sql.includes('"timestamp"')) {
prewhereConditions.push(condition);
} else {
whereConditions.push(condition);
}
}

statements.push(sql`
SELECT
'${sql.raw(key)}' AS "key"
, toString(${sql.raw(columnExpression ?? arrJoinColumnAlias)}) AS "value"
, count(*) AS "count"
FROM "otel_traces_normalized"
${sql.raw(arrayJoinColumn ? `ARRAY JOIN ${arrayJoinColumn} AS "${arrJoinColumnAlias}"` : '')}
WHERE
${sql.join(this.conditions, ' AND ')}
${prewhereConditions.length ? sql`PREWHERE ${sql.join(prewhereConditions, ' AND ')}` : sql``}
${whereConditions.length ? sql`WHERE ${sql.join(whereConditions, ' AND ')}` : sql``}
GROUP BY
"value"
ORDER BY
Expand All @@ -363,6 +399,7 @@ export class TraceBreakdownLoader {

const query = sql`
${sql.join(statements, ' UNION ALL ')}
SETTINGS max_threads = 8
`;

const results = await this.clickhouse.query<{
Expand All @@ -372,7 +409,7 @@ export class TraceBreakdownLoader {
}>({
query,
queryId: 'traces_filter_options',
timeout: 10_000,
timeout: 60_000,
});

const rowsGroupedByKey = results.data.reduce(
Expand Down Expand Up @@ -403,6 +440,14 @@ export class TraceBreakdownLoader {

this.conditions = [sql`target_id = ${targetId}`];

if (filter?.period) {
const period = parseDateRangeInput(filter.period);
this.conditions.push(
sql`"timestamp" >= toDateTime(${formatDate(period.from)}, 'UTC')`,
sql`"timestamp" <= toDateTime(${formatDate(period.to)}, 'UTC')`,
);
}

if (filter?.traceIds?.length) {
this.conditions.push(sql`"trace_id" IN (${sql.array(filter.traceIds, 'String')})`);
}
Expand Down Expand Up @@ -571,7 +616,6 @@ const traceFields = sql`
, "http_url" AS "httpUrl"
, "duration"
, "graphql_operation_name" AS "graphqlOperationName"
, "graphql_operation_document" AS "graphqlOperationDocument"
, "graphql_operation_hash" AS "graphqlOperationHash"
, "client_name" AS "clientName"
, "client_version" AS "clientVersion"
Expand Down
Loading