From ba203641ff9b2f079222d362fd91bda732950003 Mon Sep 17 00:00:00 2001 From: Karlo Medallo <4827074+karlomedallo@users.noreply.github.com> Date: Wed, 12 Mar 2025 02:12:33 +1100 Subject: [PATCH 1/3] feat(duckdb-driver): Fix numeric filter comparisons in DuckDB Fixes [#9281](https://github.com/cube-js/cube/issues/9281) by implementing explicit CAST for numeric filters in the DuckDB driver. This ensures correct numeric comparisons and prevents mismatches caused by string-to-number conversions. Added test cases to validate casting behavior for different filter scenarios. --- .../cubejs-duckdb-driver/src/DuckDBQuery.ts | 6 + .../birdbox-fixtures/duckdb/schema/Orders.js | 4 + .../cubejs-testing/test/smoke-duckdb.test.ts | 117 +++++++++++++++++- 3 files changed, 126 insertions(+), 1 deletion(-) diff --git a/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts b/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts index 96cd50a725128..54286886dc517 100644 --- a/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts +++ b/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts @@ -12,6 +12,12 @@ const GRANULARITY_TO_INTERVAL: Record string> = { }; class DuckDBFilter extends BaseFilter { + public castParameter() { + if (this.measure || this.definition().type === 'number') { + return 'CAST(? AS DOUBLE)'; + } + return '?'; + } } export class DuckDBQuery extends BaseQuery { diff --git a/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js b/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js index 54bd827a1806c..53dc3dcf3486d 100644 --- a/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js +++ b/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js @@ -29,5 +29,9 @@ cube(`Orders`, { sql: `status`, type: `string`, }, + amount: { + sql: `amount`, + type: `number`, + }, }, }); diff --git a/packages/cubejs-testing/test/smoke-duckdb.test.ts b/packages/cubejs-testing/test/smoke-duckdb.test.ts index b487e31a6168e..7d50c52205d5d 100644 --- a/packages/cubejs-testing/test/smoke-duckdb.test.ts +++ b/packages/cubejs-testing/test/smoke-duckdb.test.ts @@ -1,6 +1,6 @@ import cubejs, { CubeApi } from '@cubejs-client/core'; // eslint-disable-next-line import/no-extraneous-dependencies -import { afterAll, beforeAll, jest } from '@jest/globals'; +import { afterAll, beforeAll, expect, jest, test } from '@jest/globals'; import { BirdBox, getBirdbox } from '../src'; import { DEFAULT_API_TOKEN, @@ -37,4 +37,119 @@ describe('duckdb', () => { }, JEST_AFTER_ALL_DEFAULT_TIMEOUT); test('query measure', () => testQueryMeasure(client)); + + test('numeric measure filter - uses CAST', async () => { + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.totalAmount', + operator: 'gte', + values: ['300'] + } + ] + }); + + // The HAVING clause applies after counting all records + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('numeric dimension filter - uses CAST', async () => { + // This test verifies that the WHERE clause for numeric dimensions uses CAST(? AS DOUBLE) + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.amount', + operator: 'gte', + values: ['300'] + } + ] + }); + + // Only 3 orders have amount >= 300 + expect(response.rawData()[0]['Orders.count']).toBe('3'); + }); + + test('string filter - does NOT use CAST', async () => { + // This test verifies that the WHERE clause for string dimensions does not use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.status', + operator: 'equals', + values: ['processed'] + } + ] + }); + + // There are 2 'processed' orders + expect(response.rawData()[0]['Orders.count']).toBe('2'); + }); + + test('numeric exact equality filter - also uses CAST', async () => { + // This test verifies that even for equality comparisons on numeric values, CAST is used + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.amount', + operator: 'equals', + values: ['300'] + } + ] + }); + + // Only 1 order has amount exactly 300 + expect(response.rawData()[0]['Orders.count']).toBe('1'); + }); + + test('numeric string comparison - values are properly handled with CAST', async () => { + // This test verifies that numeric strings are properly cast as numbers + // This is important because '100' and 100 are different in string comparisons + // but should be treated the same with numeric CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.amount', + operator: 'equals', + values: ['100'] // string '100' should be cast to number 100 + } + ] + }); + + // Only 1 order has amount 100 + expect(response.rawData()[0]['Orders.count']).toBe('1'); + }); + + test('multiple string values in filter - none use CAST', async () => { + // This test verifies that IN (...) clauses for string dimensions don't use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.status', + operator: 'equals', + values: ['new', 'processed'] + } + ] + }); + + // There are 4 orders with status 'new' or 'processed' + expect(response.rawData()[0]['Orders.count']).toBe('4'); + }); }); From 7b20e9b28d2a4651d0c2892e2bb8be82645be708 Mon Sep 17 00:00:00 2001 From: Karlo Medallo <4827074+karlomedallo@users.noreply.github.com> Date: Fri, 14 Mar 2025 22:24:44 +1100 Subject: [PATCH 2/3] feat(duckdb-driver): Fix numeric and time measure filter comparisons --- .../cubejs-duckdb-driver/src/DuckDBQuery.ts | 14 +- .../birdbox-fixtures/duckdb/schema/Orders.js | 40 +- .../cubejs-testing/test/smoke-duckdb.test.ts | 410 ++++++++++++++++++ 3 files changed, 457 insertions(+), 7 deletions(-) diff --git a/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts b/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts index 54286886dc517..6561568aa50e7 100644 --- a/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts +++ b/packages/cubejs-duckdb-driver/src/DuckDBQuery.ts @@ -13,9 +13,13 @@ const GRANULARITY_TO_INTERVAL: Record string> = { class DuckDBFilter extends BaseFilter { public castParameter() { - if (this.measure || this.definition().type === 'number') { + const numberTypes = ['number', 'count', 'count_distinct', 'count_distinct_approx', 'sum', 'avg', 'min', 'max']; + const definition = this.definition(); + + if (numberTypes.includes(definition.type)) { return 'CAST(? AS DOUBLE)'; } + return '?'; } } @@ -61,4 +65,12 @@ export class DuckDBQuery extends BaseQuery { templates.functions.GREATEST = 'GREATEST({{ args_concat }})'; return templates; } + + public timeStampParam(timeDimension: any) { + if (timeDimension.measure) { + // For time measures, we don't need to check dateFieldType + return super.timeStampCast('?'); + } + return super.timeStampParam(timeDimension); + } } diff --git a/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js b/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js index 53dc3dcf3486d..85d185a069e4a 100644 --- a/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js +++ b/packages/cubejs-testing/birdbox-fixtures/duckdb/schema/Orders.js @@ -1,15 +1,15 @@ cube(`Orders`, { sql: ` - select id, amount, status from ( - select 1 as id, 100 as amount, 'new' status + select id, amount, status, created_at, is_paid from ( + select 1 as id, 100 as amount, 'new' status, TIMESTAMPTZ '2020-01-01 00:00:00' created_at, TRUE as is_paid UNION ALL - select 2 as id, 200 as amount, 'new' status + select 2 as id, 200 as amount, 'new' status, TIMESTAMPTZ '2020-01-02 00:00:00' created_at, TRUE as is_paid UNION ALL - select 3 as id, 300 as amount, 'processed' status + select 3 as id, 300 as amount, 'processed' status, TIMESTAMPTZ '2020-01-03 00:00:00' created_at, TRUE as is_paid UNION ALL - select 4 as id, 500 as amount, 'processed' status + select 4 as id, 500 as amount, 'processed' status, TIMESTAMPTZ '2020-01-04 00:00:00' created_at, FALSE as is_paid UNION ALL - select 5 as id, 600 as amount, 'shipped' status + select 5 as id, 600 as amount, 'shipped' status, TIMESTAMPTZ '2020-01-05 00:00:00' created_at, FALSE as is_paid ) `, measures: { @@ -20,6 +20,26 @@ cube(`Orders`, { sql: `amount`, type: `sum`, }, + avgAmount: { + sql: `amount`, + type: `avg`, + }, + statusList: { + sql: `status`, + type: `string`, + }, + lastStatus: { + sql: `MAX(status)`, + type: `string`, + }, + hasUnpaidOrders: { + sql: `BOOL_OR(NOT is_paid)`, + type: `boolean`, + }, + maxCreatedAt: { + sql: `MAX(created_at)`, + type: `time`, + }, toRemove: { type: `count`, }, @@ -33,5 +53,13 @@ cube(`Orders`, { sql: `amount`, type: `number`, }, + isPaid: { + sql: `is_paid`, + type: `boolean`, + }, + createdAt: { + sql: `created_at`, + type: `time`, + }, }, }); diff --git a/packages/cubejs-testing/test/smoke-duckdb.test.ts b/packages/cubejs-testing/test/smoke-duckdb.test.ts index 7d50c52205d5d..7da0e57b4a3b8 100644 --- a/packages/cubejs-testing/test/smoke-duckdb.test.ts +++ b/packages/cubejs-testing/test/smoke-duckdb.test.ts @@ -152,4 +152,414 @@ describe('duckdb', () => { // There are 4 orders with status 'new' or 'processed' expect(response.rawData()[0]['Orders.count']).toBe('4'); }); + + test('string measure filter - should NOT use CAST', async () => { + // This test verifies that filters on string measures don't use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.lastStatus', + operator: 'equals', + values: ['shipped'] + } + ] + }); + + // Output should match the shipped orders + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('boolean measure filter - should NOT use CAST', async () => { + // This test verifies that filters on boolean measures don't use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.hasUnpaidOrders', + operator: 'equals', + values: ['true'] + } + ] + }); + + // Output should match all orders since there are unpaid orders + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - afterDate operator', async () => { + // This test verifies that filters on time measures don't use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'afterDate', + values: ['2020-01-03'] + } + ] + }); + + // Orders after 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - beforeDate operator', async () => { + // This test verifies that beforeDate operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'beforeDate', + values: ['2020-01-06'] + } + ] + }); + // All orders should be counted since max date (2020-01-05) is before 2020-01-06 + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - beforeOrOnDate operator', async () => { + // This test verifies that beforeOrOnDate operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'beforeOrOnDate', + values: ['2020-01-05'] + } + ] + }); + // All orders should be counted since max date (2020-01-05) is equal to 2020-01-05 + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - afterOrOnDate operator', async () => { + // This test verifies that afterOrOnDate operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'afterOrOnDate', + values: ['2020-01-05'] + } + ] + }); + // All orders should be counted since max date (2020-01-05) is equal to 2020-01-05 + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - inDateRange operator', async () => { + // This test verifies that inDateRange operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'inDateRange', + values: ['2020-01-04', '2020-01-06'] + } + ] + }); + // All orders should be counted since max date (2020-01-05) is within the range + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - notInDateRange operator', async () => { + // This test verifies that notInDateRange operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'notInDateRange', + values: ['2020-01-06', '2020-01-07'] + } + ] + }); + // All orders should be counted since max date (2020-01-05) is outside the range + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - equals operator', async () => { + // This test verifies that equals operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'equals', + values: ['2020-01-05'] + } + ] + }); + // All orders should be counted since max date equals 2020-01-05 + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - notEquals operator', async () => { + // This test verifies that notEquals operator works correctly with time measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'notEquals', + values: ['2020-01-06'] + } + ] + }); + // All orders should be counted since max date (2020-01-05) is not equal to 2020-01-06 + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('boolean dimension filter - should NOT use CAST', async () => { + // This test verifies that filters on boolean dimensions don't use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.isPaid', + operator: 'equals', + values: ['false'] + } + ] + }); + + // There are 2 unpaid orders + expect(response.rawData()[0]['Orders.count']).toBe('2'); + }); + + test('time dimension filter - should NOT use CAST', async () => { + // This test verifies that filters on time types don't use CAST + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'afterDate', + values: ['2020-01-03'] + } + ] + }); + + // There are 2 orders after 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('2'); + }); + + test('time dimension filter - beforeDate operator', async () => { + // This test verifies that the beforeDate operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'beforeDate', + values: ['2020-01-03'] + } + ] + }); + + // There are 2 orders before 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('2'); + }); + + test('time dimension filter - beforeOrOnDate operator', async () => { + // This test verifies that the beforeOrOnDate operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'beforeOrOnDate', + values: ['2020-01-03'] + } + ] + }); + + // There are 3 orders before or on 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('3'); + }); + + test('time dimension filter - afterOrOnDate operator', async () => { + // This test verifies that the afterOrOnDate operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'afterOrOnDate', + values: ['2020-01-03'] + } + ] + }); + + // There are 3 orders after or on 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('3'); + }); + + test('time dimension filter - inDateRange operator', async () => { + // This test verifies that the inDateRange operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'inDateRange', + values: ['2020-01-02', '2020-01-04'] + } + ] + }); + + // There are 3 orders between 2020-01-02 and 2020-01-04 (inclusive) + expect(response.rawData()[0]['Orders.count']).toBe('3'); + }); + + test('time dimension filter - notInDateRange operator', async () => { + // This test verifies that the notInDateRange operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'notInDateRange', + values: ['2020-01-02', '2020-01-04'] + } + ] + }); + + // There are 2 orders outside the range 2020-01-02 to 2020-01-04 + expect(response.rawData()[0]['Orders.count']).toBe('2'); + }); + + test('time dimension filter - set operator', async () => { + // This test verifies that the set operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'set' + } + ] + }); + + // All 5 orders have createdAt set + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + test('time measure filter - set operator', async () => { + // This test verifies that the set operator works correctly with measures + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.maxCreatedAt', + operator: 'set' + } + ] + }); + + // All 5 orders have maxCreatedAt set + expect(response.rawData()[0]['Orders.count']).toBe('5'); + }); + + // We can't effectively test notSet with the current schema since all records have dates + // This test is more for API completeness + test('time dimension filter - notSet operator', async () => { + // This test verifies that the notSet operator works correctly + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'notSet' + } + ] + }); + + // No orders have createdAt as null + expect(response.rawData()[0]['Orders.count']).toBe('0'); + }); + + test('time dimension filter - equals operator', async () => { + // This test verifies that equals operator works correctly with time dimensions + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'equals', + values: ['2020-01-03'] + } + ] + }); + + // There is 1 order on exactly 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('1'); + }); + + test('time dimension filter - notEquals operator', async () => { + // This test verifies that notEquals operator works correctly with time dimensions + const response = await client.load({ + measures: [ + 'Orders.count' + ], + filters: [ + { + member: 'Orders.createdAt', + operator: 'notEquals', + values: ['2020-01-03'] + } + ] + }); + + // There are 4 orders not on 2020-01-03 + expect(response.rawData()[0]['Orders.count']).toBe('4'); + }); }); From b4e95b3e0125068b3cd138eeb78a60189004fb57 Mon Sep 17 00:00:00 2001 From: Karlo Medallo <4827074+karlomedallo@users.noreply.github.com> Date: Sat, 15 Mar 2025 22:43:51 +1100 Subject: [PATCH 3/3] ci: trigger checks for PR #9281