package analysis

import (
	"fmt"
	"strings"

	"github.com/bytebase/omni/googlesql/ast"
)

// relation-projection resolution
//
// This is the masking-grade core that mirrors the legacy bytebase resolver's
// TableSource model (plugin/parser/bigquery query_span_extractor.go:
// PseudoTable / PhysicalTable / extractTableSourceFromSelect /
// extractTableSourceFromQuerySetOperation). Every FROM relation — a base table,
// a CTE reference, a derived `( query )` subquery, or a join — resolves to an
// ordered output PROJECTION: a list of columns, each carrying its base-column
// lineage and an IsPlainField flag. A `SELECT *` (or `rel.*`) over such a
// relation reproduces that relation's projection (with lineage), NOT a bare
// catalog-blind "*"; a set operation position-merges its two arms' projections.
//
// The one thing omni leaves to the catalog-aware consumer (the bytebase
// extractor) is enumerating a PHYSICAL table's columns: omni has no metadata, so
// a base-table star is carried as a `baseStar` projection element (a single
// element standing for "every column of table T"), which the consumer expands.
// Everything else — CTE/derived projections, column references resolved through
// relations, set-op merges, join concatenation, plain-field propagation — is
// resolved here so the consumer receives concrete per-column lineage.

// projColumn is one resolved output column of a relation's projection.
//
// Besides a plain concrete column (name + sources), an element may instead be
// one of several deferred markers (baseStar, starMerge, starGroup, setOpMerge)
// that the metadata-aware consumer expands; each marker is documented on its
// field below.
type projColumn struct {
	name    string      // output column name (lower/written case; consumer upper-cases explicit names)
	sources []ColumnRef // resolved base-column lineage
	plain   bool        // IsPlainField (legacy semantics)

	// baseFieldName marks name as a base-table FIELD passthrough (today: the
	// JOIN ... USING coalesced key, which the legacy resolver named after the
	// left PhysicalTable's field). A consumer reproducing legacy naming renders
	// it in the field's metadata case rather than the written/upper-cased form.
	baseFieldName bool

	// baseStar, when non-nil, means this projection element stands for "every
	// column of this base table" — omni cannot enumerate it (no catalog), so the
	// consumer expands it. name is "*". sources is empty.
	baseStar *ColumnRef

	// baseStarExcept lists base-table column names the consumer must SKIP when
	// expanding baseStar (case-insensitive). Set by a JOIN ... USING coalesce
	// (fix F1): each side's star excludes the key columns, which are projected
	// once as a coalesced concrete column ahead of the stars. Only meaningful
	// when baseStar is non-nil.
	baseStarExcept []string

	// starMerge, when non-nil, marks a set-operation merge position whose other arm
	// is a base-table star of (consumer-known) arity: the consumer expands
	// starMerge.table and unions that table's column at the same ordinal into this
	// position. Used for `concrete UNION ALL (SELECT * FROM base)` (legacy expands
	// the star arm against metadata, then position-merges). See ColumnInfo.
	starMerge *starMerge

	// starGroup, when non-nil, marks this element as a whole `*` / `rel.*` star
	// ITEM carrying its ordered expansion segments plus the EXCEPT/REPLACE
	// modifiers that apply across them, so the modifier application (with the legacy
	// name-collision last-wins dedup) happens once, after expansion, in the
	// metadata-aware consumer. It is produced ONLY for a star that has EXCEPT/REPLACE
	// modifiers; a modifier-free star (even one with base-table segments) expands
	// inline into individual projColumns instead, keeping each segment individually
	// addressable (so an enclosing relation can resolve `rel.col` and a set
	// operation can position-merge). name is "*".
	starGroup *starGroup

	// setOpMerge, when non-nil, marks a single element standing for a WHOLE
	// set-operation merge whose arms cannot be position-merged inline because at
	// least one arm carries an un-enumerable star (a base-table star, an
	// EXCEPT/REPLACE starGroup, or a nested set-op merge). It carries both arms'
	// resolved projections (in wire form); the metadata-aware consumer expands each
	// arm fully (a base-table star against metadata, a starGroup with its modifiers,
	// a nested merge recursively), then position-merges the two expanded lists —
	// output names from the LEFT arm, sources unioned per position, plain=false —
	// reproducing the legacy "fully resolve each arm, then zip"
	// (extractTableSourceFromQuerySetOperation). name is "*". This is the masking-
	// grade fix for star-involving set-op arms whose arity only the consumer knows:
	// a base-star UNION base-star (one merged col per expanded position, NOT two
	// concatenated stars), a nested concrete/star/concrete (the inner star-merge
	// survives the outer merge), and an EXCEPT/REPLACE star arm merged with a
	// concrete arm.
	setOpMerge *setOpMerge
}

// setOpMerge carries a deferred set-operation merge: the two arms' resolved
// projections (wire form), to be expanded and merged by the metadata-aware
// consumer. byName marks a BY NAME / CORRESPONDING operation (fix F3): the
// consumer merges the expanded arms by column NAME (matchColumns restricting/
// ordering the output when present) instead of by ordinal. See
// projColumn.setOpMerge and SetOpMergeInfo.
type setOpMerge struct {
	left         []ColumnInfo // left arm's resolved projection, in wire form
	right        []ColumnInfo // right arm's resolved projection, in wire form
	byName       bool         // merge by column name rather than by ordinal
	matchColumns []string     // when non-empty, restricts/orders a by-name merge's output
}

// starGroup is a `*` / `rel.*` star item's resolved expansion: the ordered
// segments to emit and its EXCEPT/REPLACE modifiers (applied across the expanded
// columns by the metadata-aware consumer). See ColumnInfo.StarSegments.
+type starGroup struct { + segments []projColumn + except []string + replace []StarReplaceItem +} + +// starMerge carries a set-operation merge against a base-table-star arm: the base +// table to expand (table) and the output ordinal (index) whose lineage gains that +// table's index-th column. leftStar marks the star arm as the LEFT one (so the +// output NAME is taken from the expanded base column, not from a concrete arm). +type starMerge struct { + table ColumnRef + index int + leftStar bool +} + +// relation is a resolved FROM source: a base table, a CTE reference, a derived +// subquery, an UNNEST value relation, or a join of these. +type relation struct { + // name is the relation's reference name for qualified lookups: the alias if + // present, else the (bare) table name; "" for an unnamed join/derived. + name string + // isBase marks a single physical base table (so a bare reference qualified by + // its name resolves a column to the base table's lineage even though omni + // cannot enumerate its columns). + isBase bool + // baseRef is the base table reference (Database/Schema/Table) when isBase, so a + // column read off it (`t.col` / a bare `col`) resolves to {…, Column: col}. + baseRef ColumnRef + // valueTable marks an UNNEST / correlated-array-path value relation (fix F2): + // its first column IS the element value, so a field reference through the + // relation name (`elem.child`) that matches no projection column resolves to + // the ELEMENT's lineage (a struct field reads the element). + valueTable bool + // columns is the ordered resolved projection. For a base table it is a single + // baseStar element. + columns []projColumn +} + +// resolveSelectProjection resolves a SELECT block to its output projection, +// mirroring the legacy extractTableSourceFromSelect: a bare `*` copies all FROM +// relations' columns; a `rel.*` copies the named relation's columns; an explicit +// item resolves its expression's columns against the relations. 
fromRels is the +// comma-item relation list (a join collapsed into one combined relation) used to +// expand a bare `*` in FROM order; column / `rel.*` lookups use the SELECT's leaf +// relations (w.leafRels), which keep each join side findable by name. +func (w *spanWalker) resolveSelectProjection(stmt *ast.SelectStmt, fromRels []*relation) []projColumn { + var out []projColumn + for _, item := range stmt.Items { + if item == nil { + continue + } + switch { + case item.Star && item.Expr == nil: + // Bare `*`: every comma-item relation's columns, in order. + out = append(out, w.starProjColumns(allColumns(fromRels), item.Modifiers)...) + case item.Star && item.Expr != nil: + // `expr.*`: the named relation's columns (or a field-path star). + out = append(out, w.starProjColumns(w.resolveDotStar(item.Expr), item.Modifiers)...) + default: + out = append(out, w.resolveExprColumn(item)) + } + } + return out +} + +// starProjColumns turns a star item's resolved expansion `segments` into one or +// more projColumns. A modifier-free, fully-concrete star (every segment already +// resolved, no EXCEPT/REPLACE) expands inline into its concrete columns — so a +// set operation can position-merge them and the consumer needs no metadata. A +// star that has modifiers OR contains an un-enumerable base-table segment is kept +// as a single starGroup projColumn, so the consumer expands the base segments and +// applies the modifiers (with the legacy name-collision last-wins dedup) in one +// place. +func (w *spanWalker) starProjColumns(segments []projColumn, mods *ast.StarModifiers) []projColumn { + except, replace := w.resolveStarModifiers(mods) + if len(except) == 0 && len(replace) == 0 { + // Modifier-free: inline the segments (concrete columns stay concrete, a + // base-table segment stays a baseStar projColumn). 
Inlining — rather than + // wrapping in a starGroup — keeps each segment individually addressable so an + // enclosing relation can resolve `rel.col` / position-merge a base-star arm, + // and a fully-concrete star can be position-merged by a set operation. + return segments + } + // With EXCEPT/REPLACE the expansion + name-collision dedup must be applied as a + // unit (and a base segment can only be expanded by the metadata-aware consumer), + // so keep the star grouped with its modifiers. + return []projColumn{{ + name: "*", + starGroup: &starGroup{ + segments: segments, + except: except, + replace: replace, + }, + }} +} + +// resolveStarModifiers resolves a star item's EXCEPT/REPLACE modifiers into the +// consumer-facing form: the EXCEPT column names, and per-REPLACE the output +// column name plus its replacement expression's resolved source columns. Returns +// (nil, nil) when there are no modifiers. +func (w *spanWalker) resolveStarModifiers(mods *ast.StarModifiers) ([]string, []StarReplaceItem) { + if mods == nil { + return nil, nil + } + var except []string + if len(mods.Except) > 0 { + except = append(except, mods.Except...) + } + var replace []StarReplaceItem + for _, r := range mods.Replace { + if r == nil || r.Alias == "" { + continue + } + _, sources := w.resolveExprSources(r.Value) + replace = append(replace, StarReplaceItem{Name: r.Alias, Sources: sources}) + } + return except, replace +} + +// resolveExprColumn resolves a non-star select item to a single output column, +// mirroring the legacy extractSourceColumnSetFromExpr + alias rule. The item is +// never a plain field (an explicit select-list item is IsPlainField=false). 
+func (w *spanWalker) resolveExprColumn(item *ast.SelectItem) projColumn { + name, sources := w.resolveExprSources(item.Expr) + if item.Alias != "" { + name = item.Alias + } + return projColumn{name: name, sources: sources, plain: false} +} + +// resolveExprSources resolves an expression's directly-referenced columns to +// their base lineage against the FROM relations, returning a best-effort output +// name and the merged source set (the legacy extractSourceColumnSetFromExpr): each +// maximal dotted column path is resolved by resolvePath; a multi-reference +// expression has no derivable name. +func (w *spanWalker) resolveExprSources(expr ast.Node) (string, []ColumnRef) { + if expr == nil { + return "", nil + } + paths := w.collectColumnPaths(expr) + if len(paths) == 0 { + return "", nil + } + var sources []ColumnRef + name := "" + for _, parts := range paths { + n, refs := w.resolvePath(parts) + sources = unionColumnRefs(sources, refs) + name = n + } + if len(paths) > 1 { + name = "" + } + return name, sources +} + +// resolvePath resolves one dotted column path (e.g. [col] or [rel, col] or +// [s, field]) to its output name and base lineage. It covers the same cases as the +// legacy getFieldColumnSource, but resolves RELATION-FIRST rather than column- +// first, because omni — unlike the legacy resolver — has no catalog to check +// whether a leading identifier is instead a base-table column: +// - 1 part [col]: resolve `col` as a column (name = col). +// - >=2 parts [a, …]: if `a` names a leaf relation, resolve the second part as +// that relation's column (name = b; trailing struct sub-fields dropped). Else +// if the dialect-bucketed qualifier names a leaf base table (`proj.ds.t.c`), +// keep the bucketed ref. Else `a` is a struct/field column root (`s.field`, +// `s.a.b`) — resolve `a` as a column, dropping the field path (name = a). +// +// A column owned by a CTE / derived relation resolves to that relation's stored +// lineage. 
A column owned by (or assumed to be on) a base table resolves to a +// written ref the consumer matches against catalog metadata. +func (w *spanWalker) resolvePath(parts []string) (string, []ColumnRef) { + if len(parts) == 0 { + return "", nil + } + if len(parts) == 1 { + col := parts[0] + return col, w.resolveColumn("", col) + } + // `rel.col` where the leading identifier names a leaf relation: resolve the + // SECOND part as that relation's column (any trailing parts are struct + // sub-fields, which carry no extra base lineage). Relation-first because omni, + // unlike the legacy resolver, cannot metadata-check whether the leading + // identifier is instead a column of a base table. + if findRelation(w.leafRels, parts[0]) != nil { + return parts[1], w.resolveColumn(parts[0], parts[1]) + } + // A FULLY-qualified column reference whose qualifier names a leaf base relation + // (e.g. `proj.ds.t.c` over `FROM proj.ds.t`): bucket it by the dialect rule so + // its Catalog/Database/Schema/Table line up with that table's access. + ref := columnRefFromParts(parts, w.dialect) + if w.qualifierNamesBaseRelation(ref) { + return ref.Column, []ColumnRef{ref} + } + // Otherwise the leading identifier is a struct/field column root (`s.field`, + // `s.a.b`): resolve `parts[0]` as a column (dropping the trailing field path), + // mirroring the legacy a-as-column attempt for an unqualified field path. + return parts[0], w.resolveColumn("", parts[0]) +} + +// qualifierNamesBaseRelation reports whether a bucketed column ref's qualifier +// (its Catalog/Database/Schema/Table) matches a leaf BASE relation's table — i.e. +// the reference is a fully-qualified column of a FROM base table (e.g. +// `proj.ds.t.c`). Used to keep such a reference's dialect-bucketed qualifier +// rather than misreading its leading identifier as a struct-column root. 
+func (w *spanWalker) qualifierNamesBaseRelation(ref ColumnRef) bool { + if ref.Table == "" { + return false + } + for _, rel := range w.leafRels { + if !rel.isBase { + continue + } + b := rel.baseRef + if strings.EqualFold(b.Table, ref.Table) && + strings.EqualFold(b.Database, ref.Database) && + strings.EqualFold(b.Schema, ref.Schema) && + strings.EqualFold(b.Catalog, ref.Catalog) { + return true + } + } + return false +} + +// resolveColumn resolves a (relationQualifier, column) reference to base lineage +// against the SELECT's leaf relations (w.leafRels). relQualifier may be "" +// (unqualified). A CTE/derived relation's column resolves to its stored lineage; a +// base (or base-star-passthrough) relation's column resolves to a written ref the +// consumer matches against metadata; an unresolved reference falls back to a +// written ref (so the consumer can still match it by name). +func (w *spanWalker) resolveColumn(relQualifier, column string) []ColumnRef { + if refs, ok := w.resolveColumnStrict(relQualifier, column); ok { + return refs + } + // Fallback: a written reference the consumer matches against catalog metadata. + // Qualified by a base relation → carry the relation's base ref; unqualified or + // over an unknown relation → a bare column ref. + if relQualifier != "" { + if rel := findRelation(w.leafRels, relQualifier); rel != nil && rel.isBase { + ref := rel.baseRef + ref.Column = column + return []ColumnRef{ref} + } + // Qualifier is not a known relation: emit a written ref (alias.column) so + // the consumer can still match by column name (table set to the qualifier). + return []ColumnRef{{Table: relQualifier, Column: column}} + } + // Unqualified with exactly ONE base relation in scope: the column can only + // come from that relation — the strict pass already ruled out every concrete + // CTE/derived projection — so attribute it. 
Without the attribution the + // consumer falls back to matching the bare name across ALL expanded tables, + // which over-includes whenever another table shares the column name (e.g. a + // 3-way UNION whose arms' tables have overlapping column names) and diverges + // from the legacy resolver's exact single-table attribution. + if rel := soleBaseRelation(w.leafRels); rel != nil { + ref := rel.baseRef + ref.Column = column + return []ColumnRef{ref} + } + return []ColumnRef{{Column: column}} +} + +// soleBaseRelation returns the single base-table relation in rels when exactly +// one exists, else nil. With several base relations a bare column is ambiguous +// (omni cannot enumerate base-table columns metadata-free), so the caller keeps +// the bare ref and the consumer matches additively by name. +func soleBaseRelation(rels []*relation) *relation { + var sole *relation + for _, rel := range rels { + if !rel.isBase { + continue + } + if sole != nil { + return nil + } + sole = rel + } + return sole +} + +// resolveColumnStrict resolves a (relationQualifier, column) reference ONLY when +// it can be tied to a concrete relation projection. It returns nil when no leaf +// relation resolves it, so the caller can apply the legacy fallback. +// +// - Qualified `rel.col`: the named leaf relation's column. A base or base-star- +// passthrough relation resolves `col` to its base-table lineage (the written +// column name keyed to the base table); a CTE/derived relation matches `col` +// by name in its concrete projection. +// - Unqualified `col`: only a CTE/derived relation with a CONCRETE column named +// `col` resolves here. A base table's columns are not enumerable, so an +// unqualified column over base tables is left to the consumer to match by name +// (omni cannot know which base table — or which arm of a join — owns it). 
+func (w *spanWalker) resolveColumnStrict(relQualifier, column string) ([]ColumnRef, bool) { + if relQualifier != "" { + rel := findRelation(w.leafRels, relQualifier) + if rel == nil { + return nil, false + } + if rel.isBase { + ref := rel.baseRef + ref.Column = column + return []ColumnRef{ref}, true + } + if refs, ok := projectionColumnSources(rel, column, true /* allowBaseStar */); ok { + return refs, true + } + // A VALUE relation (UNNEST element): a qualified reference that names no + // projection column is a struct FIELD of the element (`elem.child`), so it + // reads the element's lineage (fix F2). + if rel.valueTable && len(rel.columns) > 0 { + return rel.columns[0].sources, true + } + return nil, false + } + for _, rel := range w.leafRels { + if rel.isBase { + continue + } + if refs, ok := projectionColumnSources(rel, column, false /* allowBaseStar */); ok { + return refs, true + } + } + return nil, false +} + +// projectionColumnSources returns the base lineage of the column named `column` +// in a relation's projection, or nil if the relation has no such named column. +// +// A concrete projection column matches by name. A baseStar element (a whole base +// table whose columns omni cannot enumerate) matches ANY column name — but ONLY +// when allowBaseStar is set: a baseStar match is sound for a QUALIFIED reference +// (`rel.col` names the relation, so `col` must be one of its columns) but NOT for +// an UNQUALIFIED reference (omni cannot know which of several base relations — or +// join arms — owns a bare `col`; that is left to the metadata-aware consumer to +// match by name). +func projectionColumnSources(rel *relation, column string, allowBaseStar bool) ([]ColumnRef, bool) { + for _, c := range rel.columns { + if c.baseStar != nil { + if !allowBaseStar { + continue + } + // A USING-coalesced star excludes its key columns — those are owned by + // the coalesced concrete column (earlier in the projection), not by this + // star segment. 
+ if nameInListFold(c.baseStarExcept, column) { + continue + } + ref := *c.baseStar + ref.Column = column + return []ColumnRef{ref}, true + } + if strings.EqualFold(c.name, column) { + // Found — even when its lineage is empty (e.g. a CTE column projected from + // a literal like `1 AS n`); return found=true so the caller does NOT apply + // the unresolved-reference fallback (which would wrongly fabricate a bare + // column ref). Empty-but-resolved lineage is the legacy result. + return c.sources, true + } + } + return nil, false +} + +// resolveDotStar resolves a `rel.*` (or qualified/field-path star) to the named +// relation's projection columns, mirroring the legacy extractWildFromExpr: the +// leading path names a relation whose columns are reproduced. A multi-part +// qualifier (`schema.table.*` / `db.schema.table.*`) matches the relation by its +// TRAILING part with the prefix verified against the relation's base reference — +// the legacy resolver ERRORED on these (fail-closed), so resolving them is an +// omni improvement, but an UNRESOLVABLE wild path must FAIL CLOSED too (the +// structural rule): silently yielding no columns would give the masker zero +// result maskers and return every output column unmasked. +func (w *spanWalker) resolveDotStar(expr ast.Node) []projColumn { + parts := exprToParts(expr) + if len(parts) == 0 { + return nil + } + // `rel.*`: a SINGLE-part qualifier naming a leaf relation reproduces that + // relation's projection. The single-part guard is load-bearing: a MULTI-part + // path whose head names a relation is a STRUCT-FIELD star through it + // (`d.s.*` — the star expands the struct field's sub-columns, which omni + // cannot enumerate metadata-free); returning the relation's whole projection + // would misalign the positional masker (gate-round finding: the first + // result's masker lands on the struct's first sub-column → leak). Those fall + // through to the fail-closed branch, matching the legacy resolver's error. 
+ if len(parts) == 1 { + if rel := findRelation(w.leafRels, parts[0]); rel != nil { + if rel.valueTable { + // An UNNEST/value relation's `elem.*` expands the ELEMENT's struct + // sub-fields — N engine output columns omni cannot enumerate; + // returning the relation's single projection column would shift + // every later position (re-verify finding). Fail closed. + w.failClosed(fmt.Errorf("cannot enumerate %s.* over a value table's element (fail closed)", parts[0])) + return nil + } + return rel.columns + } + } + // `schema.table.*` / `dataset.table.*`: the head does NOT name a relation, + // the trailing part names a base relation, and the written prefix matches the + // relation's schema OR database/dataset qualifier (non-empty — an unqualified + // FROM accepts no written prefix; the engine would reject the mismatched + // range variable anyway, and the legacy resolver errored on every + // schema-qualified star). + if len(parts) >= 2 && findRelation(w.leafRels, parts[0]) == nil { + last := parts[len(parts)-1] + if rel := findRelation(w.leafRels, last); rel != nil && rel.isBase { + prefix := parts[len(parts)-2] + schemaMatch := rel.baseRef.Schema != "" && strings.EqualFold(rel.baseRef.Schema, prefix) + databaseMatch := rel.baseRef.Database != "" && strings.EqualFold(rel.baseRef.Database, prefix) + if schemaMatch || databaseMatch { + return rel.columns + } + } + } + // Unresolvable wild path (a struct-path star or a qualifier naming nothing in + // FROM): fail closed rather than silently produce zero output columns. + w.failClosed(fmt.Errorf("cannot resolve %s.* to a FROM relation's columns (fail closed)", strings.Join(parts, "."))) + return nil +} + +// mergeProjections merges two set-operation arms' projections position-wise, +// reproducing the legacy extractTableSourceFromQuerySetOperation (fully resolve +// each arm, then zip: output names from the LEFT arm, sources unioned per +// position). 
Because omni — unlike the catalog-aware legacy resolver — cannot +// expand a base-table star before the merge, an arm carrying an un-enumerable +// star is handled by deferring the whole merge to the metadata-aware consumer. +// +// - Both arms fully concrete (no star markers): a plain position-wise union. +// Arity mismatch ⇒ left wins unchanged (the legacy length-mismatch guard; +// legacy errors on an unequal-width set op, which the masking layer rejects — +// omni's left-only best effort never DROPS a left column, so it does not +// under-attribute the surviving result). +// - Concrete LEFT + single base-star RIGHT (the common `concrete UNION ALL +// SELECT * FROM base`): keep the left columns (names from the left) and mark +// each with a starMerge so the consumer expands the right base table and +// unions its i-th column into position i. plain=false. +// - Single base-star LEFT + concrete RIGHT: symmetric, names from the expanded +// LEFT base table (legacy first-select-name) via starMerge{leftStar:true}. +// - Both single base-star OVER THE SAME TABLE: idempotent — the merge of a base +// table's star with itself is that same star (a plain passthrough). This is +// the recursive-CTE base-star shape (`(SELECT * FROM seed) UNION ALL SELECT * +// FROM r`, where the recursive self-reference resolves back to seed); legacy +// publishes seed's columns once (plain), not duplicated. +// - Any other star-involving combination (base-star × base-star over DIFFERENT +// tables, an EXCEPT/REPLACE starGroup arm, a nested set-op merge whose arm +// carries a starMerge, …): defer the whole merge to the consumer via a single +// setOpMerge element carrying both arms' wire-form projections. The consumer +// expands each arm fully and position-merges — the only place the star arities +// are known. This preserves every arm's lineage (no concatenation, no dropped +// star marker), closing the set-op masking leaks. 
+func mergeProjections(left, right []projColumn) []projColumn { + if len(right) == 0 { + return left + } + if len(left) == 0 { + return right + } + lStar := singleBaseStar(left) + rStar := singleBaseStar(right) + lConcrete := allConcrete(left) + rConcrete := allConcrete(right) + + switch { + case lConcrete && rConcrete: + if len(left) != len(right) { + return left + } + merged := make([]projColumn, len(left)) + for i := range left { + merged[i] = projColumn{ + name: left[i].name, + sources: unionColumnRefs(left[i].sources, right[i].sources), + plain: false, + } + } + return merged + case lConcrete && rStar != nil: + // Concrete LEFT, single base-star RIGHT: per-position starMerge. + merged := make([]projColumn, len(left)) + for i := range left { + merged[i] = projColumn{ + name: left[i].name, + sources: left[i].sources, + plain: false, + starMerge: &starMerge{table: *rStar, index: i}, + } + } + return merged + case lStar != nil && rConcrete: + // Single base-star LEFT, concrete RIGHT: per-position starMerge (names from + // the expanded left base table). + merged := make([]projColumn, len(right)) + for i := range right { + merged[i] = projColumn{ + name: "", // named from the expanded left base table by the consumer + sources: right[i].sources, + plain: false, + starMerge: &starMerge{table: *lStar, index: i, leftStar: true}, + } + } + return merged + case lStar != nil && rStar != nil && sameBaseTable(*lStar, *rStar): + // Both single base-star over the SAME table: idempotent passthrough (the + // recursive-CTE base-star shape). Keep one base-star, plain. + return []projColumn{{name: "*", plain: true, baseStar: lStar}} + default: + // Any other star-involving combination: defer the whole merge to the + // consumer, which expands each arm's stars and position-merges. 
+ return []projColumn{{ + name: "*", + setOpMerge: &setOpMerge{ + left: projColumnsToColumnInfos(left), + right: projColumnsToColumnInfos(right), + }, + }} + } +} + +// mergeProjectionsByName merges a BY NAME / CORRESPONDING set operation's arms +// by case-insensitive column NAME (fix F3), reproducing the engine's column +// matching: output order is the LEFT arm's columns (each unioning every +// same-named right column's lineage), followed by any right-only names (the +// FULL BY NAME shape; for the plain/LEFT variants a trailing extra never +// shifts an earlier position, so including it is over-attribution-safe). A +// non-empty matchColumns (`ON (cols)` / `BY (cols)`) restricts the output to +// exactly those columns, in list order. Merged columns are never plain. +// +// When either arm carries a non-concrete marker (a base-table star, a star +// group, or a nested merge) the name partition needs the arm's expanded column +// NAMES, which only the metadata-aware consumer knows — the whole merge defers +// via a single setOpMerge element with byName set (NEVER a silent ordinal +// merge, which is the verified mis-attribution this fix closes). 
+func mergeProjectionsByName(left, right []projColumn, matchColumns []string) []projColumn { + if len(right) == 0 { + return left + } + if len(left) == 0 { + return right + } + if !allConcrete(left) || !allConcrete(right) { + return []projColumn{{ + name: "*", + setOpMerge: &setOpMerge{ + left: projColumnsToColumnInfos(left), + right: projColumnsToColumnInfos(right), + byName: true, + matchColumns: matchColumns, + }, + }} + } + if len(matchColumns) > 0 { + out := make([]projColumn, 0, len(matchColumns)) + for _, m := range matchColumns { + out = append(out, projColumn{ + name: m, + sources: unionColumnRefs(namedSources(left, m), namedSources(right, m)), + plain: false, + }) + } + return out + } + out := make([]projColumn, 0, len(left)+len(right)) + for _, l := range left { + out = append(out, projColumn{ + name: l.name, + sources: unionColumnRefs(l.sources, namedSources(right, l.name)), + plain: false, + }) + } + for _, r := range right { + if !projHasName(left, r.name) { + out = append(out, projColumn{name: r.name, sources: r.sources, plain: false}) + } + } + return out +} + +// namedSources returns the union of the sources of every concrete projection +// column whose name matches `name` case-insensitively. +func namedSources(cols []projColumn, name string) []ColumnRef { + var out []ColumnRef + for _, c := range cols { + if strings.EqualFold(c.name, name) { + out = unionColumnRefs(out, c.sources) + } + } + return out +} + +// projHasName reports whether any projection column is named `name` +// (case-insensitive). 
+func projHasName(cols []projColumn, name string) bool { + for _, c := range cols { + if strings.EqualFold(c.name, name) { + return true + } + } + return false +} + +// coalesceUsingJoin builds the projection of `left JOIN right USING (keys)` +// (fix F1): each key ONCE, first, in USING order — its lineage the union of +// BOTH sides' key columns — then the left side's non-key columns (rewrapped +// non-plain, the legacy join-left rule), then the right side's (plainness +// kept). It returns ok=false when a side cannot be name-partitioned without +// metadata (a deferred set-op marker in its projection); the caller fails +// closed. +func coalesceUsingJoin(left, right *relation, keys []string) (*relation, bool) { + leftRest, ok := stripUsingKeys(left.columns, keys) + if !ok { + return nil, false + } + rightRest, ok := stripUsingKeys(right.columns, keys) + if !ok { + return nil, false + } + cols := make([]projColumn, 0, len(keys)+len(leftRest)+len(rightRest)) + for _, key := range keys { + cols = append(cols, projColumn{ + name: key, + sources: unionColumnRefs(usingKeyLineage(left.columns, key), usingKeyLineage(right.columns, key)), + plain: false, + baseFieldName: true, + }) + } + for _, c := range leftRest { + cols = append(cols, rewrapNonPlain(c)) + } + cols = append(cols, rightRest...) + return &relation{columns: cols}, true +} + +// stripUsingKeys returns a join side's projection with the USING key columns +// removed: a concrete column named like a key is dropped (it is owned by the +// coalesced key column); a base-table star carries the keys in its except list +// (the consumer's expansion skips them); a star group adds them to its EXCEPT +// modifier. ok=false when an element cannot be name-partitioned without +// metadata (a starMerge / setOpMerge marker, whose column names only the +// consumer knows). All returned elements are copies — the side relations stay +// leaf-resolvable with their own un-coalesced projections. 
+func stripUsingKeys(cols []projColumn, keys []string) ([]projColumn, bool) { + out := make([]projColumn, 0, len(cols)) + for _, c := range cols { + switch { + case c.starMerge != nil || c.setOpMerge != nil: + return nil, false + case c.starGroup != nil: + for _, s := range c.starGroup.segments { + if s.starMerge != nil || s.setOpMerge != nil || s.starGroup != nil { + return nil, false + } + } + c.starGroup = &starGroup{ + segments: append([]projColumn{}, c.starGroup.segments...), + except: appendNamesFold(c.starGroup.except, keys), + replace: c.starGroup.replace, + } + out = append(out, c) + case c.baseStar != nil: + c.baseStarExcept = appendNamesFold(c.baseStarExcept, keys) + out = append(out, c) + case nameMatchesAnyFold(c.name, keys): + // Dropped: this side's key column is folded into the coalesced key. + default: + out = append(out, c) + } + } + return out, true +} + +// usingKeyLineage returns the lineage a join side contributes to the USING key +// `key`: every concrete column named like it, every base-table star that may +// own it (the star's table with Column=key — when several stars could own the +// key, all contribute: over-attribution is safe, under is not), and a star +// group's segments likewise (its EXCEPT removing the key means the side does +// not project it; a REPLACE re-points it to the replacement's sources). +func usingKeyLineage(cols []projColumn, key string) []ColumnRef { + var out []ColumnRef + for _, c := range cols { + switch { + case c.starMerge != nil || c.setOpMerge != nil: + // Unreachable behind stripUsingKeys' ok=false; contribute nothing. 
+ case c.starGroup != nil: + if nameInListFold(c.starGroup.except, key) { + continue + } + if srcs, ok := replaceSourcesFor(c.starGroup.replace, key); ok { + out = unionColumnRefs(out, srcs) + continue + } + out = unionColumnRefs(out, usingKeyLineage(c.starGroup.segments, key)) + case c.baseStar != nil: + if nameInListFold(c.baseStarExcept, key) { + continue + } + ref := *c.baseStar + ref.Column = key + out = unionColumnRefs(out, []ColumnRef{ref}) + case strings.EqualFold(c.name, key): + out = unionColumnRefs(out, c.sources) + } + } + return out +} + +// replaceSourcesFor returns the replacement sources of the REPLACE item named +// `name` (case-insensitive), if any. +func replaceSourcesFor(replace []StarReplaceItem, name string) ([]ColumnRef, bool) { + for _, r := range replace { + if strings.EqualFold(r.Name, name) { + return r.Sources, true + } + } + return nil, false +} + +// rewrapNonPlain marks a join-left projection column non-plain (the legacy +// joinTable rewraps the anchor side without IsPlainField), including a star +// element's per-segment plainness (the wire carries plainness on the star / +// its segments). +func rewrapNonPlain(c projColumn) projColumn { + c.plain = false + if c.starGroup != nil { + segs := make([]projColumn, len(c.starGroup.segments)) + for i, s := range c.starGroup.segments { + s.plain = false + segs[i] = s + } + c.starGroup = &starGroup{segments: segs, except: c.starGroup.except, replace: c.starGroup.replace} + } + return c +} + +// nameInListFold reports whether list contains name under case-insensitive +// comparison. +func nameInListFold(list []string, name string) bool { + for _, e := range list { + if strings.EqualFold(e, name) { + return true + } + } + return false +} + +// nameMatchesAnyFold reports whether name matches any of names +// (case-insensitive). 
+func nameMatchesAnyFold(name string, names []string) bool { + for _, n := range names { + if strings.EqualFold(n, name) { + return true + } + } + return false +} + +// appendNamesFold returns a COPY of list with each name appended unless already +// present (case-insensitive). The copy keeps the caller's slice unaliased (a +// join side's projection is shared with its leaf relation). +func appendNamesFold(list, names []string) []string { + out := append([]string{}, list...) + for _, n := range names { + if !nameInListFold(out, n) { + out = append(out, n) + } + } + return out +} + +// allConcrete reports whether every element of a projection is a concrete output +// column (no baseStar / starGroup / starMerge / setOpMerge marker) — i.e. the +// projection can be position-merged inline without metadata. +func allConcrete(cols []projColumn) bool { + for _, c := range cols { + if c.baseStar != nil || c.starGroup != nil || c.starMerge != nil || c.setOpMerge != nil { + return false + } + } + return true +} + +// sameBaseTable reports whether two base-table refs name the same physical table +// (case-insensitive on every qualifier component, the GoogleSQL identifier rule). +func sameBaseTable(a, b ColumnRef) bool { + return strings.EqualFold(a.Catalog, b.Catalog) && + strings.EqualFold(a.Database, b.Database) && + strings.EqualFold(a.Schema, b.Schema) && + strings.EqualFold(a.Table, b.Table) +} + +// allColumns concatenates every relation's projection columns in FROM order +// (the legacy `fromFields`). +func allColumns(rels []*relation) []projColumn { + var out []projColumn + for _, rel := range rels { + out = append(out, rel.columns...) + } + return out +} + +// singleBaseStar returns the base-table ref when the projection is exactly a +// single baseStar element (an un-enumerable base table star), else nil. 
A star +// carrying a USING except list does not qualify — the starMerge / idempotent +// merge paths have no way to carry the exclusions, so such a projection takes +// the deferred-merge path instead (which preserves them in wire form). +func singleBaseStar(cols []projColumn) *ColumnRef { + if len(cols) == 1 && cols[0].baseStar != nil && cols[0].starMerge == nil && len(cols[0].baseStarExcept) == 0 { + return cols[0].baseStar + } + return nil +} + +// findRelation returns the FROM relation whose reference name matches `name` +// (case-insensitive, GoogleSQL identifier rule), or nil. +func findRelation(rels []*relation, name string) *relation { + for _, rel := range rels { + if rel.name != "" && strings.EqualFold(rel.name, name) { + return rel + } + } + return nil +} + +// projColumnsToColumnInfos converts a resolved top-level projection into the +// ColumnInfo wire form the metadata-aware consumer (the bytebase extractor) +// reads. A concrete column maps to a plain ColumnInfo; a base-table star (or a +// star item with modifiers / base segments) maps to a StarSegments item the +// consumer expands; a set-operation star-merge position carries StarMerge. 
+func projColumnsToColumnInfos(cols []projColumn) []ColumnInfo { + if cols == nil { + return nil + } + out := make([]ColumnInfo, 0, len(cols)) + for _, c := range cols { + switch { + case c.setOpMerge != nil: + out = append(out, ColumnInfo{ + Name: "*", + SetOpMerge: &SetOpMergeInfo{ + Left: c.setOpMerge.left, + Right: c.setOpMerge.right, + ByName: c.setOpMerge.byName, + MatchColumns: c.setOpMerge.matchColumns, + }, + }) + case c.starGroup != nil: + out = append(out, ColumnInfo{ + Name: "*", + StarSegments: segmentsToWire(c.starGroup.segments), + StarExcept: c.starGroup.except, + StarReplace: c.starGroup.replace, + }) + case c.baseStar != nil && c.starMerge == nil: + // A standalone base-table star (a bare `SELECT *` over a single base + // table, or a `rel.*` over a base table) — one segment to expand, + // skipping any USING-coalesced key columns. + ref := *c.baseStar + out = append(out, ColumnInfo{ + Name: "*", + StarSegments: []StarSegment{{BaseTable: &ref, ExceptColumns: c.baseStarExcept, Plain: c.plain}}, + }) + case c.starMerge != nil: + info := ColumnInfo{ + Name: c.name, + SourceColumns: c.sources, + IsPlain: c.plain, + StarMerge: &StarMergeInfo{ + Table: c.starMerge.table, + Index: c.starMerge.index, + LeftStar: c.starMerge.leftStar, + }, + } + out = append(out, info) + default: + out = append(out, ColumnInfo{ + Name: c.name, + SourceColumns: c.sources, + IsPlain: c.plain, + BaseFieldName: c.baseFieldName, + }) + } + } + return out +} + +// segmentsToWire converts a star item's resolved expansion segments into the +// StarSegment wire form: a baseStar element becomes a BaseTable segment (the +// consumer enumerates the table's columns), a concrete element becomes a resolved +// (Name, Sources, Plain) segment the consumer emits directly. 
+func segmentsToWire(segs []projColumn) []StarSegment {
+	wire := make([]StarSegment, 0, len(segs))
+	for i := range segs {
+		seg := segs[i]
+		if seg.baseStar == nil {
+			// Concrete segment: already resolved, emitted as-is.
+			wire = append(wire, StarSegment{
+				Name:          seg.name,
+				Sources:       seg.sources,
+				Plain:         seg.plain,
+				BaseFieldName: seg.baseFieldName,
+			})
+			continue
+		}
+		// Base-table segment: point at a copy of the table ref.
+		table := *seg.baseStar
+		wire = append(wire, StarSegment{BaseTable: &table, ExceptColumns: seg.baseStarExcept, Plain: seg.plain})
+	}
+	return wire
+}
+
+// collectColumnPaths returns the raw dotted-name component lists of every column
+// reference directly mentioned in expr (in source order, excluding nested
+// subqueries), mirroring the legacy getPossibleColumnResources. The flat parts —
+// not a dialect-bucketed ColumnRef — are what relation-aware resolution needs (a
+// 2-part `rel.col` must be tried as relation.column, not dataset.column). A nil
+// expr yields nil.
+func (w *spanWalker) collectColumnPaths(expr ast.Node) [][]string {
+	if expr == nil {
+		return nil
+	}
+	var collected [][]string
+	record := func(parts []string) {
+		collected = append(collected, parts)
+	}
+	// followSub is false: the walk is asked not to follow nested subqueries.
+	walker := &exprWalk{w: w, followSub: false, onParts: record}
+	walker.walk(expr)
+	return collected
+}
+
+// exprToParts renders a dotted-name expression (Identifier / PathExpr /
+// FieldAccess chain, optionally parenthesized) into its component parts, or nil
+// if it is not a plain name chain. An empty identifier or an empty path also
+// yields nil; a PathExpr's parts are returned as a copy.
+func exprToParts(expr ast.Node) []string {
+	switch node := expr.(type) {
+	case *ast.ParenExpr:
+		return exprToParts(node.Expr)
+	case *ast.FieldAccess:
+		return flattenFieldAccess(node)
+	case *ast.PathExpr:
+		if len(node.Parts) > 0 {
+			return append([]string{}, node.Parts...)
+		}
+	case *ast.Identifier:
+		if node.Name != "" {
+			return []string{node.Name}
+		}
+	}
+	return nil
+}
diff --git a/googlesql/analysis/projection_test.go b/googlesql/analysis/projection_test.go
new file mode 100644
index 00000000..fc5dd5d3
--- /dev/null
+++ b/googlesql/analysis/projection_test.go
@@ -0,0 +1,772 @@
+package analysis
+
+import (
+	"sort"
+	"strings"
+	"testing"
+)
+
+// These tests pin the relation-projection resolution contract (the masking-grade
+// core): a `SELECT *` / `rel.*` / column reference over a CTE, a derived
+// subquery, or a join resolves to that relation's RESOLVED output projection —
+// concrete columns with base-column lineage, or a base-table-star segment the
+// catalog-aware consumer expands — and a set operation position-merges resolved
+// projections. They assert the omni-side ColumnInfo wire form (the bytebase
+// extractor's behaviour on top of it is covered by the bigquery query-span
+// corpus, recorded from the legacy resolver).
+
+// resolvedSources renders one resolved ColumnInfo's source lineage as sorted
+// "table.column" strings, or just "column" for a source with an empty Table.
+// Only the Table and Column components are rendered.
+func resolvedSources(info ColumnInfo) []string {
+	var rendered []string
+	for _, src := range info.SourceColumns {
+		label := src.Column
+		if src.Table != "" {
+			label = src.Table + "." + src.Column
+		}
+		rendered = append(rendered, label)
+	}
+	sort.Strings(rendered)
+	return rendered
+}
+
+// TestProjection_CTESubsetThenStar: a CTE projecting a SUBSET of a wider base
+// table, read by `SELECT *`, resolves the star to ONLY the CTE's projected column
+// (with base lineage) — not the base table's other columns. This is the core
+// under-attribution guard: a `*` over `WITH c AS (SELECT email FROM users)` must
+// surface only email, never ssn/name.
+func TestProjection_CTESubsetThenStar(t *testing.T) { + span, err := GetQuerySpan("WITH c AS (SELECT email FROM users) SELECT * FROM c", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 1 { + t.Fatalf("got %d results, want 1 (the CTE projects exactly email)", len(span.Results)) + } + r := span.Results[0] + if len(r.StarSegments) != 0 { + t.Errorf("result should be a resolved concrete column, not a star-segment item; got %d segments", len(r.StarSegments)) + } + if !strings.EqualFold(r.Name, "email") { + t.Errorf("name = %q, want email", r.Name) + } + if r.IsPlain { + t.Errorf("IsPlain = true, want false (a CTE column from an explicit select is not a plain field)") + } + if got := resolvedSources(r); !eqStrings(got, []string{"users.email"}) { + t.Errorf("sources = %v, want [users.email] (the sole-base-relation attribution ties the bare column to its FROM table)", got) + } +} + +// TestProjection_DerivedStar: a `SELECT *` over a derived subquery reproduces the +// subquery's resolved projection (its explicit columns), not the base table's +// columns. +func TestProjection_DerivedStar(t *testing.T) { + span, err := GetQuerySpan("SELECT * FROM (SELECT a, b FROM t) d", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resultNames(span); !eqStrings(got, []string{"a", "b"}) { + t.Errorf("results = %v, want [a b] (the derived projection, not t's full columns)", got) + } + for i, r := range span.Results { + if len(r.StarSegments) != 0 { + t.Errorf("result[%d] should be concrete, got a star-segment item", i) + } + } +} + +// TestProjection_DerivedQualifiedColumnAndStar pins the "Table subquery" shape: a +// qualified reference through a derived relation that wraps a base-table star +// (`result.ID` over `(SELECT * FROM A)` where A = `SELECT * FROM people`) +// resolves to the base column people.ID, while a sibling `*` reproduces the whole +// base-table star (one StarSegment over people). 
+func TestProjection_DerivedQualifiedColumnAndStar(t *testing.T) { + span, err := GetQuerySpan("WITH A AS (SELECT * FROM people) SELECT result.ID, * FROM (SELECT * FROM A) result", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 2 { + t.Fatalf("got %d results, want 2 (result.ID + the star)", len(span.Results)) + } + // result.ID resolves through the derived relation and CTE A to people.ID. + idCol := span.Results[0] + if !strings.EqualFold(idCol.Name, "ID") { + t.Errorf("result[0] name = %q, want ID", idCol.Name) + } + if got := resolvedSources(idCol); !eqStrings(got, []string{"people.ID"}) { + t.Errorf("result.ID sources = %v, want [people.ID]", got) + } + // The sibling * is a single base-table-star segment over people (the consumer + // enumerates its columns). + star := span.Results[1] + if len(star.StarSegments) != 1 || star.StarSegments[0].BaseTable == nil { + t.Fatalf("result[1] should be a single base-table-star segment, got %+v", star.StarSegments) + } + if !strings.EqualFold(star.StarSegments[0].BaseTable.Table, "people") { + t.Errorf("star segment base table = %q, want people", star.StarSegments[0].BaseTable.Table) + } + if !star.StarSegments[0].Plain { + t.Errorf("a base-table * passthrough column should be a plain field") + } +} + +// TestProjection_SetOpMergesResolvedCTEs: a set operation of two subset-projecting +// CTEs position-merges their resolved projections — output i draws from BOTH arms +// at position i — with names from the left arm. 
+func TestProjection_SetOpMergesResolvedCTEs(t *testing.T) { + sql := "WITH a AS (SELECT id, name FROM users), b AS (SELECT pid, label FROM members) " + + "SELECT * FROM a UNION ALL SELECT * FROM b" + span, err := GetQuerySpan(sql, DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resultNames(span); !eqStrings(got, []string{"id", "name"}) { + t.Fatalf("names = %v, want [id name] (left arm names)", got) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"members.pid", "users.id"}) { + t.Errorf("result[0] sources = %v, want [members.pid users.id] (both arms position 0, table-attributed)", got) + } + if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"members.label", "users.name"}) { + t.Errorf("result[1] sources = %v, want [members.label users.name] (both arms position 1, table-attributed)", got) + } +} + +// TestProjection_SetOpConcreteAndBaseStarArm pins the StarMerge contract: a +// `concrete UNION ALL (SELECT * FROM base)` keeps the concrete arm's per-position +// lineage AND attaches a StarMerge so the consumer expands the base table and +// position-merges its i-th column (omni cannot know the base table's arity). 
+func TestProjection_SetOpConcreteAndBaseStarArm(t *testing.T) { + span, err := GetQuerySpan("SELECT a, b FROM t1 UNION ALL SELECT * FROM t2", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 2 { + t.Fatalf("got %d results, want 2 (left arm arity)", len(span.Results)) + } + for i, r := range span.Results { + if r.StarMerge == nil { + t.Fatalf("result[%d] should carry a StarMerge for the base-star arm", i) + } + if !strings.EqualFold(r.StarMerge.Table.Table, "t2") { + t.Errorf("result[%d] StarMerge table = %q, want t2", i, r.StarMerge.Table.Table) + } + if r.StarMerge.Index != i { + t.Errorf("result[%d] StarMerge index = %d, want %d", i, r.StarMerge.Index, i) + } + if r.StarMerge.LeftStar { + t.Errorf("result[%d] StarMerge.LeftStar = true, want false (the star arm is the RIGHT one)", i) + } + } +} + +// TestProjection_BaseTableStarSegment: a bare `SELECT *` over a single base table +// is a single base-table-star segment (the consumer enumerates the columns), not +// a flat set of concrete columns omni cannot know. +func TestProjection_BaseTableStarSegment(t *testing.T) { + span, err := GetQuerySpan("SELECT * FROM people", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 1 || len(span.Results[0].StarSegments) != 1 { + t.Fatalf("want 1 star-segment result, got %+v", span.Results) + } + seg := span.Results[0].StarSegments[0] + if seg.BaseTable == nil || !strings.EqualFold(seg.BaseTable.Table, "people") { + t.Errorf("segment = %+v, want a base-table segment over people", seg) + } +} + +// TestProjection_StarOverJoinSegmentsAndPlainness: a bare `*` over a JOIN of base +// tables yields one base-table-star item per side, in FROM order (a modifier-free +// star inlines its segments), with the legacy join plain-field rule — the LEFT/ +// anchor side's columns are NOT plain, the right side's are. 
+func TestProjection_StarOverJoinSegmentsAndPlainness(t *testing.T) { + span, err := GetQuerySpan("SELECT * FROM a JOIN b ON a.id = b.aid", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + // A modifier-free `*` over a join inlines one base-table-star item per side. + if len(span.Results) != 2 { + t.Fatalf("got %d results, want 2 (one base-table star per join side)", len(span.Results)) + } + left := span.Results[0] + right := span.Results[1] + if len(left.StarSegments) != 1 || left.StarSegments[0].BaseTable == nil || + !strings.EqualFold(left.StarSegments[0].BaseTable.Table, "a") || left.StarSegments[0].Plain { + t.Errorf("result[0] = %+v, want a base table a star NOT plain (join anchor/left side is rewrapped)", left.StarSegments) + } + if len(right.StarSegments) != 1 || right.StarSegments[0].BaseTable == nil || + !strings.EqualFold(right.StarSegments[0].BaseTable.Table, "b") || !right.StarSegments[0].Plain { + t.Errorf("result[1] = %+v, want a base table b star plain (right side keeps plainness)", right.StarSegments) + } +} + +// TestProjection_QualifiedColumnThroughCTE: a qualified `alias.col` over a CTE +// reference resolves to the CTE column's lineage, even when the CTE is self-joined +// under two aliases. The CTE column's lineage is the body's resolved source — here +// a bare `id` / `parent_id` (the CTE body selected them over the base table +// `nodes`, whose columns omni does not enumerate), which the catalog-aware +// consumer matches back to nodes.id / nodes.parent_id (verified in the bigquery +// corpus). The point pinned here is that `x.id` resolves THROUGH the CTE relation +// (not to an empty/aliased ref). 
+func TestProjection_QualifiedColumnThroughCTE(t *testing.T) {
+	sql := "WITH c AS (SELECT id, parent_id FROM nodes) " +
+		"SELECT x.id, y.parent_id FROM c x JOIN c y ON x.parent_id = y.id"
+	span, err := GetQuerySpan(sql, DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	// Guard the indexing below so a short result reports a failure instead of
+	// panicking with an index-out-of-range.
+	if len(span.Results) != 2 {
+		t.Fatalf("got %d results, want 2 (x.id + y.parent_id), results=%+v", len(span.Results), span.Results)
+	}
+	if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"nodes.id"}) {
+		t.Errorf("x.id sources = %v, want [nodes.id] (CTE column attributed to its base table)", got)
+	}
+	if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"nodes.parent_id"}) {
+		t.Errorf("y.parent_id sources = %v, want [nodes.parent_id] (CTE column attributed to its base table)", got)
+	}
+}
+
+// TestProjection_RecursiveCTEFixpoint: a recursive CTE's projection resolves the
+// recursive arm's self-references against the anchor and iterates to a fixpoint,
+// so a column whose recursive expression reads another column accumulates that
+// column's lineage (c3 = c3*c2 picks up c2's {a,b}).
+func TestProjection_RecursiveCTEFixpoint(t *testing.T) {
+	sql := "WITH RECURSIVE c AS (" +
+		"(SELECT a AS c1, b AS c2, c AS c3 FROM t) " +
+		"UNION ALL SELECT c1*c2, c2+c1, c3*c2 FROM c" +
+		") SELECT c1, c2, c3 FROM c"
+	span, err := GetQuerySpan(sql, DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	if len(span.Results) != 3 {
+		t.Fatalf("got %d results, want 3", len(span.Results))
+	}
+	// c1 = c1*c2 → {a,b}; c2 = c2+c1 → {a,b}; c3 = c3*c2 → {a,b,c} via the fixpoint
+	// (c2 grew to {a,b} in an earlier pass, then c3 reads c2 and picks up a).
+	// The failure messages spell the table-qualified form that is actually
+	// compared, so a failing run prints a "want" that matches the assertion.
+	if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t.a", "t.b"}) {
+		t.Errorf("c1 sources = %v, want [t.a t.b]", got)
+	}
+	if got := resolvedSources(span.Results[2]); !eqStrings(got, []string{"t.a", "t.b", "t.c"}) {
+		t.Errorf("c3 sources = %v, want [t.a t.b t.c] (fixpoint propagates c2's growth)", got)
+	}
+}
+
+// TestProjection_StructFieldRootIsColumn: a `s.field` / `s.a.b` where `s` is not a
+// FROM relation resolves `s` as a (struct) column of the base table, dropping the
+// trailing field path — the legacy a-as-column field-path behaviour.
+func TestProjection_StructFieldRootIsColumn(t *testing.T) {
+	for _, sql := range []string{"SELECT s.field FROM t", "SELECT s.a.b FROM t"} {
+		span, err := GetQuerySpan(sql, DialectBigQuery)
+		if err != nil {
+			t.Fatalf("%q error: %v", sql, err)
+		}
+		if len(span.Results) != 1 {
+			t.Fatalf("%q got %d results, want 1", sql, len(span.Results))
+		}
+		if !strings.EqualFold(span.Results[0].Name, "s") {
+			t.Errorf("%q name = %q, want s (the struct column root)", sql, span.Results[0].Name)
+		}
+		if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t.s"}) {
+			t.Errorf("%q sources = %v, want [t.s] (struct root attributed to its sole base relation)", sql, got)
+		}
+	}
+}
+
+// TestProjection_QualifiedColumnKeepsDialectBuckets: a fully-qualified column whose
+// qualifier names a FROM base table (`proj.ds.t.c`) keeps its dialect-bucketed
+// qualifier so it lines up with that table's access, rather than being misread as
+// a struct-column root.
+func TestProjection_QualifiedColumnKeepsDialectBuckets(t *testing.T) { + span, err := GetQuerySpan("SELECT proj.ds.t.c FROM proj.ds.t", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 1 || len(span.Results[0].SourceColumns) != 1 { + t.Fatalf("want 1 result with 1 source, got %+v", span.Results) + } + c := span.Results[0].SourceColumns[0] + if c.Catalog != "proj" || c.Database != "ds" || c.Table != "t" || c.Column != "c" { + t.Errorf("ref = {Catalog:%q Database:%q Schema:%q Table:%q Column:%q}, want {proj ds \"\" t c}", + c.Catalog, c.Database, c.Schema, c.Table, c.Column) + } +} + +// TestProjection_CTEReferenceRecorded: a CTE reference (`FROM c`) is recorded in +// CTEReferences (not AccessTables), so the consumer can surface it in the +// table-level source set while never column-expanding it. +func TestProjection_CTEReferenceRecorded(t *testing.T) { + span, err := GetQuerySpan("WITH c AS (SELECT id FROM users) SELECT id FROM c", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := tableNames(span); !eqStrings(got, []string{"users"}) { + t.Errorf("AccessTables = %v, want [users] (the CTE reference c is NOT a base table)", got) + } + if len(span.CTEReferences) != 1 || !strings.EqualFold(span.CTEReferences[0].Table, "c") { + t.Errorf("CTEReferences = %+v, want one reference to c", span.CTEReferences) + } +} + +// TestProjection_LambdaParamExcludedFromResolvedLineage guards the resolver path +// of the lambda-parameter exclusion: a bound lambda parameter must not appear in +// a resolved select item's source lineage (only the free columns do). 
+func TestProjection_LambdaParamExcludedFromResolvedLineage(t *testing.T) {
+	span, err := GetQuerySpan("SELECT ARRAY_TRANSFORM(arr, e -> e + threshold) AS r FROM t", DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	// Guard the indexing below so an empty result reports a failure instead of
+	// panicking with an index-out-of-range.
+	if len(span.Results) != 1 {
+		t.Fatalf("got %d results, want 1 (the single select item r), results=%+v", len(span.Results), span.Results)
+	}
+	if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t.arr", "t.threshold"}) {
+		t.Errorf("sources = %v, want [t.arr t.threshold] (lambda param e excluded; table-attributed)", got)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Fix F1: SELECT * over JOIN ... USING coalesces the key columns.
+//
+// GoogleSQL's `SELECT *` over `lt JOIN rt USING (k)` returns the key column
+// ONCE (first, coalesced from both sides), then the left side's non-key
+// columns, then the right side's. Concatenating both sides' stars instead
+// shifts every position after the key — the positional masker then applies
+// rt.k's policy to the column that actually holds rt's first NON-key (secret)
+// column: a masking under-attribution leak. The legacy resolver coalesced only
+// upper-case-written keys (a case-fold bug — `USING (k)` leaked); omni
+// coalesces case-insensitively (GoogleSQL identifiers are case-insensitive),
+// which is strictly safer, so the lowercase shape is pinned HERE (it cannot be
+// corpus-recorded from legacy without recording the leak).
+// ---------------------------------------------------------------------------
+
+// TestProjection_JoinUsingStarCoalescesLowercase pins the F1 wire shape over
+// base tables with a lowercase key: one concrete coalesced key column (lineage
+// from BOTH sides), then one base-table star per side carrying the key in its
+// ExceptColumns so the consumer's expansion skips it.
+func TestProjection_JoinUsingStarCoalescesLowercase(t *testing.T) {
+	span, err := GetQuerySpan("SELECT * FROM lt JOIN rt USING (k)", DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	if len(span.Results) != 3 {
+		t.Fatalf("got %d results, want 3 (coalesced k + lt-star + rt-star), results=%+v", len(span.Results), span.Results)
+	}
+	// Position 0: the coalesced key, a concrete (non-star) column.
+	key := span.Results[0]
+	if key.StarSegments != nil || !strings.EqualFold(key.Name, "k") {
+		t.Fatalf("result[0] = %+v, want a concrete coalesced column k", key)
+	}
+	if got := resolvedSources(key); !eqStrings(got, []string{"lt.k", "rt.k"}) {
+		t.Errorf("coalesced k sources = %v, want [lt.k rt.k] (BOTH sides)", got)
+	}
+	if key.IsPlain {
+		t.Errorf("coalesced key must not be a plain field")
+	}
+	// Positions 1 and 2: one base-table star per side, each excluding the key.
+	left, right := span.Results[1], span.Results[2]
+	if len(left.StarSegments) != 1 || left.StarSegments[0].BaseTable == nil ||
+		!strings.EqualFold(left.StarSegments[0].BaseTable.Table, "lt") {
+		t.Fatalf("result[1] = %+v, want a base-table star over lt", left.StarSegments)
+	}
+	if got := left.StarSegments[0].ExceptColumns; len(got) != 1 || !strings.EqualFold(got[0], "k") {
+		t.Errorf("lt star ExceptColumns = %v, want [k] (the coalesced key is excluded)", got)
+	}
+	if left.StarSegments[0].Plain {
+		t.Errorf("join-left star must be rewrapped non-plain")
+	}
+	if len(right.StarSegments) != 1 || right.StarSegments[0].BaseTable == nil ||
+		!strings.EqualFold(right.StarSegments[0].BaseTable.Table, "rt") {
+		t.Fatalf("result[2] = %+v, want a base-table star over rt", right.StarSegments)
+	}
+	if got := right.StarSegments[0].ExceptColumns; len(got) != 1 || !strings.EqualFold(got[0], "k") {
+		t.Errorf("rt star ExceptColumns = %v, want [k]", got)
+	}
+	if !right.StarSegments[0].Plain {
+		t.Errorf("join-right star keeps plainness")
+	}
+}
+
+// TestProjection_JoinUsingCaseInsensitive: the key match is case-insensitive
+// (GoogleSQL identifier rule): an uppercase-written `USING (K)` coalesces the
+// same way (this is also the legacy-recordable corpus shape).
+func TestProjection_JoinUsingCaseInsensitive(t *testing.T) {
+	span, err := GetQuerySpan("SELECT * FROM lt JOIN rt USING (K)", DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	if len(span.Results) != 3 {
+		t.Fatalf("got %d results, want 3, results=%+v", len(span.Results), span.Results)
+	}
+	if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"lt.K", "rt.K"}) {
+		t.Errorf("coalesced K sources = %v, want [lt.K rt.K]", got)
+	}
+}
+
+// TestProjection_JoinUsingMultiKeyAndChain: multi-key USING coalesces each key
+// (omni parses `USING (a, b)`; legacy could not), and chained joins compose —
+// the outer join's keys come first, then the left side's remaining columns
+// (which start with the inner join's coalesced keys), then the right side's.
+func TestProjection_JoinUsingMultiKeyAndChain(t *testing.T) {
+	span, err := GetQuerySpan("SELECT * FROM a JOIN b USING (k, j) JOIN c USING (m)", DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	if len(span.Results) != 6 {
+		t.Fatalf("got %d results, want 6 (m, k, j, a*, b*, c*), results=%+v", len(span.Results), span.Results)
+	}
+	if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"a.m", "b.m", "c.m"}) {
+		t.Errorf("outer key m sources = %v, want [a.m b.m c.m] (left side's stars may each own m; over-attribution is safe)", got)
+	}
+	if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"a.k", "b.k"}) {
+		t.Errorf("inner key k sources = %v, want [a.k b.k]", got)
+	}
+	if got := resolvedSources(span.Results[2]); !eqStrings(got, []string{"a.j", "b.j"}) {
+		t.Errorf("inner key j sources = %v, want [a.j b.j]", got)
+	}
+	wantExcept := map[int][]string{3: {"k", "j", "m"}, 4: {"k", "j", "m"}, 5: {"m"}}
+	wantTable := map[int]string{3: "a", 4: "b", 5: "c"}
+	for i := 3; i <= 5; i++ {
+		r := span.Results[i]
+		if len(r.StarSegments) != 1 || r.StarSegments[0].BaseTable == nil ||
+			!strings.EqualFold(r.StarSegments[0].BaseTable.Table, wantTable[i]) {
+			t.Fatalf("result[%d] = %+v, want base star over %s", i, r.StarSegments, wantTable[i])
+		}
+		// Compare the except lists as lower-cased, sorted sets, on copies so
+		// neither the result nor the expectation table is mutated.
+		got := append([]string{}, r.StarSegments[0].ExceptColumns...)
+		want := append([]string{}, wantExcept[i]...)
+		// j, not i: the result index i is still needed by the message below.
+		for j := range got {
+			got[j] = strings.ToLower(got[j])
+		}
+		sort.Strings(got)
+		sort.Strings(want)
+		if !eqStrings(got, want) {
+			t.Errorf("result[%d] ExceptColumns = %v, want %v", i, got, want)
+		}
+	}
+}
+
+// TestProjection_JoinUsingConcreteSides: USING over two CONCRETE (CTE) sides
+// name-filters inline — the coalesced key unions both CTE columns' base
+// lineage and each side's non-key columns follow.
+func TestProjection_JoinUsingConcreteSides(t *testing.T) {
+	sql := "WITH lc AS (SELECT lk AS k, note FROM t1), rc AS (SELECT rk AS k, sec FROM t2) " +
+		"SELECT * FROM lc JOIN rc USING (k)"
+	span, err := GetQuerySpan(sql, DialectBigQuery)
+	if err != nil {
+		t.Fatalf("error: %v", err)
+	}
+	// Fatal on a name mismatch: the per-column checks below index Results.
+	if got := resultNames(span); !eqStrings(got, []string{"k", "note", "sec"}) {
+		t.Fatalf("names = %v, want [k note sec]", got)
+	}
+	// The failure messages spell the table-qualified form that is actually
+	// compared, so a failing run prints a "want" that matches the assertion.
+	if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t1.lk", "t2.rk"}) {
+		t.Errorf("k sources = %v, want [t1.lk t2.rk]", got)
+	}
+	if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"t1.note"}) {
+		t.Errorf("note sources = %v, want [t1.note]", got)
+	}
+	if got := resolvedSources(span.Results[2]); !eqStrings(got, []string{"t2.sec"}) {
+		t.Errorf("sec sources = %v, want [t2.sec]", got)
+	}
+}
+
+// TestProjection_JoinUsingStarGroupSide: a USING side that is an EXCEPT-star
+// derived table keeps its star-group (the key joins its EXCEPT set; the key's
+// lineage is read from the group's segments).
+func TestProjection_JoinUsingStarGroupSide(t *testing.T) { + span, err := GetQuerySpan("SELECT * FROM (SELECT * EXCEPT (x) FROM a) d JOIN b USING (k)", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 3 { + t.Fatalf("got %d results, want 3 (k + d's star-group + b's star), results=%+v", len(span.Results), span.Results) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"a.k", "b.k"}) { + t.Errorf("k sources = %v, want [a.k b.k]", got) + } + grp := span.Results[1] + if len(grp.StarSegments) != 1 || grp.StarSegments[0].BaseTable == nil || + !strings.EqualFold(grp.StarSegments[0].BaseTable.Table, "a") { + t.Fatalf("result[1] = %+v, want d's star-group over a", grp) + } + gotExcept := append([]string{}, grp.StarExcept...) + for i := range gotExcept { + gotExcept[i] = strings.ToLower(gotExcept[i]) + } + sort.Strings(gotExcept) + if !eqStrings(gotExcept, []string{"k", "x"}) { + t.Errorf("d star-group EXCEPT = %v, want [k x] (original EXCEPT + the coalesced key)", gotExcept) + } +} + +// TestProjection_NaturalJoinFailsClosed: both BigQuery and Spanner reject +// NATURAL JOIN at analysis, and omni cannot know the shared columns without a +// catalog — silent concatenation would misalign positions, so the analysis +// fails closed (structural rule: correct lineage or an error, never silent +// misalignment for shapes legacy could not parse). +func TestProjection_NaturalJoinFailsClosed(t *testing.T) { + if _, err := GetQuerySpan("SELECT * FROM a NATURAL JOIN b", DialectBigQuery); err == nil { + t.Fatalf("NATURAL JOIN: want a fail-closed error, got nil") + } +} + +// TestProjection_JoinUsingDeferredSideFailsClosed: a USING side whose +// projection carries a deferred set-operation marker (its arity is known only +// to the metadata-aware consumer) cannot be name-partitioned here; the +// analysis fails closed rather than emitting misaligned lineage. 
+func TestProjection_JoinUsingDeferredSideFailsClosed(t *testing.T) { + sql := "SELECT * FROM (SELECT a, b FROM t UNION ALL SELECT * FROM u) d JOIN r USING (a)" + if _, err := GetQuerySpan(sql, DialectBigQuery); err == nil { + t.Fatalf("USING over a deferred set-op side: want a fail-closed error, got nil") + } +} + +// --------------------------------------------------------------------------- +// Fix F2: UNNEST table sources produce a value relation with element lineage. +// +// `FROM victim, UNNEST(victim.secret_tokens) AS elem` projects elem as a real +// output column whose data IS victim.secret_tokens' elements. Dropping the +// relation made `SELECT elem` resolve to an empty/unmatched lineage — the +// fail-open masker then returned the secret unmasked. (The legacy extractor +// ERRORED on UNNEST sources = fail-closed; omni resolves the lineage.) +// --------------------------------------------------------------------------- + +// TestProjection_UnnestElementLineage pins the core F2 contract. +func TestProjection_UnnestElementLineage(t *testing.T) { + span, err := GetQuerySpan("SELECT elem FROM victim, UNNEST(victim.secret_tokens) AS elem", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 1 { + t.Fatalf("got %d results, want 1, results=%+v", len(span.Results), span.Results) + } + r := span.Results[0] + if !strings.EqualFold(r.Name, "elem") { + t.Errorf("name = %q, want elem", r.Name) + } + if got := resolvedSources(r); !eqStrings(got, []string{"victim.secret_tokens"}) { + t.Errorf("elem sources = %v, want [victim.secret_tokens]", got) + } + if got := tableNames(span); !eqStrings(got, []string{"victim"}) { + t.Errorf("tables = %v, want [victim]", got) + } +} + +// TestProjection_UnnestStarAndOffset: a bare `*` includes the unnest element +// column (after the base table's star) and the WITH OFFSET companion column; +// the offset is positional, not data, so it has NO lineage. 
+func TestProjection_UnnestStarAndOffset(t *testing.T) { + span, err := GetQuerySpan("SELECT * FROM victim, UNNEST(victim.secret_tokens) AS elem WITH OFFSET AS pos", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 3 { + t.Fatalf("got %d results, want 3 (victim* + elem + pos), results=%+v", len(span.Results), span.Results) + } + if len(span.Results[0].StarSegments) != 1 || span.Results[0].StarSegments[0].BaseTable == nil { + t.Fatalf("result[0] = %+v, want victim's base star", span.Results[0]) + } + if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"victim.secret_tokens"}) { + t.Errorf("elem sources = %v, want [victim.secret_tokens]", got) + } + if !strings.EqualFold(span.Results[2].Name, "pos") || len(span.Results[2].SourceColumns) != 0 { + t.Errorf("result[2] = %+v, want offset column pos with NO lineage", span.Results[2]) + } +} + +// TestProjection_UnnestImplicitAliasAndLiteral: an unaliased UNNEST over a +// column path takes the path's last component as the implicit element name +// (the GoogleSQL implicit-alias rule), and an UNNEST over a literal array has +// a RESOLVED-empty lineage (not an unresolved name to be matched fail-open). 
+func TestProjection_UnnestImplicitAliasAndLiteral(t *testing.T) { + span, err := GetQuerySpan("SELECT secret_tokens FROM victim, UNNEST(victim.secret_tokens)", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"victim.secret_tokens"}) { + t.Errorf("implicit-alias element sources = %v, want [victim.secret_tokens]", got) + } + + span2, err := GetQuerySpan("SELECT e FROM UNNEST([1, 2, 3]) AS e", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span2.Results) != 1 || len(span2.Results[0].SourceColumns) != 0 { + t.Errorf("literal-array element = %+v, want one column with empty (resolved) lineage", span2.Results) + } + if len(span2.AccessTables) != 0 { + t.Errorf("tables = %v, want [] (a literal array reads no table)", tableNames(span2)) + } +} + +// TestProjection_CorrelatedArrayPathRelation: the implicit-UNNEST comma form +// (`FROM t, t.arr AS a`) builds the same value relation, and a struct-field +// reference through the element alias (`a.child`) resolves to the element's +// array lineage. +func TestProjection_CorrelatedArrayPathRelation(t *testing.T) { + span, err := GetQuerySpan("SELECT a FROM t, t.arr AS a", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t.arr"}) { + t.Errorf("element sources = %v, want [t.arr]", got) + } + + span2, err := GetQuerySpan("SELECT elem.child FROM victim, UNNEST(victim.kids) AS elem", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resolvedSources(span2.Results[0]); !eqStrings(got, []string{"victim.kids"}) { + t.Errorf("value-field sources = %v, want [victim.kids] (field access reads the element)", got) + } +} + +// --------------------------------------------------------------------------- +// Fix F3: BY NAME / CORRESPONDING set operations merge arms by column NAME. 
+// +// `UNION ALL BY NAME` matches arm columns by (case-insensitive) name, not by +// ordinal. Merging by ordinal attributed output `id` to the right arm's +// same-POSITION column (a secret) and vice versa — under-attribution both +// ways. Legacy could not parse BY NAME at all (fail-closed); omni must merge +// by name (or defer to the metadata-aware consumer when an arm carries an +// un-enumerable star). +// --------------------------------------------------------------------------- + +// TestProjection_SetOpByNameConcrete: both arms concrete → inline name merge; +// output order = left arm's order; right-only names append (FULL BY NAME). +func TestProjection_SetOpByNameConcrete(t *testing.T) { + sql := "WITH l AS (SELECT a AS id, b AS sec FROM t1), r AS (SELECT c AS sec, d AS id FROM t2) " + + "SELECT * FROM l UNION ALL BY NAME SELECT * FROM r" + span, err := GetQuerySpan(sql, DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resultNames(span); !eqStrings(got, []string{"id", "sec"}) { + t.Fatalf("names = %v, want [id sec] (left arm order)", got) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t1.a", "t2.d"}) { + t.Errorf("id sources = %v, want [t1.a t2.d] (NAME-matched, not ordinal)", got) + } + if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"t1.b", "t2.c"}) { + t.Errorf("sec sources = %v, want [t1.b t2.c]", got) + } +} + +// TestProjection_SetOpByNameRightOnlyAppends: BY NAME appends right-only names +// after the left arm's columns (over-attribution-safe: a trailing extra never +// shifts earlier positions; with mismatched name sets BigQuery errors at +// execution anyway, so the extra entry is conservative). The FULL/LEFT prefix +// spelling (`FULL UNION ALL BY NAME`) is NOT parsed by the omni parser — it +// fail-closes at parse time — so only the bare BY NAME form reaches here. 
+func TestProjection_SetOpByNameRightOnlyAppends(t *testing.T) { + sql := "WITH l AS (SELECT a AS id FROM t1), r AS (SELECT d AS id, e AS extra FROM t2) " + + "SELECT * FROM l UNION ALL BY NAME SELECT * FROM r" + span, err := GetQuerySpan(sql, DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resultNames(span); !eqStrings(got, []string{"id", "extra"}) { + t.Fatalf("names = %v, want [id extra]", got) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t1.a", "t2.d"}) { + t.Errorf("id sources = %v, want [t1.a t2.d]", got) + } + if got := resolvedSources(span.Results[1]); !eqStrings(got, []string{"t2.e"}) { + t.Errorf("extra sources = %v, want [t2.e]", got) + } +} + +// TestProjection_SetOpByNameMatchColumns: `BY NAME ON (cols)` restricts the +// output to the listed columns, in list order (each unioning both arms' +// same-named lineage). +func TestProjection_SetOpByNameMatchColumns(t *testing.T) { + sql := "WITH l AS (SELECT a AS id, b AS sec FROM t1), r AS (SELECT c AS sec, d AS id FROM t2) " + + "SELECT * FROM l UNION ALL BY NAME ON (sec) SELECT * FROM r" + span, err := GetQuerySpan(sql, DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resultNames(span); !eqStrings(got, []string{"sec"}) { + t.Fatalf("names = %v, want [sec]", got) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t1.b", "t2.c"}) { + t.Errorf("sec sources = %v, want [t1.b t2.c]", got) + } +} + +// TestProjection_SetOpCorrespondingMergesByName: CORRESPONDING is the synonym +// family of BY NAME — same name-merge semantics. 
+func TestProjection_SetOpCorrespondingMergesByName(t *testing.T) { + sql := "WITH l AS (SELECT a AS id, b AS sec FROM t1), r AS (SELECT c AS sec, d AS id FROM t2) " + + "SELECT * FROM l UNION ALL CORRESPONDING SELECT * FROM r" + span, err := GetQuerySpan(sql, DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if got := resolvedSources(span.Results[0]); !eqStrings(got, []string{"t1.a", "t2.d"}) { + t.Errorf("id sources = %v, want [t1.a t2.d]", got) + } +} + +// TestProjection_SetOpByNameDeferredStarArm: an arm carrying an un-enumerable +// base-table star cannot be name-merged metadata-free — the whole merge defers +// to the consumer with the ByName flag (NEVER a silent ordinal merge). +func TestProjection_SetOpByNameDeferredStarArm(t *testing.T) { + span, err := GetQuerySpan("SELECT id, sec FROM lt UNION ALL BY NAME SELECT * FROM rt", DialectBigQuery) + if err != nil { + t.Fatalf("error: %v", err) + } + if len(span.Results) != 1 || span.Results[0].SetOpMerge == nil { + t.Fatalf("results = %+v, want a single deferred SetOpMerge item", span.Results) + } + m := span.Results[0].SetOpMerge + if !m.ByName { + t.Errorf("SetOpMerge.ByName = false, want true (the consumer must name-merge after expansion)") + } + if len(m.Left) != 2 || len(m.Right) != 1 { + t.Errorf("arm widths = %d/%d, want 2 concrete / 1 star", len(m.Left), len(m.Right)) + } +} + +// TestProjection_StructFieldStarFailsClosed pins the gate-round P0: a STRUCT- +// field star through a relation (`d.s.*` / `t.s.*`) cannot be enumerated +// metadata-free — returning the relation's whole projection would misalign the +// positional masker (the first result's masker lands on the struct's first +// sub-column → a sensitive column is returned unmasked). It must FAIL CLOSED +// (the legacy resolver errored on multi-part wild paths too). A plain `rel.*` +// and a schema-qualified `schema.table.*` still resolve. 
+func TestProjection_StructFieldStarFailsClosed(t *testing.T) { + for _, sql := range []string{ + "SELECT d.s.* FROM (SELECT public, STRUCT(secret AS ssn) AS s FROM t) AS d", + "SELECT t.s.* FROM t", + } { + if _, err := GetQuerySpan(sql, DialectSpanner); err == nil { + t.Errorf("GetQuerySpan(%q) = nil error, want fail-closed (struct-field star is not enumerable)", sql) + } + } + // Single-part rel.* still resolves (no error). + if _, err := GetQuerySpan("SELECT d.* FROM (SELECT a, b FROM t) AS d", DialectSpanner); err != nil { + t.Errorf("rel.* should still resolve, got error: %v", err) + } + // Schema-qualified table star still resolves (the head is not a relation). + if _, err := GetQuerySpan("SELECT analytics.events.* FROM analytics.events", DialectSpanner); err != nil { + t.Errorf("schema-qualified star should still resolve, got error: %v", err) + } +} + +// TestProjection_DotStarQualifierTightening pins the re-verify refinements: +// a value-table (UNNEST) `elem.*` fails closed (the element's struct sub-fields +// are not enumerable — returning one column would shift every later position); +// a schema-qualified star over an UNQUALIFIED FROM fails closed (no written +// prefix can match — the engine rejects the range variable; legacy errored); +// and a BigQuery dataset-qualified star resolves via the Database qualifier. +func TestProjection_DotStarQualifierTightening(t *testing.T) { + for _, sql := range []string{ + "SELECT elem.* FROM t, UNNEST(t.structs) AS elem", + "SELECT wrongschema.t.* FROM t", + } { + if _, err := GetQuerySpan(sql, DialectSpanner); err == nil { + t.Errorf("GetQuerySpan(%q) = nil error, want fail-closed", sql) + } + } + // BigQuery: a dataset-qualified star matches the relation's Database bucket. 
+ span, err := GetQuerySpan("SELECT ds.t.* FROM ds.t", DialectBigQuery) + if err != nil { + t.Fatalf("dataset-qualified star should resolve, got error: %v", err) + } + if len(span.Results) != 1 || len(span.Results[0].StarSegments) != 1 { + t.Fatalf("want one base-star segment over ds.t, got %+v", span.Results) + } +} diff --git a/googlesql/analysis/query_span.go b/googlesql/analysis/query_span.go index 56902472..06ad2b9e 100644 --- a/googlesql/analysis/query_span.go +++ b/googlesql/analysis/query_span.go @@ -1,6 +1,7 @@ package analysis import ( + "fmt" "strings" "github.com/bytebase/omni/googlesql/ast" @@ -48,6 +49,17 @@ type QuerySpan struct { // CTEs lists the names defined by WITH clauses at any scope, in declaration // order. CTEs []string + + // CTEReferences lists the FROM-position table references that resolved to an + // in-scope CTE (e.g. `FROM c`), deduplicated on (catalog, schema, table, alias) + // like AccessTables. These are kept SEPARATE from AccessTables — a CTE is not a + // physical table, so it must not be column-expanded or counted in the user/ + // system mix — but the legacy bytebase access-table listener recorded every + // FROM table path INCLUDING CTE references in its flat source-column set, so the + // catalog-aware consumer unions CTEReferences into the span's table-level + // SourceColumns to reproduce that (a CTE reference surfaces as a + // {default-dataset, cteName} resource). + CTEReferences []TableAccess } // TableAccess represents one physical table referenced in a statement. The @@ -65,7 +77,9 @@ type TableAccess struct { Loc ast.Loc // source location of the table reference } -// ColumnInfo represents one output column of a query result. +// ColumnInfo represents one output column of a query result — or, when +// StarSegments is set, one `*`/`rel.*` star item that the catalog-aware consumer +// expands. 
type ColumnInfo struct { // Name is the output column name: the explicit alias if present, otherwise a // best-effort rendering (the column name for a bare column reference, "*"/ @@ -74,8 +88,136 @@ type ColumnInfo struct { // SourceColumns is the best-effort list of column references that directly // feed this output column (the column refs in the select-item expression, - // excluding those inside nested subqueries). + // excluding those inside nested subqueries). For a column omni RESOLVED through + // a CTE / derived relation, these carry the underlying BASE-table lineage (the + // relation alias is resolved away); for a reference omni could not resolve to a + // relation (a bare column over a base table whose columns omni cannot + // enumerate), they carry the written qualifier parts for the consumer to match + // against catalog metadata. SourceColumns []ColumnRef + + // IsPlain mirrors the legacy QuerySpanResult.IsPlainField: true when this output + // column is a direct base-table column passthrough (a `SELECT *` / `rel.*` + // expansion column that was not rewrapped by a join-left side, a set-operation + // merge, or an explicit select-list item). An explicit select item — even a + // bare `SELECT id` or a column resolved through a CTE/derived relation — is NOT + // plain. The consumer copies this onto base.QuerySpanResult.IsPlainField. + IsPlain bool + + // BaseFieldName marks Name as a base-table FIELD passthrough rather than a + // written/derived select-item name (today: the JOIN ... USING coalesced key, + // which the legacy resolver named after the left PhysicalTable's field). A + // consumer reproducing legacy naming renders it in the field's metadata case + // instead of the written or upper-cased form. 
+ BaseFieldName bool + + // StarSegments, when non-nil, marks this ColumnInfo as a `*` / `rel.*` star item + // (a bare star, a qualified star, or a `SELECT *` over a CTE / derived relation) + // that the catalog-aware consumer expands IN ORDER into one output column per + // segment-column. A segment is either a base-table star (BaseTable set — expand + // every column of that physical table via metadata) or an already-resolved + // concrete projection column (a CTE/derived column with its base lineage). The + // star's EXCEPT/REPLACE modifiers (StarExcept/StarReplace) apply across the + // fully expanded column list. When StarSegments is set, Name/SourceColumns are + // not consumed for output (they remain only for star-shape detection back-compat + // — Name is "*" or "rel.*"). + StarSegments []StarSegment + + // StarExcept holds the column names of a `SELECT * EXCEPT (a, b)` modifier on a + // star item; nil for a non-star item or a star with no EXCEPT. The catalog-aware + // consumer (the bytebase extractor) drops these names from the star's expanded + // column set so an EXCEPT-ed column is not surfaced (and so over-masking does not + // occur by leaving it in). + StarExcept []string + + // StarReplace holds the `SELECT * REPLACE (expr AS name)` substitutions on a + // star item; nil for a non-star item or a star with no REPLACE. The consumer + // replaces the star-expanded output column named Name with one whose lineage is + // the replacement expression's Sources. + StarReplace []StarReplaceItem + + // StarMerge, when non-nil, marks this output position as a set-operation merge + // against a base-table-star arm whose arity only the metadata-aware consumer + // knows: the consumer expands StarMerge.Table, takes its StarMerge.Index-th + // column, unions that column into this position's SourceColumns, and (when + // StarMerge.LeftStar) takes this position's output Name from that column. 
This + // reproduces the legacy "expand the star arm against metadata, then position- + // merge" behaviour for `concrete UNION ALL (SELECT * FROM base)`. + StarMerge *StarMergeInfo + + // SetOpMerge, when non-nil, marks this ColumnInfo as a WHOLE deferred set- + // operation merge whose arms could not be position-merged inline because at + // least one arm carries an un-enumerable star (a base-table star, an + // EXCEPT/REPLACE star, or a nested merge). The metadata-aware consumer expands + // each arm's projection fully (recursively, reusing this same per-column + // expansion) and then position-merges the two expanded lists: output names from + // the LEFT arm, SourceColumns unioned per position, IsPlainField=false. This + // reproduces the legacy "fully resolve each arm, then zip" + // (extractTableSourceFromQuerySetOperation) for the star-involving arm + // combinations a per-position StarMerge cannot express — a base-star UNION + // base-star (one merged output column per expanded position, not two + // concatenated stars) and an EXCEPT/REPLACE star arm in a set operation. When + // set, Name is "*" and Star*/SourceColumns are not consumed for output. + SetOpMerge *SetOpMergeInfo +} + +// SetOpMergeInfo carries a deferred set-operation merge: the Left and Right arms' +// resolved projections, each a list of ColumnInfo the consumer expands fully (a +// base-table star against metadata, an EXCEPT/REPLACE star with its modifiers, a +// nested merge recursively) before merging them. See ColumnInfo.SetOpMerge. +// +// ByName marks a BY NAME / CORRESPONDING set operation (fix F3): after expanding +// both arms the consumer merges them by case-insensitive column NAME — output +// order is the left arm's columns (then any right-only names appended), each +// unioning both arms' same-named lineage — instead of by ordinal. MatchColumns, +// when non-empty, is the `ON (cols)` / `BY (cols)` restriction list: the output +// is exactly those columns, in list order. 
Both are zero for an ordinal merge. +type SetOpMergeInfo struct { + Left []ColumnInfo + Right []ColumnInfo + ByName bool + MatchColumns []string +} + +// StarMergeInfo carries a set-operation merge against a base-table-star arm: the +// base Table to expand, the output ordinal (Index) whose lineage gains that +// table's Index-th column, and LeftStar (the star arm was the LEFT one, so the +// output name is taken from the expanded base column rather than from a concrete +// arm). See ColumnInfo.StarMerge. +type StarMergeInfo struct { + Table ColumnRef + Index int + LeftStar bool +} + +// StarSegment is one element of a star item's resolved expansion (see +// ColumnInfo.StarSegments). Exactly one shape applies: +// - BaseTable set: a base-table star — the consumer expands EVERY column of the +// named physical table (via catalog metadata), emitting each with IsPlain +// (Plain). This is the only piece omni leaves to the metadata-aware consumer, +// because omni has no catalog to enumerate a physical table's columns. +// ExceptColumns, when non-empty, lists column names the expansion must SKIP +// (case-insensitively): a JOIN ... USING key is projected once as a coalesced +// column, so each side's star excludes it (fix F1) — expanding it again would +// shift every later position and misalign the positional masker. +// - BaseTable nil: an already-resolved concrete projection column (a CTE / +// derived / explicit column reached through a relation), with its base lineage +// in Sources, output Name, and plainness Plain. The consumer emits it directly. 
+type StarSegment struct { + BaseTable *ColumnRef // base table to expand; nil for a concrete segment + ExceptColumns []string // base-table column names to skip when expanding + Name string // concrete segment output column name + Sources []ColumnRef // concrete segment base lineage + Plain bool // IsPlainField for this segment's column(s) + BaseFieldName bool // Name is a base-table field passthrough (see ColumnInfo.BaseFieldName) +} + +// StarReplaceItem is one `expr AS name` entry of a star REPLACE modifier: the +// output column Name whose value is overridden, and the source columns the +// replacement expression directly references (Sources). +type StarReplaceItem struct { + Name string + Sources []ColumnRef } // ColumnRef identifies a column by its optional qualifier parts and its name. @@ -105,6 +247,14 @@ func GetQuerySpan(statement string, dialect Dialect) (*QuerySpan, error) { w := newSpanWalker(span, dialect) w.analyzeStmt(file.Stmts[0]) + // Fail closed (structural rule): a shape the walker accepted but could not + // resolve to CORRECT lineage (NATURAL JOIN, a USING join over a deferred + // set-op projection) surfaces as an error rather than silently misaligned / + // empty lineage — the masking consumer must reject the statement, not return + // a sensitive column unmasked. + if w.failure != nil { + return nil, w.failure + } return span, nil } @@ -112,14 +262,21 @@ func GetQuerySpan(statement string, dialect Dialect) (*QuerySpan, error) { // CTE scope // --------------------------------------------------------------------------- -// cteScope is a linked stack of CTE name sets. Resolving a bare table reference -// walks outwards; an inner CTE shadows an outer one of the same name. GoogleSQL -// unquoted identifiers are case-insensitive for resolution, so CTE names are -// compared case-folded (matching the legacy findTableSchema's EqualFold CTE -// lookup). +// cteScope is a linked stack of CTE definitions. 
Resolving a bare table +// reference walks outwards; an inner CTE shadows an outer one of the same name. +// GoogleSQL unquoted identifiers are case-insensitive for resolution, so CTE +// names are compared case-folded (matching the legacy findTableSchema's EqualFold +// CTE lookup). +// +// Each in-scope CTE retains its RESOLVED output projection (the masking-grade +// upgrade): a `SELECT *` / `rel.*` / column reference over the CTE reproduces that +// projection with base-column lineage, rather than collapsing to a catalog-blind +// star (the legacy resolver stored each CTE as a base.PseudoTable with its +// resolved columns — recordCTE). type cteScope struct { - names map[string]bool - parent *cteScope + names map[string]bool + columns map[string][]projColumn // CTE name (lower-cased) → resolved projection + parent *cteScope } func (s *cteScope) isCTE(name string) bool { @@ -131,24 +288,67 @@ func (s *cteScope) isCTE(name string) bool { return false } +// cteColumns returns the resolved projection of the nearest in-scope CTE named +// `name`, or nil if no such CTE is in scope (or its projection was not retained, +// e.g. a recursive CTE still being defined). +func (s *cteScope) cteColumns(name string) ([]projColumn, bool) { + lower := strings.ToLower(name) + for cur := s; cur != nil; cur = cur.parent { + if cur.names[lower] { + cols, ok := cur.columns[lower] + return cols, ok + } + } + return nil, false +} + // --------------------------------------------------------------------------- // Walker // --------------------------------------------------------------------------- // spanWalker walks the connected googlesql AST to populate a QuerySpan. 
It // maintains a CTE scope stack (so bare references that name a CTE are filtered -// out of AccessTables), dedup maps for AccessTables and PredicateColumns, the -// current SELECT block's FROM aliases (so a correlated array/field path off an -// earlier FROM source is not mistaken for a base table), and a flag tracking -// whether the outermost query has populated Results. +// out of AccessTables), dedup maps for AccessTables and PredicateColumns, and +// the current SELECT block's FROM aliases (so a correlated array/field path off +// an earlier FROM source is not mistaken for a base table). +// +// Output columns (Results) are computed bottom-up: every query body — a SELECT, +// a set operation, or a parenthesized QueryStmt — RETURNS its output ColumnInfo +// list, and the outermost caller assigns it to span.Results. Returning (rather +// than appending the first SELECT's items as a side effect) is what lets a set +// operation MERGE both arms' per-position lineage (the masking-grade union +// semantics): an out-col[i]'s sources are left.results[i].sources ∪ +// right.results[i].sources. type spanWalker struct { span *QuerySpan dialect Dialect scope *cteScope fromAliases map[string]bool // alias + bare table name of the current SELECT's FROM sources - accessed map[tableKey]bool - predSeen map[ColumnRef]bool - resolved bool // Results captured for the outermost query + // leafRels is the current SELECT's leaf FROM relations (base tables, CTE + // references, derived subqueries) by reference name, used to resolve a column + // reference (`rel.col`, a bare `col` over a CTE/derived, `rel.*`) to its + // relation's projection. A join contributes its two sides as separate leaves + // here (mirroring the legacy tableSourceFrom, which held each table AND the join + // anchor), so `x.id` over `t x JOIN t y` resolves against leaf `x`. Saved and + // restored around each SELECT so nested blocks do not leak relations. 
+ leafRels []*relation + accessed map[tableKey]bool + predSeen map[ColumnRef]bool + // failure, when set, marks the span as fail-closed: the walker met a shape it + // parses but cannot resolve to correct lineage without catalog metadata + // (NATURAL JOIN; JOIN USING over a deferred set-op projection). GetQuerySpan + // returns it as an error so the masking consumer rejects the statement instead + // of consuming silently wrong lineage. The first failure wins. Classification + // paths (collectAccessTables) deliberately ignore it — classification is + // best-effort and not masking-grade. + failure error +} + +// failClosed records a fail-closed condition (first one wins). +func (w *spanWalker) failClosed(err error) { + if w.failure == nil { + w.failure = err + } } type tableKey struct { @@ -175,11 +375,11 @@ func newSpanWalker(span *QuerySpan, dialect Dialect) *spanWalker { func (w *spanWalker) analyzeStmt(node ast.Node) { switch n := node.(type) { case *ast.QueryStmt: - w.visitQueryStmt(n, true /* outermost */) + w.span.Results = projColumnsToColumnInfos(w.visitQueryStmt(n)) case *ast.SelectStmt: - w.visitSelect(n, true) + w.span.Results = projColumnsToColumnInfos(w.visitSelect(n)) case *ast.SetOperation: - w.visitSetOp(n, true) + w.span.Results = projColumnsToColumnInfos(w.visitSetOp(n)) case *ast.InsertStmt: w.visitInsert(n) case *ast.UpdateStmt: @@ -210,19 +410,21 @@ func (w *spanWalker) analyzeStmt(node ast.Node) { // reference to a later sibling resolves to a base table, not the CTE). A // RECURSIVE WITH additionally makes a CTE's OWN name visible inside its body (so // the recursive self-reference is filtered out, not recorded as a base table). 
-func (w *spanWalker) visitQueryStmt(q *ast.QueryStmt, outermost bool) { +func (w *spanWalker) visitQueryStmt(q *ast.QueryStmt) []projColumn { if q == nil { - return + return nil } - scope := &cteScope{names: make(map[string]bool), parent: w.scope} + scope := &cteScope{names: make(map[string]bool), columns: make(map[string][]projColumn), parent: w.scope} w.scope = scope if q.With != nil { recursive := q.With.Recursive // Walk CTE bodies in declaration order, adding each name to scope at the // right moment so earlier-sibling (and, when RECURSIVE, self) references are - // filtered while later-sibling names still resolve to base tables. + // filtered while later-sibling names still resolve to base tables. Each + // CTE's RESOLVED projection is retained in the scope so a `SELECT *` / column + // reference over the CTE reproduces it (legacy recordCTE → PseudoTable). for _, cte := range q.With.CTEs { if cte == nil || cte.Name == "" { continue @@ -233,16 +435,19 @@ func (w *spanWalker) visitQueryStmt(q *ast.QueryStmt, outermost bool) { if recursive { scope.names[lower] = true } + var cols []projColumn if cte.Query != nil { - w.visitBody(cte.Query, false) + cols = w.resolveCTEColumns(cte, recursive, scope, lower) } + cols = applyCTEColumnAliases(cols, cte.Columns) // Non-recursive: the name becomes visible only AFTER its body (to the // following siblings and the main body). scope.names[lower] = true + scope.columns[lower] = cols } } - w.visitBody(q.Body, outermost) + results := w.visitBody(q.Body) // Query-level ORDER BY / LIMIT / OFFSET keys may reference columns. for _, item := range q.OrderBy { @@ -254,37 +459,222 @@ func (w *spanWalker) visitQueryStmt(q *ast.QueryStmt, outermost bool) { w.walkPredicate(q.Offset) w.scope = scope.parent + return results +} + +// resolveCTEColumns resolves a CTE body to its projection. For a non-recursive +// CTE (or a recursive one whose body is not an anchor/recursive set operation) it +// is a plain body resolution. 
For a recursive CTE shaped `(anchor) UNION [ALL] +// recursive`, it mirrors the legacy recordRecursiveCTE: resolve the anchor arm, +// publish it as the CTE's projection so the recursive arm's self-references +// resolve to the anchor's columns (and thus to the base lineage), then resolve +// the recursive arm and position-merge it into the anchor (the union of both +// arms' per-position lineage). The merge repeats until the projection stops +// changing (bounded by len(anchor)+2 passes), like the legacy fixpoint loop. +func (w *spanWalker) resolveCTEColumns(cte *ast.CTE, recursive bool, scope *cteScope, lower string) []projColumn { + setOp := recursiveSetOp(recursive, cte.Query) + if setOp == nil { + return w.visitBody(cte.Query) + } + // Resolve the anchor arm and publish it so the recursive arm's self-references + // resolve against it. Then iterate the recursive arm to a fixpoint (lineage is + // monotone, so it converges): each pass re-resolves the recursive arm against + // the current projection and position-merges it, until no source set grows. A + // fixpoint is needed because a self-reference to column j may pull in column k's + // lineage, which a later column's expression then reads — the legacy + // recordRecursiveCTE loops for the same reason. The pass count is bounded by the + // column count (each pass can only add sources; bounded extra guard prevents a + // pathological loop). + anchor := w.visitBody(setOp.Left) + scope.columns[lower] = anchor + current := anchor + for iter := 0; iter <= len(anchor)+1; iter++ { + recursivePart := w.visitBody(setOp.Right) + merged := mergeProjections(current, recursivePart) + if projectionsEqual(current, merged) { + break + } + current = merged + scope.columns[lower] = current + } + return current +} + +// projectionsEqual reports whether two projections have identical per-position +// output names, source-column sets (order-insensitive on the sources), AND star +// markers. 
Used to detect the recursive-CTE lineage fixpoint. The star-marker +// comparison matters: a base-star / starMerge / setOpMerge arm carries no concrete +// sources, so comparing only name+sources would declare a still-changing +// star-shaped projection "equal" and stop the fixpoint before the recursive arm's +// star lineage was published (the masking under-attribution this guards). Equality +// therefore also requires the per-position star shape to match. +func projectionsEqual(a, b []projColumn) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i].name != b[i].name || len(a[i].sources) != len(b[i].sources) { + return false + } + if !starShapeEqual(a[i], b[i]) { + return false + } + seen := make(map[ColumnRef]bool, len(a[i].sources)) + for _, r := range a[i].sources { + seen[r] = true + } + for _, r := range b[i].sources { + if !seen[r] { + return false + } + } + } + return true +} + +// starShapeEqual reports whether two projection columns carry the same star +// marker shape: both concrete, both a base-table star over the same table, or +// both a starMerge over the same table/index. A starGroup or setOpMerge marker is +// treated as never-equal (its expansion is metadata-dependent and not compared +// here) so the recursive fixpoint keeps iterating until the projection stops +// changing shape. This is only used by the recursive-CTE fixpoint, whose +// iteration count is bounded, so a conservative "not equal" merely costs an extra +// bounded pass rather than risking early termination with unpublished lineage. 
+func starShapeEqual(a, b projColumn) bool { + if (a.baseStar == nil) != (b.baseStar == nil) { + return false + } + if a.baseStar != nil && !sameBaseTable(*a.baseStar, *b.baseStar) { + return false + } + if (a.starMerge == nil) != (b.starMerge == nil) { + return false + } + if a.starMerge != nil && + (!sameBaseTable(a.starMerge.table, b.starMerge.table) || + a.starMerge.index != b.starMerge.index || + a.starMerge.leftStar != b.starMerge.leftStar) { + return false + } + if a.starGroup != nil || b.starGroup != nil { + return false + } + if a.setOpMerge != nil || b.setOpMerge != nil { + return false + } + return true } -// visitBody dispatches a query body / set-op operand / query primary. -func (w *spanWalker) visitBody(node ast.Node, outermost bool) { +// recursiveSetOp returns the CTE body's top-level set operation when the CTE is +// recursive and its body is `(anchor) UNION/INTERSECT/EXCEPT recursive` (directly, +// or wrapped in a QueryStmt with no further WITH), else nil. A recursive CTE +// without a set-operation body has no separate anchor/recursive arms (it then +// resolves like a normal body). +func recursiveSetOp(recursive bool, body ast.Node) *ast.SetOperation { + if !recursive || body == nil { + return nil + } + switch n := body.(type) { + case *ast.SetOperation: + return n + case *ast.QueryStmt: + if n.With == nil { + if so, ok := n.Body.(*ast.SetOperation); ok { + return so + } + } + } + return nil +} + +// applyCTEColumnAliases renames a CTE body's resolved projection columns to the +// CTE's explicit column-name list (`cte_name (a, b, …) AS (…)`, the Spanner form) +// when one is present, mirroring the legacy behaviour of projecting under the +// declared names. Lineage and plainness are preserved; only the output names +// change. A mismatch in arity leaves the projection unchanged (best-effort); a +// base-star element is not special-cased and counts as ONE column here. 
+func applyCTEColumnAliases(cols []projColumn, names []string) []projColumn { + if len(names) == 0 || len(names) != len(cols) { + return cols + } + out := make([]projColumn, len(cols)) + for i, c := range cols { + c.name = names[i] + out[i] = c + } + return out +} + +// visitBody dispatches a query body / set-op operand / query primary, returning +// the resolved output projection it produces. +func (w *spanWalker) visitBody(node ast.Node) []projColumn { switch n := node.(type) { case *ast.QueryStmt: // A parenthesized ( query ) query_primary. - w.visitQueryStmt(n, outermost) + return w.visitQueryStmt(n) case *ast.SelectStmt: - w.visitSelect(n, outermost) + return w.visitSelect(n) case *ast.SetOperation: - w.visitSetOp(n, outermost) + return w.visitSetOp(n) } + return nil } -// visitSetOp walks a UNION/INTERSECT/EXCEPT tree. The left arm surfaces its -// Results when this is the outermost statement (SQL's first-select rule); the -// right arm never does. -func (w *spanWalker) visitSetOp(n *ast.SetOperation, outermost bool) { +// visitSetOp walks a UNION/INTERSECT/EXCEPT tree and merges the two arms' +// per-position output projection. For masking the union must be conservative: +// out-col[i]'s sources are the UNION of left[i] AND right[i] (an output column of +// a set operation reads from BOTH arms at that position). The output column NAME +// comes from the LEFT arm (SQL's "column names come from the first SELECT"), so a +// three-arm left-associative tree keeps the leftmost names while accumulating +// every arm's sources. See mergeProjections for the base-table-star arm handling. +// +// A BY NAME / CORRESPONDING operation (fix F3) merges by column NAME instead: +// merging those by ordinal attributed each output to the OTHER arm's +// same-position column (verified mis-attribution — `id` carried a secret's +// policy). See mergeProjectionsByName. 
+func (w *spanWalker) visitSetOp(n *ast.SetOperation) []projColumn { if n == nil { - return + return nil + } + left := w.visitBody(n.Left) + right := w.visitBody(n.Right) + if n.ByName || n.Corresponding { + return mergeProjectionsByName(left, right, n.MatchColumns) + } + return mergeProjections(left, right) +} + +// unionColumnRefs returns the order-preserving, deduplicated union of two +// ColumnRef slices (left first, then any right ref not already present). +func unionColumnRefs(left, right []ColumnRef) []ColumnRef { + if len(right) == 0 { + return left + } + seen := make(map[ColumnRef]bool, len(left)+len(right)) + out := make([]ColumnRef, 0, len(left)+len(right)) + for _, r := range left { + if !seen[r] { + seen[r] = true + out = append(out, r) + } } - w.visitBody(n.Left, outermost) - w.visitBody(n.Right, false) + for _, r := range right { + if !seen[r] { + seen[r] = true + out = append(out, r) + } + } + return out } // visitSelect processes one SelectStmt: its FROM relations, predicate clauses, -// and select list. -func (w *spanWalker) visitSelect(stmt *ast.SelectStmt, outermost bool) { +// and select list. It returns the resolved output projection (one position per +// select-list output column, with base-column lineage) so a parent set operation +// can merge per-position lineage; the outermost dispatch surfaces it as +// span.Results. +func (w *spanWalker) visitSelect(stmt *ast.SelectStmt) []projColumn { if stmt == nil { - return + return nil } // A new FROM-alias scope for this SELECT block. Populated as FROM sources are @@ -293,11 +683,16 @@ func (w *spanWalker) visitSelect(stmt *ast.SelectStmt, outermost bool) { // base table. Saved/restored so nested SELECTs do not leak aliases. 
prevFrom := w.fromAliases w.fromAliases = map[string]bool{} - defer func() { w.fromAliases = prevFrom }() + prevLeaves := w.leafRels + w.leafRels = nil + defer func() { w.fromAliases = prevFrom; w.leafRels = prevLeaves }() - for _, src := range stmt.From { - w.visitFromItem(src) - } + // Resolve the FROM clause into the comma-item relations (a join collapses into + // one combined relation, used for a bare `*`); building them also populates + // w.leafRels (each base table / CTE / derived leaf, used for `rel.*` and column + // resolution) and performs the AccessTable / predicate side-effects (a base path + // is recorded, a subquery body walked, ON/USING predicates collected). + fromRels := w.buildFromRelations(stmt.From) w.walkPredicate(stmt.Where) if stmt.GroupBy != nil { @@ -319,16 +714,14 @@ func (w *spanWalker) visitSelect(stmt *ast.SelectStmt, outermost bool) { } } - // SELECT list last so Results reflects final column order. + // Resolve the select list against the FROM relations (relation-aware lineage), + // then walk each item for nested-subquery tables (the direct refs are resolved + // by the projection resolver; the subquery walk only discovers tables). + results := w.resolveSelectProjection(stmt, fromRels) for _, item := range stmt.Items { if item == nil { continue } - if outermost && !w.resolved { - w.span.Results = append(w.span.Results, w.makeColumnInfo(item)) - } - // Walk the item expression for nested-subquery tables (the direct column - // refs are captured in makeColumnInfo). 
w.walkSubqueriesOnly(item.Expr) if item.Modifiers != nil { for _, r := range item.Modifiers.Replace { @@ -338,76 +731,273 @@ func (w *spanWalker) visitSelect(stmt *ast.SelectStmt, outermost bool) { } } } - if outermost { - w.resolved = true - } + return results } // --------------------------------------------------------------------------- // FROM relations // --------------------------------------------------------------------------- -// visitFromItem walks one FROM source (TableExpr / JoinExpr / UnnestExpr). -func (w *spanWalker) visitFromItem(node ast.Node) { +// buildFromRelations resolves a SELECT's FROM list into the ordered leaf +// relations used for `rel.*` and column resolution. Each comma-separated FROM +// item contributes one relation (a join collapses into one combined relation — +// see buildFromItem), and a bare `*` over the SELECT reproduces the in-order +// concatenation of these relations' projections (allColumns). Building a relation +// performs the same AccessTable / predicate side-effects the prior walk did. +func (w *spanWalker) buildFromRelations(from []ast.Node) []*relation { + var rels []*relation + for _, src := range from { + if rel := w.buildFromItem(src); rel != nil { + rels = append(rels, rel) + } + } + return rels +} + +// buildFromItem resolves one FROM source (TableExpr / JoinExpr / UnnestExpr) into +// a relation with its resolved projection, while performing the AccessTable / +// predicate side-effects. A JoinExpr collapses its two sides into one combined +// relation (the legacy joinTable): the left/anchor side's columns are rewrapped +// as non-plain, the right side's columns keep their plainness, and a USING join +// coalesces the key columns (fix F1 — see joinRelations). 
An UNNEST source +// builds a one-column value relation whose element carries the array argument's +// resolved lineage (fix F2 — the legacy extractor errored on array sources; +// dropping the relation here would shift `SELECT *` positions and resolve the +// element alias to nothing, both fail-open); its argument columns remain +// predicate references too. +func (w *spanWalker) buildFromItem(node ast.Node) *relation { switch n := node.(type) { case *ast.TableExpr: - w.visitTableExpr(n) + return w.buildTableExpr(n) case *ast.JoinExpr: - w.visitFromItem(n.Left) - w.visitFromItem(n.Right) + left := w.buildFromItem(n.Left) + right := w.buildFromItem(n.Right) w.walkPredicate(n.On) for _, col := range n.Using { if col != "" { w.addPredicateColumn(ColumnRef{Column: col}) } } + return w.joinRelations(left, right, n) case *ast.UnnestExpr: - // UNNEST(array_expr) — the array argument's columns are predicate-ish - // references; UNNEST produces no base-table access. + // UNNEST(array_expr) [AS alias] [WITH OFFSET [AS o]] — the array argument's + // columns stay predicate-ish references (and nested subqueries are walked), + // AND the source resolves to a value relation projecting the element + // column(s) with the argument's lineage. w.walkPredicate(n.Array) + return w.finishUnnestRelation(w.unnestElementColumns(n.Array), n.Alias, n.WithOffset, n.WithOffsetAlias) } + return nil } -// visitTableExpr handles a non-join FROM source: a base table path, a -// parenthesized subquery, or a table-valued function call. A subquery/TVF/array- -// path alias names a derived relation, not a base table, so it is not propagated -// into AccessTables — but the alias IS registered in the FROM scope so a later -// correlated path off it (`FROM (…) s, s.arr` or `FROM T t, t.arr a, a.child`) -// is also recognized as a correlated path rather than a base table. 
-func (w *spanWalker) visitTableExpr(n *ast.TableExpr) { +// joinRelations combines two join sides into one relation, mirroring the legacy +// joinTable: the anchor (left) side's columns are rebuilt as NON-plain fields +// (the legacy join rewraps them without IsPlainField), while the right side's +// columns keep their plainness. The combined relation is unnamed (a `rel.*` / +// column lookup uses the leaf relations, not the join wrapper). +// +// A `USING (keys)` join coalesces the key columns (fix F1): the real GoogleSQL +// `SELECT *` output is [each key ONCE (its value reads from BOTH sides), then +// the left side's non-key columns, then the right side's]. The legacy resolver +// only achieved this for upper-case-written keys (its key map was keyed on the +// written spelling but probed with the upper-cased field name); omni coalesces +// case-insensitively — the GoogleSQL identifier rule, and strictly safer (the +// legacy lowercase concatenation shifted positions, a masking leak). Keys-first +// ordering matches the engine; legacy emitted the coalesced key at the LEFT +// side's key position, which agrees whenever the key is the left table's first +// column (the corpus shape). +// +// NATURAL joins fail closed: both engines reject them at analysis, and the +// shared-column set is unknowable without a catalog — silently concatenating +// would misalign positions (structural rule: correct lineage or an error). +// A USING side that cannot be name-partitioned without metadata (a deferred +// set-op marker in its projection) likewise fails closed. 
+func (w *spanWalker) joinRelations(left, right *relation, join *ast.JoinExpr) *relation { + if join != nil && join.Natural { + w.failClosed(fmt.Errorf("NATURAL JOIN cannot be resolved to column lineage without catalog metadata (fail closed)")) + } else if join != nil && len(join.Using) > 0 && left != nil && right != nil { + if rel, ok := coalesceUsingJoin(left, right, join.Using); ok { + return rel + } + w.failClosed(fmt.Errorf("JOIN USING (%s) over a deferred set-operation projection cannot be name-partitioned without catalog metadata (fail closed)", strings.Join(join.Using, ", "))) + } + if left == nil { + return right + } + if right == nil { + return left + } + cols := make([]projColumn, 0, len(left.columns)+len(right.columns)) + for _, c := range left.columns { + c.plain = false + cols = append(cols, c) + } + cols = append(cols, right.columns...) + return &relation{columns: cols} +} + +// buildTableExpr resolves a non-join FROM source into a relation: a base table +// path (a single baseStar projection the consumer expands), a CTE reference (the +// CTE's retained resolved projection), a parenthesized subquery (its resolved +// projection), or a TVF / correlated array-path (no resolvable projection). It +// performs the same AccessTable / predicate side-effects as before: a base path +// is recorded, a subquery/TVF body is walked, a correlated array path contributes +// its root as a predicate, and an alias is registered in the FROM scope. +func (w *spanWalker) buildTableExpr(n *ast.TableExpr) *relation { if n == nil { - return + return nil } + var rel *relation switch { case n.Subquery != nil: - w.visitBody(n.Subquery, false) + // A FROM subquery: resolve its projection and expose it under the alias so a + // `SELECT *` / `alias.col` over it reproduces the subquery's resolved columns. 
+ cols := w.visitBody(n.Subquery) w.registerFromAlias(n.Alias) + rel = &relation{name: strings.ToLower(n.Alias), columns: cols} + w.addLeaf(rel) case n.Func != nil: // Table-valued function: its arguments may reference columns and // subqueries. The argument columns are predicate-position references // (they parameterize the TVF), so collect them as predicates AND recurse // any subquery tables. The TVF's OUTPUT columns are unknown without a - // function signature, so it contributes no resolved Results — matching the + // function signature, so it contributes no resolved projection — matching the // legacy extractor, which rejected TVFs outright for lack of return-column // info. w.walkPredicate(n.Func) w.registerFromAlias(n.Alias) + rel = nil case n.Path != nil: // A multi-part path whose ROOT matches an earlier FROM alias/table in this // SELECT is a correlated array/field unnest (`FROM T t, t.arr`), NOT a base // table — the legacy extractor treats the array-path source as a non-table // (it rejected UNNEST/array sources). Record its root as a predicate column - // instead so the lineage still reflects the array column it walks, and - // register this source's own alias so a chained correlated path off it - // (`t.arr a, a.child`) is likewise recognized. + // (the lineage still reflects the array column it walks) AND build the same + // value relation an explicit UNNEST source gets (fix F2): the element column + // carries the array path's resolved lineage, so `SELECT *` keeps its + // positions and the element alias resolves. finishUnnestRelation registers + // the source's alias so a chained correlated path off it (`t.arr a, + // a.child`) is likewise recognized. 
if len(n.Path.Parts) >= 2 && w.fromAliases[strings.ToLower(n.Path.Parts[0])] { w.addPredicateColumn(columnRefFromParts(n.Path.Parts, w.dialect)) - w.registerFromAlias(n.Alias) + _, sources := w.resolvePath(n.Path.Parts) + elem := projColumn{name: n.Path.Parts[len(n.Path.Parts)-1], sources: sources} + rel = w.finishUnnestRelation([]projColumn{elem}, n.Alias, n.WithOffset, n.WithOffsetAlias) } else { - w.recordPath(n.Path, n.Alias) + rel = w.buildPathRelation(n.Path, n.Alias) } } w.walkPredicate(n.SystemTime) + return rel +} + +// unnestElementColumns resolves an UNNEST(...) call's element column(s): one +// projColumn per array argument, named by the argument path's last component +// when the argument is a plain column path (the GoogleSQL implicit-alias rule; +// "" otherwise), with the argument's resolved column refs as lineage. A literal +// array yields a column with RESOLVED-empty lineage (correct — it reads no +// table). A named zip-mode argument (`mode => …`) is an option, not an array. +func (w *spanWalker) unnestElementColumns(array ast.Node) []projColumn { + fc, ok := array.(*ast.FuncCall) + if !ok { + _, sources := w.resolveExprSources(array) + return []projColumn{{sources: sources}} + } + var cols []projColumn + for _, arg := range fc.Args { + if _, named := arg.(*ast.NamedArg); named { + continue + } + name := "" + if parts := exprToParts(arg); len(parts) > 0 { + name = parts[len(parts)-1] + } + _, sources := w.resolveExprSources(arg) + cols = append(cols, projColumn{name: name, sources: sources}) + } + if len(cols) == 0 { + cols = []projColumn{{}} + } + return cols +} + +// finishUnnestRelation completes an UNNEST / correlated-array-path value +// relation (fix F2): the explicit alias names a single element column (and the +// relation); `WITH OFFSET [AS o]` appends the positional companion column — +// named o, or GoogleSQL's default name `offset` — with NO lineage (it is a +// position, not data). 
The relation is registered as a leaf (so `alias.field` / +// the bare element name resolve) and its name enters the FROM scope (so a +// chained correlated path off it is recognized). The element columns are not +// plain fields (an array element is derived data, not a base-column +// passthrough). +func (w *spanWalker) finishUnnestRelation(cols []projColumn, alias string, withOffset bool, offsetAlias string) *relation { + if alias != "" && len(cols) == 1 { + cols[0].name = alias + } + if withOffset { + name := offsetAlias + if name == "" { + name = "offset" + } + cols = append(cols, projColumn{name: name}) + } + relName := alias + if relName == "" && len(cols) > 0 { + relName = cols[0].name + } + rel := &relation{name: strings.ToLower(relName), valueTable: true, columns: cols} + w.addLeaf(rel) + w.registerFromAlias(relName) + return rel +} + +// buildPathRelation resolves a base-table FROM path into a relation and records +// the access. A bare path naming an in-scope CTE reproduces that CTE's retained +// resolved projection (renamed to the alias / CTE name); a real physical table +// becomes a relation with a single baseStar projection element the metadata-aware +// consumer expands. recordPath keeps the existing AccessTable + FROM-scope +// side-effects (a CTE reference is excluded from AccessTables there). +func (w *spanWalker) buildPathRelation(path *ast.PathExpr, alias string) *relation { + w.recordPath(path, alias) + if path == nil || len(path.Parts) == 0 { + return nil + } + cat, db, schema, table := bucketNameParts(path.Parts, w.dialect) + refName := alias + if refName == "" { + refName = table + } + // A bare name matching an in-scope CTE reproduces the CTE's resolved projection. 
+ if cat == "" && db == "" && schema == "" { + if cols, ok := w.scope.cteColumns(table); ok { + rel := &relation{name: strings.ToLower(refName), columns: cols} + w.addLeaf(rel) + return rel + } + } + // A physical base table: omni cannot enumerate its columns, so its projection is + // a single baseStar the consumer expands. Plain (a base-table column passthrough + // is a plain field) until a join-left/set-op/explicit-select rewraps it. + baseRef := ColumnRef{Catalog: cat, Database: db, Schema: schema, Table: table} + rel := &relation{ + name: strings.ToLower(refName), + isBase: true, + baseRef: baseRef, + columns: []projColumn{{name: "*", plain: true, baseStar: &baseRef}}, + } + w.addLeaf(rel) + return rel +} + +// addLeaf registers a leaf FROM relation in the current SELECT's lookup set +// (w.leafRels), used to resolve `rel.col` / `rel.*` / bare-column references. A +// nil or unnamed relation is skipped (a join wrapper is unnamed — its sides are +// registered individually). +func (w *spanWalker) addLeaf(rel *relation) { + if rel == nil || rel.name == "" { + return + } + w.leafRels = append(w.leafRels, rel) } // registerFromAlias records a derived-relation alias (subquery / TVF / correlated @@ -439,7 +1029,7 @@ func (w *spanWalker) visitInsert(n *ast.InsertStmt) { } } if n.Query != nil { - w.visitBody(n.Query, false) + w.visitBody(n.Query) } if n.TableClause != nil { if n.TableClause.Path != nil { @@ -482,7 +1072,9 @@ func (w *spanWalker) visitUpdate(n *ast.UpdateStmt) { } } for _, src := range n.From { - w.visitFromItem(src) + // UPDATE … FROM sources: walk for AccessTable / predicate side-effects (the + // resolved relation is not needed — UPDATE has no result projection). + w.buildFromItem(src) } w.walkPredicate(n.Where) if n.Returning != nil { @@ -514,9 +1106,10 @@ func (w *spanWalker) visitMerge(n *ast.MergeStmt) { return } w.recordTable(n.Target, n.Alias) - // The USING source is a TableExpr (a table path or a ( query ) subquery). 
+ // The USING source is a TableExpr (a table path or a ( query ) subquery). Walk + // it for AccessTable / predicate side-effects (MERGE has no result projection). if src, ok := n.Source.(*ast.TableExpr); ok { - w.visitTableExpr(src) + w.buildTableExpr(src) } else { w.discoverTables(n.Source) } @@ -564,7 +1157,7 @@ func (w *spanWalker) discoverTables(node ast.Node) { switch n := node.(type) { case *ast.CreateTableStmt: w.recordTable(n.Name, "") - w.visitBody(n.AsQuery, false) + w.visitBody(n.AsQuery) // Source tables a CREATE TABLE references (Codex finding #4). w.recordTable(n.Like, "") w.recordTable(n.Clone, "") @@ -580,7 +1173,7 @@ func (w *spanWalker) discoverTables(node ast.Node) { } case *ast.CreateViewStmt: w.recordTable(n.Name, "") - w.visitBody(n.AsQuery, false) + w.visitBody(n.AsQuery) case *ast.CreateIndexStmt: // An index references its target table (ON