diff --git a/src/backend/access/aocs/aocsam.c b/src/backend/access/aocs/aocsam.c index a3f2b597977..d5e3fd45e59 100644 --- a/src/backend/access/aocs/aocsam.c +++ b/src/backend/access/aocs/aocsam.c @@ -124,6 +124,11 @@ open_all_datumstreamread_segfiles(AOCSScanDesc scan, AOCSFileSegInfo *segInfo) RelationOpenSmgr(rel); open_datumstreamread_segfile(basepath, rel, segInfo, ds[attno], attno); + + /* skip reading block for ANALYZE */ + if ((scan->rs_base.rs_flags & SO_TYPE_ANALYZE) != 0) + continue; + datumstreamread_block(ds[attno], blockDirectory, attno); AOCSScanDesc_UpdateTotalBytesRead(scan, attno); @@ -467,6 +472,46 @@ close_cur_scan_seg(AOCSScanDesc scan) AppendOnlyBlockDirectory_End_forInsert(scan->blockDirectory); } +static void +aocs_blkdirscan_init(AOCSScanDesc scan) +{ + if (scan->aocsfetch == NULL) + { + int natts = RelationGetNumberOfAttributes(scan->rs_base.rs_rd); + scan->proj = palloc(natts * sizeof(bool)); + MemSet(scan->proj, true, natts * sizeof(bool)); + + scan->aocsfetch = aocs_fetch_init(scan->rs_base.rs_rd, + scan->rs_base.rs_snapshot, + scan->appendOnlyMetaDataSnapshot, + scan->proj); + } + + scan->blkdirscan = palloc0(sizeof(AOBlkDirScanData)); + AOBlkDirScan_Init(scan->blkdirscan, &scan->aocsfetch->blockDirectory); +} + +static void +aocs_blkdirscan_finish(AOCSScanDesc scan) +{ + AOBlkDirScan_Finish(scan->blkdirscan); + pfree(scan->blkdirscan); + scan->blkdirscan = NULL; + + if (scan->aocsfetch != NULL) + { + aocs_fetch_finish(scan->aocsfetch); + pfree(scan->aocsfetch); + scan->aocsfetch = NULL; + } + + if (scan->proj != NULL) + { + pfree(scan->proj); + scan->proj = NULL; + } +} + /* * aocs_beginrangescan * @@ -552,6 +597,7 @@ aocs_beginscan_internal(Relation relation, AttrNumber natts; Oid visimaprelid; Oid visimapidxid; + Oid blkdirrelid; scan = (AOCSScanDesc) palloc0(sizeof(AOCSScanDescData)); scan->rs_base.rs_rd = relation; @@ -592,6 +638,12 @@ aocs_beginscan_internal(Relation relation, scan->columnScanInfo.ds = NULL; + if ((flags & SO_TYPE_ANALYZE) != 0) + { + scan->segfirstrow = 0; + scan->targrow = 0; + } + GetAppendOnlyEntryAttributes(RelationGetRelid(relation), NULL, NULL, @@ -599,16 +651,23 @@ aocs_beginscan_internal(Relation relation, NULL); GetAppendOnlyEntryAuxOids(relation, - NULL, NULL, NULL, + NULL, &blkdirrelid, NULL, &visimaprelid, &visimapidxid); if (scan->total_seg != 0) + { AppendOnlyVisimap_Init(&scan->visibilityMap, visimaprelid, visimapidxid, AccessShareLock, appendOnlyMetaDataSnapshot); + if ((flags & SO_TYPE_ANALYZE) != 0 && OidIsValid(blkdirrelid)) + { + aocs_blkdirscan_init(scan); + } + } + return scan; } @@ -666,6 +725,9 @@ aocs_endscan(AOCSScanDesc scan) if (scan->total_seg != 0) AppendOnlyVisimap_Finish(&scan->visibilityMap, AccessShareLock); + if (scan->blkdirscan != NULL) + aocs_blkdirscan_finish(scan); + /* GPDB should backport this to upstream */ if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT) UnregisterSnapshot(scan->rs_base.rs_snapshot); @@ -675,6 +737,402 @@ aocs_endscan(AOCSScanDesc scan) pfree(scan); } +static int +aocs_locate_target_segment(AOCSScanDesc scan, int64 targrow) +{ + int64 rowcount; + + for (int i = scan->cur_seg; i < scan->total_seg; i++) + { + if (i < 0) + continue; + + rowcount = scan->seginfo[i]->total_tupcount; + if (rowcount <= 0) + continue; + + if (scan->seginfo[i]->state == AOSEG_STATE_AWAITING_DROP) + { + /* skip this segment, it is awaiting drop */ + continue; + } + + if (scan->segfirstrow + rowcount - 1 >= targrow) + { + /* found the target segment */ + return i; + } + + /* continue next segment */ + 
scan->segfirstrow += rowcount;
+		scan->segrowsprocessed = 0;
+	}
+
+	/* row is beyond the total number of rows in the relation */
+	return -1;
+}
+
+/*
+ * block directory based get_target_tuple()
+ */
+static bool
+aocs_blkdirscan_get_target_tuple(AOCSScanDesc scan, int64 targrow, TupleTableSlot *slot)
+{
+	int			segno, segidx;
+	int64		rownum = -1;
+	int64		rowsprocessed;
+	AOTupleId	aotid;
+	int			ncols = scan->columnScanInfo.relationTupleDesc->natts;
+	AppendOnlyBlockDirectory *blkdir = &scan->aocsfetch->blockDirectory;
+
+	Assert(scan->blkdirscan != NULL);
+
+	/* locate the target segment */
+	segidx = aocs_locate_target_segment(scan, targrow);
+	if (segidx < 0)
+		return false;
+
+	/* next starting position for locating the segfile */
+	scan->cur_seg = segidx;
+
+	segno = scan->seginfo[segidx]->segno;
+	Assert(segno > InvalidFileSegNumber && segno <= AOTupleId_MaxSegmentFileNum);
+
+	/*
+	 * Note: It is safe to assume that the scan's segfile array and the
+	 * blockdir's segfile array are identical. Otherwise, we should stop
+	 * processing and throw an exception to make the error visible.
+	 */
+	if (blkdir->segmentFileInfo[segidx]->segno != segno)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("segfile array contents in both scan descriptor "
+						"and block directory are not identical on "
+						"append-optimized relation '%s'",
+						RelationGetRelationName(blkdir->aoRel))));
+	}
+
+	/*
+	 * Unlike ao_row, we set currentSegmentFileNum for ao_column here
+	 * solely to pass the assertion in extract_minipage() called by
+	 * AOBlkDirScan_GetRowNum().
+	 * Since we don't invoke AppendOnlyBlockDirectory_GetCachedEntry()
+	 * for ao_column, it should be restored to the original value so
+	 * that AppendOnlyBlockDirectory_GetEntry() keeps working properly.
+	 */
+	int currentSegmentFileNum = blkdir->currentSegmentFileNum;
+	blkdir->currentSegmentFileNum = blkdir->segmentFileInfo[segidx]->segno;
+
+	/* locate the target row by seqscan of the block directory */
+	for (int col = 0; col < ncols; col++)
+	{
+		/*
+		 * "segfirstrow" should always point to the first row of
+		 * a new segfile; only locate_target_segment may update
+		 * its value.
+		 *
+		 * "segrowsprocessed" tracks the position of processed
+		 * rows in the current segfile.
+		 */
+		rowsprocessed = scan->segfirstrow + scan->segrowsprocessed;
+
+		if ((scan->rs_base.rs_rd)->rd_att->attrs[col].attisdropped)
+			continue;
+
+		rownum = AOBlkDirScan_GetRowNum(scan->blkdirscan,
+										segno,
+										col,
+										targrow,
+										&rowsprocessed);
+
+		elog(DEBUG2, "AOBlkDirScan_GetRowNum(segno: %d, col: %d, targrow: %ld): "
+			 "[segfirstrow: %ld, segrowsprocessed: %ld, rownum: %ld, cached_mpentry_num: %d]",
+			 segno, col, targrow, scan->segfirstrow, scan->segrowsprocessed, rownum,
+			 blkdir->cached_mpentry_num);
+
+		if (rownum < 0)
+			continue;
+
+		scan->segrowsprocessed = rowsprocessed - scan->segfirstrow;
+
+		/*
+		 * Found a column represented in the block directory.
+		 * We just look for the first such column; there is no
+		 * need to read other columns within the same row.
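+		 * (For example, if column 0 is dropped, the lookup simply falls
+		 * through to column 1's entries; any single column suffices here
+		 * because all columns of a given row share the same row number.)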
+		 */
+		break;
+	}
+
+	/* restore the original value, as mentioned above */
+	blkdir->currentSegmentFileNum = currentSegmentFileNum;
+
+	if (rownum < 0)
+		return false;
+
+	/* form the target tuple TID */
+	AOTupleIdInit(&aotid, segno, rownum);
+
+	ExecClearTuple(slot);
+
+	/* fetch the target tuple */
+	if (!aocs_fetch(scan->aocsfetch, &aotid, slot))
+		return false;
+
+	/* OK to return this tuple */
+	ExecStoreVirtualTuple(slot);
+	pgstat_count_heap_fetch(scan->rs_base.rs_rd);
+
+	return true;
+}
+
+/*
+ * returns the segfile number in which `targrow` is located
+ */
+static int
+aocs_getsegment(AOCSScanDesc scan, int64 targrow)
+{
+	int			segno, segidx;
+
+	/* locate the target segment */
+	segidx = aocs_locate_target_segment(scan, targrow);
+	if (segidx < 0)
+	{
+		/* done reading all segments */
+		close_cur_scan_seg(scan);
+		scan->cur_seg = -1;
+		return -1;
+	}
+
+	segno = scan->seginfo[segidx]->segno;
+	Assert(segno > InvalidFileSegNumber && segno <= AOTupleId_MaxSegmentFileNum);
+
+	if (segidx > scan->cur_seg)
+	{
+		close_cur_scan_seg(scan);
+		/* adjust cur_seg to fit for open_next_scan_seg() */
+		scan->cur_seg = segidx - 1;
+		if (open_next_scan_seg(scan) >= 0)
+		{
+			/* new segment, make sure segrowsprocessed was reset */
+			Assert(scan->segrowsprocessed == 0);
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("unexpected behavior, failed to open segno %d while scanning AOCO table %s",
+							segno, RelationGetRelationName(scan->rs_base.rs_rd))));
+		}
+	}
+
+	return segno;
+}
+
+static inline int
+aocs_block_remaining_rows(DatumStreamRead *ds)
+{
+	return (ds->blockRowCount - ds->blockRowsProcessed);
+}
+
+/*
+ * fetches a single column value corresponding to `endrow` (equal to `targrow`)
+ */
+static bool
+aocs_gettuple_column(AOCSScanDesc scan, AttrNumber attno, int64 startrow, int64 endrow, bool chkvisimap, TupleTableSlot *slot)
+{
+	bool		isSnapshotAny = (scan->rs_base.rs_snapshot == SnapshotAny);
+	DatumStreamRead *ds = scan->columnScanInfo.ds[attno];
+	int			segno = scan->seginfo[scan->cur_seg]->segno;
+	AOTupleId	aotid;
+	bool		ret = true;
+	int64		rowstoprocess, nrows, rownum;
+	Datum	   *values;
+	bool	   *nulls;
+
+	if (ds->blockFirstRowNum <= 0)
+		elog(ERROR, "AOCO varblock->blockFirstRowNum should be greater than zero");
+
+	Assert(segno > InvalidFileSegNumber && segno <= AOTupleId_MaxSegmentFileNum);
+	Assert(startrow <= endrow);
+
+	rowstoprocess = endrow - startrow + 1;
+	nrows = ds->blockRowsProcessed + rowstoprocess;
+	rownum = ds->blockFirstRowNum + nrows - 1;
+
+	/* form the target tuple TID */
+	AOTupleIdInit(&aotid, segno, rownum);
+
+	if (chkvisimap && !isSnapshotAny && !AppendOnlyVisimap_IsVisible(&scan->visibilityMap, &aotid))
+	{
+		if (slot != NULL)
+			ExecClearTuple(slot);
+
+		ret = false;
+		/* must update tracking vars before return */
+		goto out;
+	}
+
+	/* rowNumInBlock = rowNum - blockFirstRowNum */
+	datumstreamread_find(ds, rownum - ds->blockFirstRowNum);
+
+	values = slot->tts_values;
+	nulls = slot->tts_isnull;
+
+	datumstreamread_get(ds, &(values[attno]), &(nulls[attno]));
+
+out:
+	/* update rows processed */
+	ds->blockRowsProcessed += rowstoprocess;
+
+	return ret;
+}
+
+/*
+ * fetches all columns of the target tuple corresponding to `targrow`
+ */
+static bool
+aocs_gettuple(AOCSScanDesc scan, int64 targrow, TupleTableSlot *slot)
+{
+	bool		ret = true;
+	int64		rowcount = -1;
+	int64		rowstoprocess;
+	bool		chkvisimap = true;
+
+	Assert(scan->cur_seg >= 0);
+	Assert(slot != NULL);
+
+	ExecClearTuple(slot);
+
+	rowstoprocess = targrow -
scan->segfirstrow + 1;
+
+	/* read from scan->cur_seg */
+	for (AttrNumber i = 0; i < scan->columnScanInfo.num_proj_atts; i++)
+	{
+		AttrNumber	attno = scan->columnScanInfo.proj_atts[i];
+		DatumStreamRead *ds = scan->columnScanInfo.ds[attno];
+		int64		startrow = scan->segfirstrow + scan->segrowsprocessed;
+
+		if (ds->blockRowCount <= 0)
+			;	/* haven't read a block yet */
+		else
+		{
+			/* a block was read */
+			rowcount = aocs_block_remaining_rows(ds);
+			Assert(rowcount >= 0);
+
+			if (startrow + rowcount - 1 >= targrow)
+			{
+				if (!aocs_gettuple_column(scan, attno, startrow, targrow, chkvisimap, slot))
+				{
+					ret = false;
+					/* must update tracking vars before return */
+					goto out;
+				}
+
+				chkvisimap = false;
+				/* haven't finished scanning the current block */
+				continue;
+			}
+			else
+				startrow += rowcount;	/* skip scanning the remaining rows */
+		}
+
+		/*
+		 * Keep reading block headers until we find the block containing
+		 * the target row.
+		 */
+		while (true)
+		{
+			elog(DEBUG2, "aocs_gettuple(): [targrow: %ld, currow: %ld, diff: %ld, "
+				 "startrow: %ld, rowcount: %ld, segfirstrow: %ld, segrowsprocessed: %ld, nth: %d, "
+				 "blockRowCount: %d, blockRowsProcessed: %d]", targrow, startrow + rowcount - 1,
+				 startrow + rowcount - 1 - targrow, startrow, rowcount, scan->segfirstrow,
+				 scan->segrowsprocessed, datumstreamread_nth(ds), ds->blockRowCount,
+				 ds->blockRowsProcessed);
+
+			if (datumstreamread_block_info(ds))
+			{
+				rowcount = ds->blockRowCount;
+				Assert(rowcount > 0);
+
+				/* new block, reset blockRowsProcessed */
+				ds->blockRowsProcessed = 0;
+
+				if (startrow + rowcount - 1 >= targrow)
+				{
+					/* read a new buffer to consume */
+					datumstreamread_block_content(ds);
+
+					if (!aocs_gettuple_column(scan, attno, startrow, targrow, chkvisimap, slot))
+					{
+						ret = false;
+						/* must update tracking vars before return */
+						goto out;
+					}
+
+					chkvisimap = false;
+					/* done with this column */
+					break;
+				}
+
+				startrow += rowcount;
+				AppendOnlyStorageRead_SkipCurrentBlock(&ds->ao_read);
+				/* continue with the next block */
+			}
+			else
+				pg_unreachable();
+		}
+	}
+
+out:
+	/* update rows processed */
+	scan->segrowsprocessed = rowstoprocess;
+
+	if (ret)
+	{
+		ExecStoreVirtualTuple(slot);
+		pgstat_count_heap_getnext(scan->rs_base.rs_rd);
+	}
+
+	return ret;
+}
+
+/*
+ * Given a specific target row number 'targrow' (in the space of all row numbers
+ * physically present in the table, i.e. across all segfiles), scan and return
+ * the corresponding tuple in 'slot'.
+ *
+ * If the tuple is visible, return true. Otherwise, return false.
+ */
+bool
+aocs_get_target_tuple(AOCSScanDesc aoscan, int64 targrow, TupleTableSlot *slot)
+{
+	if (aoscan->columnScanInfo.relationTupleDesc == NULL)
+	{
+		aoscan->columnScanInfo.relationTupleDesc = slot->tts_tupleDescriptor;
+		/* Pin it! ... and of course release it upon destruction / rescan */
+		PinTupleDesc(aoscan->columnScanInfo.relationTupleDesc);
+		initscan_with_colinfo(aoscan);
+	}
+
+	if (aoscan->blkdirscan != NULL)
+		return aocs_blkdirscan_get_target_tuple(aoscan, targrow, slot);
+
+	if (aocs_getsegment(aoscan, targrow) < 0)
+	{
+		/* all done */
+		ExecClearTuple(slot);
+		return false;
+	}
+
+	/*
+	 * Unlike AO_ROW, AO_COLUMN may have different varblock boundaries
+	 * for different columns, so we fetch the tuple column by column
+	 * while walking each column's varblocks.
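+	 *
+	 * (For example, rows 1..300 of one column may be stored as a single
+	 * 300-row varblock while another column stores them as two varblocks
+	 * of 150 rows each, so the target row is located independently in
+	 * each column's datum stream.)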
+ */ + return aocs_gettuple(aoscan, targrow, slot); +} static pg_attribute_hot_inline bool aocs_getnext_noqual(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot *slot) @@ -689,6 +1147,9 @@ aocs_getnext_noqual(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot * Assert(ScanDirectionIsForward(direction)); + /* should not be in ANALYZE - we use a different API */ + Assert((scan->rs_base.rs_flags & SO_TYPE_ANALYZE) == 0); + if (scan->columnScanInfo.relationTupleDesc == NULL) { scan->columnScanInfo.relationTupleDesc = slot->tts_tupleDescriptor; @@ -782,9 +1243,6 @@ aocs_getnext_noqual(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot * * The tuple is invisible. * In `analyze`, we can simply return false */ - if ((scan->rs_base.rs_flags & SO_TYPE_ANALYZE) != 0) - return false; - rowNum = INT64CONST(-1); visible_pass = false; continue; /* not break, need advance for other cols */ @@ -930,11 +1388,7 @@ aocs_getnext_withqual(AOCSScanDesc scan, ScanDirection direction, TupleTableSlot { /* * The tuple is invisible. - * In `analyze`, we can simply return false */ - if ((scan->rs_base.rs_flags & SO_TYPE_ANALYZE) != 0) - return false; - rowNum = INT64CONST(-1); visible_pass = false; continue; /* not break, need advance for other cols */ diff --git a/src/backend/access/aocs/aocsam_handler.c b/src/backend/access/aocs/aocsam_handler.c index 504e9c0fb7a..22ca114cfe0 100644 --- a/src/backend/access/aocs/aocsam_handler.c +++ b/src/backend/access/aocs/aocsam_handler.c @@ -47,6 +47,7 @@ #include "utils/lsyscache.h" #include "utils/pg_rusage.h" #include "utils/guc.h" +#include "utils/sampling.h" #define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID) @@ -1623,10 +1624,13 @@ static bool aoco_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, BufferAccessStrategy bstrategy) { - AOCSScanDesc aoscan = (AOCSScanDesc) scan; - aoscan->targetTupleId = blockno; - - return true; + /* + * For append-optimized relations, we use a separate sampling + * method. See table_relation_acquire_sample_rows(). + */ + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("API not supported for appendoptimized relations"))); } static bool @@ -1634,28 +1638,78 @@ aoco_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, double *liverows, double *deadrows, TupleTableSlot *slot) { - AOCSScanDesc aoscan = (AOCSScanDesc) scan; - bool ret = false; + /* + * For append-optimized relations, we use a separate sampling + * method. See table_relation_acquire_sample_rows(). 
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("API not supported for appendoptimized relations")));
+}
 
-	/* skip several tuples if they are not sampling target */
-	while (aoscan->targetTupleId > aoscan->nextTupleId)
-	{
-		aoco_getnextslot(scan, ForwardScanDirection, slot);
-		aoscan->nextTupleId++;
-	}
 
+static int
+aoco_acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows,
+						 int targrows, double *totalrows, double *totaldeadrows)
+{
+	int		numrows = 0;	/* # rows sampled so far */
+	double	liverows = 0;	/* # live rows seen */
+	double	deadrows = 0;	/* # dead rows seen */
 
-	if (aoscan->targetTupleId == aoscan->nextTupleId)
+	Assert(targrows > 0);
+
+	TableScanDesc scan = table_beginscan_analyze(onerel);
+	TupleTableSlot *slot = table_slot_create(onerel, NULL);
+	AOCSScanDesc aocoscan = (AOCSScanDesc) scan;
+
+	int64 totaltupcount = AOCSScanDesc_TotalTupCount(aocoscan);
+	int64 totaldeadtupcount = 0;
+	if (aocoscan->total_seg > 0)
+		totaldeadtupcount = AppendOnlyVisimap_GetRelationHiddenTupleCount(&aocoscan->visibilityMap);
+	/*
+	 * The conversion from int64 to double (53 significant bits) is safe as the
+	 * AOTupleId is 48 bits, the max value of totalrows is never greater than
+	 * AOTupleId_MaxSegmentFileNum * AOTupleId_MaxRowNum (< 48 significant bits).
+	 */
+	*totalrows = (double) (totaltupcount - totaldeadtupcount);
+	*totaldeadrows = (double) totaldeadtupcount;
+
+	/* Prepare for sampling row numbers */
+	RowSamplerData rs;
+	RowSampler_Init(&rs, *totalrows, targrows, random());
+
+	while (RowSampler_HasMore(&rs))
 	{
-		ret = aoco_getnextslot(scan, ForwardScanDirection, slot);
-		aoscan->nextTupleId++;
+		aocoscan->targrow = RowSampler_Next(&rs);
+
+		vacuum_delay_point();
 
-		if (ret)
-			*liverows += 1;
+		if (aocs_get_target_tuple(aocoscan, aocoscan->targrow, slot))
+		{
+			rows[numrows++] = ExecCopySlotHeapTuple(slot);
+			liverows++;
+		}
 		else
-			*deadrows += 1;	/* if return an invisible tuple */
+			deadrows++;
+
+		ExecClearTuple(slot);
 	}
 
-	return ret;
+	ExecDropSingleTupleTableSlot(slot);
+	table_endscan(scan);
+
+	/*
+	 * Emit some interesting relation info
+	 */
+	ereport(elevel,
+			(errmsg("\"%s\": scanned " INT64_FORMAT " rows, "
+					"containing %.0f live rows and %.0f dead rows; "
+					"%d rows in sample, %.0f accurate total live rows, "
+					"%.0f accurate total dead rows",
+					RelationGetRelationName(onerel),
+					rs.m, liverows, deadrows, numrows,
+					*totalrows, *totaldeadrows)));
+
+	return numrows;
 }
 
 static double
@@ -2588,6 +2642,7 @@ static TableAmRoutine ao_column_methods = {
 	.relation_vacuum = aoco_vacuum_rel,
 	.scan_analyze_next_block = aoco_scan_analyze_next_block,
 	.scan_analyze_next_tuple = aoco_scan_analyze_next_tuple,
+	.relation_acquire_sample_rows = aoco_acquire_sample_rows,
 
 	.index_build_range_scan = aoco_index_build_range_scan,
 	.index_validate_scan = aoco_index_validate_scan,
@@ -2602,7 +2657,6 @@ static TableAmRoutine ao_column_methods = {
 	.scan_bitmap_next_tuple = aoco_scan_bitmap_next_tuple,
 	.scan_sample_next_block = aoco_scan_sample_next_block,
 	.scan_sample_next_tuple = aoco_scan_sample_next_tuple,
-	.acquire_sample_rows = acquire_sample_rows,
 
 	.amoptions = ao_amoptions,
 	.swap_relation_files = aoco_swap_relation_files,
diff --git a/src/backend/access/appendonly/appendonlyam.c b/src/backend/access/appendonly/appendonlyam.c
index 2f590a8678e..b5f6a17a597 100755
--- a/src/backend/access/appendonly/appendonlyam.c
+++ b/src/backend/access/appendonly/appendonlyam.c
@@ -1118,6 +1118,311 @@ getNextBlock(AppendOnlyScanDesc scan)
 	return true;
 }
 
+static int
+appendonly_locate_target_segment(AppendOnlyScanDesc scan, int64 targrow)
+{
+	int64		rowcount;
+
+	for (int i = scan->aos_segfiles_processed - 1; i < scan->aos_total_segfiles; i++)
+	{
+		if (i < 0)
+			continue;
+
+		rowcount = scan->aos_segfile_arr[i]->total_tupcount;
+		if (rowcount <= 0)
+			continue;
+
+		if (scan->aos_segfile_arr[i]->state == AOSEG_STATE_AWAITING_DROP)
+		{
+			/* skip this segment, it is awaiting drop */
+			continue;
+		}
+
+		if (scan->segfirstrow + rowcount - 1 >= targrow)
+		{
+			/* found the target segment */
+			return i;
+		}
+
+		/* continue with the next segment */
+		scan->segfirstrow += rowcount;
+		scan->segrowsprocessed = 0;
+	}
+
+	/* row is beyond the total number of rows in the relation */
+	return -1;
+}
+
+/*
+ * returns the segfile number in which `targrow` is located
+ */
+static int
+appendonly_getsegment(AppendOnlyScanDesc scan, int64 targrow)
+{
+	int			segidx, segno;
+
+	/* locate the target segment */
+	segidx = appendonly_locate_target_segment(scan, targrow);
+	if (segidx < 0)
+	{
+		CloseScannedFileSeg(scan);
+
+		/* done reading all segfiles */
+		Assert(scan->aos_done_all_segfiles);
+
+		return -1;
+	}
+
+	if (segidx + 1 > scan->aos_segfiles_processed)
+	{
+		/* done with the current segfile */
+		CloseScannedFileSeg(scan);
+		/*
+		 * Adjust aos_segfiles_processed so that
+		 * SetNextFileSegForRead() opens the
+		 * correct next segfile.
+		 */
+		scan->aos_segfiles_processed = segidx;
+	}
+
+	segno = scan->aos_segfile_arr[segidx]->segno;
+	Assert(segno > InvalidFileSegNumber && segno <= AOTupleId_MaxSegmentFileNum);
+
+	if (scan->aos_need_new_segfile)
+	{
+		if (SetNextFileSegForRead(scan))
+		{
+			Assert(scan->segrowsprocessed == 0);
+			scan->needNextBuffer = true;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INTERNAL_ERROR),
+					 errmsg("unexpected behavior, failed to open segno %d while scanning AO table %s",
+							segno, RelationGetRelationName(scan->aos_rd))));
+		}
+	}
+
+	return segno;
+}
+
+static inline int64
+appendonly_block_remaining_rows(AppendOnlyScanDesc scan)
+{
+	return (scan->executorReadBlock.rowCount - scan->executorReadBlock.blockRowsProcessed);
+}
+
+/*
+ * locates the block containing `targrow`
+ */
+static void
+appendonly_getblock(AppendOnlyScanDesc scan, int64 targrow, int64 *startrow)
+{
+	AppendOnlyExecutorReadBlock *varblock = &scan->executorReadBlock;
+	int64		rowcount = -1;
+
+	if (!scan->needNextBuffer)
+	{
+		/* we have a current block */
+		rowcount = appendonly_block_remaining_rows(scan);
+		Assert(rowcount >= 0);
+
+		if (*startrow + rowcount - 1 >= targrow)
+		{
+			/* row lies in the current block, nothing to do */
+			return;
+		}
+		else
+		{
+			/* skip scanning the remaining rows */
+			*startrow += rowcount;
+			scan->needNextBuffer = true;
+		}
+	}
+
+	/*
+	 * Keep reading block headers until we find the block containing
+	 * the target row.
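+	 *
+	 * (For example, with *startrow = 1000 and successive blocks of 150 and
+	 * 200 rows, targrow = 1290 is not in the first block (1000 + 150 - 1 =
+	 * 1149 < 1290), so that block is skipped and *startrow becomes 1150;
+	 * the second block, covering rows 1150..1349, contains the target.)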
+	 */
+	while (true)
+	{
+		elog(DEBUG2, "appendonly_getblock(): [targrow: %ld, currow: %ld, diff: %ld, "
+			 "startrow: %ld, rowcount: %ld, segfirstrow: %ld, segrowsprocessed: %ld, "
+			 "blockRowsProcessed: %ld, blockRowCount: %d]", targrow, *startrow + rowcount - 1,
+			 *startrow + rowcount - 1 - targrow, *startrow, rowcount, scan->segfirstrow,
+			 scan->segrowsprocessed, varblock->blockRowsProcessed,
+			 varblock->rowCount);
+
+		if (AppendOnlyExecutorReadBlock_GetBlockInfo(&scan->storageRead, varblock))
+		{
+			/* new block, reset blockRowsProcessed */
+			varblock->blockRowsProcessed = 0;
+			rowcount = appendonly_block_remaining_rows(scan);
+			Assert(rowcount > 0);
+			if (*startrow + rowcount - 1 >= targrow)
+			{
+				AppendOnlyExecutorReadBlock_GetContents(varblock);
+				/* got a new buffer to consume */
+				scan->needNextBuffer = false;
+				return;
+			}
+
+			*startrow += rowcount;
+			AppendOnlyExecutionReadBlock_FinishedScanBlock(varblock);
+			AppendOnlyStorageRead_SkipCurrentBlock(&scan->storageRead);
+			/* continue with the next block */
+		}
+		else
+			pg_unreachable();
+	}
+}
+
+/*
+ * block directory based get_target_tuple()
+ */
+static bool
+appendonly_blkdirscan_get_target_tuple(AppendOnlyScanDesc scan, int64 targrow, TupleTableSlot *slot)
+{
+	int			segno, segidx;
+	int64		rownum, rowsprocessed;
+	AOTupleId	aotid;
+	AppendOnlyBlockDirectory *blkdir = &scan->aofetch->blockDirectory;
+
+	Assert(scan->blkdirscan != NULL);
+
+	/* locate the target segment */
+	segidx = appendonly_locate_target_segment(scan, targrow);
+	if (segidx < 0)
+		return false;
+
+	scan->aos_segfiles_processed = segidx + 1;
+
+	segno = scan->aos_segfile_arr[segidx]->segno;
+	Assert(segno > InvalidFileSegNumber && segno <= AOTupleId_MaxSegmentFileNum);
+
+	/*
+	 * Note: It is safe to assume that the scan's segfile array and the
+	 * blockdir's segfile array are identical. Otherwise, we should stop
+	 * processing and throw an exception to make the error visible.
+	 */
+	if (blkdir->segmentFileInfo[segidx]->segno != segno)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INTERNAL_ERROR),
+				 errmsg("segfile array contents in both scan descriptor "
+						"and block directory are not identical on "
+						"append-optimized relation '%s'",
+						RelationGetRelationName(blkdir->aoRel))));
+	}
+
+	/*
+	 * Set the current segfile info in the blkdir struct, so we can
+	 * reuse the (cached) block directory entry during the tuple fetch
+	 * operation below. See AppendOnlyBlockDirectory_GetCachedEntry().
+	 */
+	blkdir->currentSegmentFileNum = blkdir->segmentFileInfo[segidx]->segno;
+	blkdir->currentSegmentFileInfo = blkdir->segmentFileInfo[segidx];
+
+	/*
+	 * "segfirstrow" should always point to the first row of a new
+	 * segfile in blkdir-based ANALYZE; only locate_target_segment
+	 * may update its value.
+	 *
+	 * "segrowsprocessed" tracks the position of processed rows in
+	 * the current segfile.
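+	 *
+	 * (For example, if seg0 holds 100 rows, then seg1's "segfirstrow" is
+	 * 100 in the flattened row number space, and targrow = 150 denotes
+	 * the 51st row of seg1.)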
+	 */
+	rowsprocessed = scan->segfirstrow + scan->segrowsprocessed;
+	/* locate the target row by seqscan of the block directory */
+	rownum = AOBlkDirScan_GetRowNum(scan->blkdirscan,
+									segno,
+									0,
+									targrow,
+									&rowsprocessed);
+
+	elog(DEBUG2, "AOBlkDirScan_GetRowNum(segno: %d, col: %d, targrow: %ld): "
+		 "[segfirstrow: %ld, segrowsprocessed: %ld, rownum: %ld, cached_mpentry_num: %d]",
+		 segno, 0, targrow, scan->segfirstrow, scan->segrowsprocessed, rownum,
+		 blkdir->cached_mpentry_num);
+
+	if (rownum < 0)
+		return false;
+
+	scan->segrowsprocessed = rowsprocessed - scan->segfirstrow;
+
+	/* form the target tuple TID */
+	AOTupleIdInit(&aotid, segno, rownum);
+
+	/* ensure the target minipage entry was stored in the fetch descriptor */
+	Assert(blkdir->cached_mpentry_num != InvalidEntryNum);
+	Assert(blkdir->minipages == &blkdir->minipages[0]);
+
+	/* fetch the target tuple */
+	if (!appendonly_fetch(scan->aofetch, &aotid, slot))
+		return false;
+
+	/* OK to return this tuple */
+	pgstat_count_heap_fetch(scan->aos_rd);
+
+	return true;
+}
+
+/*
+ * Given a specific target row number 'targrow' (in the space of all row numbers
+ * physically present in the table, i.e. across all segfiles), scan and return
+ * the corresponding tuple in 'slot'.
+ *
+ * If the tuple is visible, return true. Otherwise, return false.
+ */
+bool
+appendonly_get_target_tuple(AppendOnlyScanDesc aoscan, int64 targrow, TupleTableSlot *slot)
+{
+	AppendOnlyExecutorReadBlock *varblock = &aoscan->executorReadBlock;
+	bool		visible;
+	int64		rowsprocessed, rownum;
+	int			segno;
+	AOTupleId	aotid;
+
+	if (aoscan->blkdirscan != NULL)
+		return appendonly_blkdirscan_get_target_tuple(aoscan, targrow, slot);
+
+	segno = appendonly_getsegment(aoscan, targrow);
+	if (segno < 0)
+		return false;
+
+	rowsprocessed = aoscan->segfirstrow + aoscan->segrowsprocessed;
+
+	appendonly_getblock(aoscan, targrow, &rowsprocessed);
+
+	aoscan->segrowsprocessed = rowsprocessed - aoscan->segfirstrow;
+
+	Assert(rowsprocessed + varblock->rowCount - 1 >= targrow);
+	rownum = varblock->blockFirstRowNum + (targrow - rowsprocessed);
+
+	elog(DEBUG2, "appendonly_getblock() returns: [segno: %d, rownum: %ld]", segno, rownum);
+
+	/* form the target tuple TID */
+	AOTupleIdInit(&aotid, segno, rownum);
+
+	visible = (aoscan->snapshot == SnapshotAny ||
+			   AppendOnlyVisimap_IsVisible(&aoscan->visibilityMap, &aotid));
+
+	if (visible && AppendOnlyExecutorReadBlock_FetchTuple(varblock, rownum, 0, NULL, slot))
+	{
+		/* OK to return this tuple */
+		pgstat_count_heap_fetch(aoscan->aos_rd);
+	}
+	else
+	{
+		if (slot != NULL)
+			ExecClearTuple(slot);
+
+		return false;
+	}
+
+	return true;
+}
+
 /* ----------------
  * appendonlygettup - fetch next appendonly tuple
  *
@@ -1138,6 +1443,8 @@ appendonlygettup(AppendOnlyScanDesc scan,
 				 TupleTableSlot *slot)
 {
 	Assert(ScanDirectionIsForward(dir));
+	/* should not be in ANALYZE - we use a different API */
+	Assert((scan->rs_base.rs_flags & SO_TYPE_ANALYZE) == 0);
 	Assert(scan->usableBlockSize > 0);
 
 	bool		isSnapshotAny = (scan->snapshot == SnapshotAny);
@@ -1178,12 +1485,7 @@
 			if (!isSnapshotAny && !AppendOnlyVisimap_IsVisible(&scan->visibilityMap, aoTupleId))
 			{
-				/*
-				 * The tuple is invisible.
- * In `analyze`, we can simply return false - */ - if ((scan->rs_base.rs_flags & SO_TYPE_ANALYZE) != 0) - return false; + /* The tuple is invisible */ } else { @@ -1310,6 +1612,34 @@ finishWriteBlock(AppendOnlyInsertDesc aoInsertDesc) Assert(!AppendOnlyStorageWrite_IsBufferAllocated(&aoInsertDesc->storageWrite)); } +static void +appendonly_blkdirscan_init(AppendOnlyScanDesc scan) +{ + if (scan->aofetch == NULL) + scan->aofetch = appendonly_fetch_init(scan->aos_rd, + scan->snapshot, + scan->appendOnlyMetaDataSnapshot); + + scan->blkdirscan = palloc0(sizeof(AOBlkDirScanData)); + AOBlkDirScan_Init(scan->blkdirscan, &scan->aofetch->blockDirectory); +} + +static void +appendonly_blkdirscan_finish(AppendOnlyScanDesc scan) +{ + AOBlkDirScan_Finish(scan->blkdirscan); + pfree(scan->blkdirscan); + scan->blkdirscan = NULL; + + if (scan->aofetch != NULL) + { + appendonly_fetch_finish(scan->aofetch); + pfree(scan->aofetch); + scan->aofetch = NULL; + } +} + + /* ---------------------------------------------------------------- * append-only access method interface * ---------------------------------------------------------------- @@ -1431,19 +1761,33 @@ appendonly_beginrangescan_internal(Relation relation, scan->blockDirectory = NULL; + if ((flags & SO_TYPE_ANALYZE) != 0) + { + scan->segrowsprocessed = 0; + scan->segfirstrow = 0; + scan->targrow = 0; + } + if (segfile_count > 0) { Oid visimaprelid; Oid visimapidxid; + Oid blkdirrelid; GetAppendOnlyEntryAuxOids(relation, - NULL, NULL, NULL, &visimaprelid, &visimapidxid); + NULL, &blkdirrelid, NULL, &visimaprelid, &visimapidxid); AppendOnlyVisimap_Init(&scan->visibilityMap, visimaprelid, visimapidxid, AccessShareLock, appendOnlyMetaDataSnapshot); + + if ((flags & SO_TYPE_ANALYZE) != 0) + { + if (OidIsValid(blkdirrelid)) + appendonly_blkdirscan_init(scan); + } } scan->totalBytesRead = 0; @@ -1519,7 +1863,7 @@ appendonly_beginscan(Relation relation, */ seginfo = GetAllFileSegInfo(relation, appendOnlyMetaDataSnapshot, &segfile_count, NULL); - + aoscan = appendonly_beginrangescan_internal(relation, snapshot, appendOnlyMetaDataSnapshot, @@ -1614,6 +1958,9 @@ appendonly_endscan(TableScanDesc scan) if (aoscan->aos_total_segfiles > 0) AppendOnlyVisimap_Finish(&aoscan->visibilityMap, AccessShareLock); + if (aoscan->blkdirscan != NULL) + appendonly_blkdirscan_finish(aoscan); + if (aoscan->aofetch) { appendonly_fetch_finish(aoscan->aofetch); diff --git a/src/backend/access/appendonly/appendonlyam_handler.c b/src/backend/access/appendonly/appendonlyam_handler.c index 0136d4a850f..218ade630d4 100644 --- a/src/backend/access/appendonly/appendonlyam_handler.c +++ b/src/backend/access/appendonly/appendonlyam_handler.c @@ -46,6 +46,7 @@ #include "utils/lsyscache.h" #include "utils/pg_rusage.h" #include "utils/guc.h" +#include "utils/sampling.h" #define IS_BTREE(r) ((r)->rd_rel->relam == BTREE_AM_OID) @@ -1492,10 +1493,13 @@ static bool appendonly_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno, BufferAccessStrategy bstrategy) { - AppendOnlyScanDesc aoscan = (AppendOnlyScanDesc) scan; - aoscan->targetTupleId = blockno; - - return true; + /* + * For append-optimized relations, we use a separate sampling + * method. See table_relation_acquire_sample_rows(). 
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("API not supported for appendoptimized relations")));
+}
 
 static bool
@@ -1503,30 +1507,91 @@ appendonly_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 								   double *liverows, double *deadrows,
 								   TupleTableSlot *slot)
 {
+	/*
+	 * For append-optimized relations, we use a separate sampling
+	 * method. See table_relation_acquire_sample_rows().
+	 */
+	ereport(ERROR,
+			(errcode(ERRCODE_INTERNAL_ERROR),
+			 errmsg("API not supported for appendoptimized relations")));
+}
+
+/*
+ * Implementation of relation_acquire_sample_rows().
+ *
+ * As opposed to upstream's method of 2-stage sampling, here we can simply use
+ * Knuth's Algorithm S (TAOCP Vol. 2, Section 3.4.2), as we know N, the
+ * population size, up front (i.e. the total number of rows in the relation).
+ *
+ * Although only an estimate of the total live rows and total dead rows in the
+ * table is demanded, we can actually return their exact values from aux table
+ * metadata.
+ *
+ * We intrinsically return rows in physical order, since the rows sampled by
+ * Algorithm S are in physical order.
+ */
+static int
+appendonly_acquire_sample_rows(Relation onerel, int elevel, HeapTuple *rows,
+							   int targrows, double *totalrows, double *totaldeadrows)
+{
+	int		numrows = 0;	/* # rows sampled so far */
+	double	liverows = 0;	/* # live rows seen */
+	double	deadrows = 0;	/* # dead rows seen */
+
+	Assert(targrows > 0);
+
+	TableScanDesc scan = table_beginscan_analyze(onerel);
+	TupleTableSlot *slot = table_slot_create(onerel, NULL);
 	AppendOnlyScanDesc aoscan = (AppendOnlyScanDesc) scan;
-	bool		ret = false;
 
-	/* skip several tuples if they are not sampling target */
-	while (!aoscan->aos_done_all_segfiles
-		   && aoscan->targetTupleId > aoscan->nextTupleId)
-	{
-		appendonly_getnextslot(scan, ForwardScanDirection, slot);
-		aoscan->nextTupleId++;
-	}
+	int64 totaltupcount = AppendOnlyScanDesc_TotalTupCount(aoscan);
+	int64 totaldeadtupcount = 0;
+	if (aoscan->aos_total_segfiles > 0)
+		totaldeadtupcount = AppendOnlyVisimap_GetRelationHiddenTupleCount(&aoscan->visibilityMap);
+	/*
+	 * The conversion from int64 to double (53 significant bits) is safe as the
+	 * AOTupleId is 48 bits, the max value of totalrows is never greater than
+	 * AOTupleId_MaxSegmentFileNum * AOTupleId_MaxRowNum (< 48 significant bits).
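+	 * (Roughly 2^7 segfiles * 2^40 rows each, i.e. at most about 2^47 rows.)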
+	 */
+	*totalrows = (double) (totaltupcount - totaldeadtupcount);
+	*totaldeadrows = (double) totaldeadtupcount;
 
-	if (!aoscan->aos_done_all_segfiles
-		&& aoscan->targetTupleId == aoscan->nextTupleId)
+	/* Prepare for sampling row numbers */
+	RowSamplerData rs;
+	RowSampler_Init(&rs, *totalrows, targrows, random());
+
+	while (RowSampler_HasMore(&rs))
 	{
-		ret = appendonly_getnextslot(scan, ForwardScanDirection, slot);
-		aoscan->nextTupleId++;
+		aoscan->targrow = RowSampler_Next(&rs);
+
+		vacuum_delay_point();
 
-		if (ret)
-			*liverows += 1;
+		if (appendonly_get_target_tuple(aoscan, aoscan->targrow, slot))
+		{
+			rows[numrows++] = ExecCopySlotHeapTuple(slot);
+			liverows++;
+		}
 		else
-			*deadrows += 1;	/* if return an invisible tuple */
+			deadrows++;
+
+		ExecClearTuple(slot);
 	}
 
-	return ret;
+	ExecDropSingleTupleTableSlot(slot);
+	table_endscan(scan);
+
+	/*
+	 * Emit some interesting relation info
+	 */
+	ereport(elevel,
+			(errmsg("\"%s\": scanned " INT64_FORMAT " rows, "
+					"containing %.0f live rows and %.0f dead rows; "
+					"%d rows in sample, %.0f total live rows, "
+					"%.0f total dead rows",
+					RelationGetRelationName(onerel),
+					rs.m, liverows, deadrows, numrows,
+					*totalrows, *totaldeadrows)));
+
+	return numrows;
 }
 
 static double
@@ -2278,6 +2343,7 @@ static const TableAmRoutine ao_row_methods = {
 	.relation_vacuum = appendonly_vacuum_rel,
 	.scan_analyze_next_block = appendonly_scan_analyze_next_block,
 	.scan_analyze_next_tuple = appendonly_scan_analyze_next_tuple,
+	.relation_acquire_sample_rows = appendonly_acquire_sample_rows,
 
 	.index_build_range_scan = appendonly_index_build_range_scan,
 	.index_validate_scan = appendonly_index_validate_scan,
@@ -2293,7 +2359,6 @@ static const TableAmRoutine ao_row_methods = {
 	.scan_bitmap_next_tuple = appendonly_scan_bitmap_next_tuple,
 	.scan_sample_next_block = appendonly_scan_sample_next_block,
 	.scan_sample_next_tuple = appendonly_scan_sample_next_tuple,
-	.acquire_sample_rows = acquire_sample_rows,
 
 	.amoptions = ao_amoptions,
 	.swap_relation_files = appendonly_swap_relation_files,
diff --git a/src/backend/access/appendonly/appendonlyblockdirectory.c b/src/backend/access/appendonly/appendonlyblockdirectory.c
index b6a9b17991d..2f9a9f4522a 100644
--- a/src/backend/access/appendonly/appendonlyblockdirectory.c
+++ b/src/backend/access/appendonly/appendonlyblockdirectory.c
@@ -154,6 +154,8 @@ init_internal(AppendOnlyBlockDirectory *blockDirectory)
 		ItemPointerSetInvalid(&minipageInfo->tupleTid);
 	}
 
+	blockDirectory->cached_mpentry_num = InvalidEntryNum;
+
 	MemoryContextSwitchTo(oldcxt);
 }
 
@@ -515,6 +517,32 @@ set_directoryentry_range(
 	return true;
 }
 
+/*
+ * AppendOnlyBlockDirectory_GetCachedEntry
+ *
+ * Return the cached minipage entry, to avoid
+ * double scans of the block directory.
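+ *
+ * (The cache, cached_mpentry_num, is populated by a preceding
+ * AOBlkDirScan_GetRowNum() call and is only meaningful for the
+ * minipage currently loaded for this segfile and column group.)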
+ */
+static inline bool
+AppendOnlyBlockDirectory_GetCachedEntry(
+	AppendOnlyBlockDirectory *blockDirectory,
+	int segmentFileNum,
+	int columnGroupNo,
+	AppendOnlyBlockDirectoryEntry *directoryEntry)
+{
+	MinipagePerColumnGroup *minipageInfo PG_USED_FOR_ASSERTS_ONLY = &blockDirectory->minipages[columnGroupNo];
+
+	Assert(blockDirectory->cached_mpentry_num != InvalidEntryNum);
+	Assert(segmentFileNum == blockDirectory->currentSegmentFileNum);
+	Assert(blockDirectory->currentSegmentFileInfo != NULL);
+	Assert(minipageInfo->numMinipageEntries > 0);
+
+	return set_directoryentry_range(blockDirectory,
+									columnGroupNo,
+									blockDirectory->cached_mpentry_num,
+									directoryEntry);
+}
+
 /*
  * AppendOnlyBlockDirectory_GetEntry
  *
@@ -547,7 +575,7 @@ AppendOnlyBlockDirectory_GetEntry(
 	HeapTuple	tuple = NULL;
 	MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo];
-	int			entry_no = -1;
+	int			entry_no = InvalidEntryNum;
 	int			tmpGroupNo;
 
 	if (blkdirRel == NULL || blkdirIdx == NULL)
@@ -567,6 +595,21 @@
 						"(%d, %d, " INT64_FORMAT ")",
 						columnGroupNo, segmentFileNum, rowNum)));
 
+	/*
+	 * We enable minipage entry caching only for ao_row.
+	 *
+	 * ao_column requires all column values, but the entry returned
+	 * here is cached for only one column, so aocs_fetch() cannot
+	 * avoid scanning the blkdir again to extract the entries of all
+	 * the other columns when constructing the whole tuple.
+	 */
+	if (!blockDirectory->isAOCol && blockDirectory->cached_mpentry_num != InvalidEntryNum)
+		return AppendOnlyBlockDirectory_GetCachedEntry(blockDirectory,
+													   segmentFileNum,
+													   columnGroupNo,
+													   directoryEntry);
+
 	/*
 	 * If the segment file number is the same as
 	 * blockDirectory->currentSegmentFileNum, the in-memory minipage may
@@ -591,7 +634,8 @@
 	entry_no = find_minipage_entry(minipageInfo->minipage,
 								   minipageInfo->numMinipageEntries,
 								   rowNum);
-	if (entry_no != -1)
+
+	if (entry_no != InvalidEntryNum)
 	{
 		return set_directoryentry_range(blockDirectory,
 										columnGroupNo,
@@ -883,7 +927,7 @@ blkdir_entry_exists(AppendOnlyBlockDirectory *blockDirectory,
 		entry_no = find_minipage_entry(minipageInfo->minipage,
 									   minipageInfo->numMinipageEntries,
 									   rowNum);
-		if (entry_no != -1)
+		if (entry_no != InvalidEntryNum)
 		{
 			found = true;
 			break;
@@ -1282,7 +1326,7 @@ find_minipage_entry(Minipage *minipage,
 	if (start_no <= end_no)
 		return entry_no;
 	else
-		return -1;
+		return InvalidEntryNum;
 }
 
 /*
@@ -1620,3 +1664,126 @@ AppendOnlyBlockDirectory_End_forUniqueChecks(AppendOnlyBlockDirectory *blockDire
 
 	MemoryContextDelete(blockDirectory->memoryContext);
 }
+
+/*
+ * AOBlkDirScan_GetRowNum
+ *
+ * Given a target logical row number,
+ * returns the corresponding physical rowNum,
+ * or -1 if not found.
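+ *
+ * For example, given a minipage entry with firstRowNum = 1001 and
+ * rowCount = 50, and *startrow = 980, targrow = 1000 falls inside that
+ * entry and maps to rowNum 1001 + (1000 - 980) = 1021.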
+ *
+ * targrow: 0-based target logical row number
+ * startrow: pointer to the starting row used when stepping toward targrow
+ * targsegno: the segfile number in which targrow is located
+ * colgroupno: current column group number, always 0 for ao_row
+ */
+int64
+AOBlkDirScan_GetRowNum(AOBlkDirScan blkdirscan,
+					   int targsegno,
+					   int colgroupno,
+					   int64 targrow,
+					   int64 *startrow)
+{
+	HeapTuple	tuple;
+	TupleDesc	tupdesc;
+	AppendOnlyBlockDirectory *blkdir = blkdirscan->blkdir;
+	int			mpentryi;
+	MinipagePerColumnGroup *mpinfo;
+	int64		rownum = -1;
+
+	Assert(targsegno >= 0);
+	Assert(blkdir != NULL);
+
+	if (blkdirscan->segno != targsegno || blkdirscan->colgroupno != colgroupno)
+	{
+		if (blkdirscan->sysscan != NULL)
+			systable_endscan_ordered(blkdirscan->sysscan);
+
+		ScanKeyData scankeys[2];
+
+		ScanKeyInit(&scankeys[0],
+					Anum_pg_aoblkdir_segno,
+					BTEqualStrategyNumber,
+					F_INT4EQ,
+					Int32GetDatum(targsegno));
+
+		ScanKeyInit(&scankeys[1],
+					Anum_pg_aoblkdir_columngroupno,
+					BTEqualStrategyNumber,
+					F_INT4EQ,
+					Int32GetDatum(colgroupno));
+
+		blkdirscan->sysscan = systable_beginscan_ordered(blkdir->blkdirRel,
+														 blkdir->blkdirIdx,
+														 blkdir->appendOnlyMetaDataSnapshot,
+														 2, /* nkeys */
+														 scankeys);
+		blkdirscan->segno = targsegno;
+		blkdirscan->colgroupno = colgroupno;
+		/* reset to InvalidEntryNum for the new minipage entry array */
+		blkdir->cached_mpentry_num = InvalidEntryNum;
+	}
+
+	mpentryi = blkdir->cached_mpentry_num;
+	mpinfo = &blkdir->minipages[colgroupno];
+
+	while (true)
+	{
+		if (mpentryi == InvalidEntryNum)
+		{
+			tuple = systable_getnext_ordered(blkdirscan->sysscan, ForwardScanDirection);
+			if (HeapTupleIsValid(tuple))
+			{
+				tupdesc = RelationGetDescr(blkdir->blkdirRel);
+				extract_minipage(blkdir, tuple, tupdesc, colgroupno);
+				/* new minipage */
+				mpentryi = 0;
+				mpinfo = &blkdir->minipages[colgroupno];
+			}
+			else
+			{
+				/* done with this <segno, colgroupno> */
+				systable_endscan_ordered(blkdirscan->sysscan);
+				blkdirscan->sysscan = NULL;
+				blkdirscan->segno = -1;
+				blkdirscan->colgroupno = 0;
+				/* always update both variables in pairs, for safety */
+				mpentryi = InvalidEntryNum;
+				mpinfo = NULL;
+
+				goto out;
+			}
+		}
+
+		Assert(mpentryi != InvalidEntryNum);
+		Assert(mpinfo != NULL);
+
+		for (int i = mpentryi; i < mpinfo->numMinipageEntries; i++)
+		{
+			Minipage   *mp = mpinfo->minipage;
+			MinipageEntry *entry = &(mp->entry[i]);
+
+			Assert(entry->firstRowNum > 0);
+			Assert(entry->rowCount > 0);
+
+			if (*startrow + entry->rowCount - 1 >= targrow)
+			{
+				rownum = entry->firstRowNum + (targrow - *startrow);
+				mpentryi = i;
+				goto out;
+			}
+
+			*startrow += entry->rowCount;
+		}
+
+		/* done with this minipage */
+		mpentryi = InvalidEntryNum;
+		mpinfo = NULL;
+	}
+
+out:
+	/* record the result of the minipage entry lookup */
+	blkdir->cached_mpentry_num = mpentryi;
+
+	return rownum;
+}
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 22314d15b07..4535f2fdaba 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2686,7 +2686,6 @@ static const TableAmRoutine heapam_methods = {
 	.scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
 	.scan_sample_next_block = heapam_scan_sample_next_block,
 	.scan_sample_next_tuple = heapam_scan_sample_next_tuple,
-	.acquire_sample_rows = acquire_sample_rows,
 
 	.amoptions = heapam_amoptions,
 
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 7a6428627f6..e06dbea2870 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -183,13 +183,22 @@ static void compute_index_stats(Relation onerel, double totalrows,
 			MemoryContext col_context);
 static VacAttrStats *examine_attribute(Relation onerel, int attnum,
 									   Node *index_expr, int elevel);
+static int	acquire_sample_rows(Relation onerel, int elevel,
+								HeapTuple *rows, int targrows,
+								double *totalrows, double *totaldeadrows);
 static int	acquire_sample_rows_dispatcher(Relation onerel, bool inh, int elevel,
 										   HeapTuple *rows, int targrows,
 										   double *totalrows, double *totaldeadrows);
+static int	gp_acquire_sample_rows_func(Relation onerel, int elevel,
+										HeapTuple *rows, int targrows,
+										double *totalrows, double *totaldeadrows);
 static BlockNumber acquire_index_number_of_blocks(Relation indexrel, Relation tablerel);
 static void gp_acquire_correlations_dispatcher(Oid relOid, bool inh, float4 *correlations, bool *correlationsIsNull);
 static int	compare_rows(const void *a, const void *b);
+static int	acquire_inherited_sample_rows(Relation onerel, int elevel,
+										  HeapTuple *rows, int targrows,
+										  double *totalrows, double *totaldeadrows);
 static void update_attstats(Oid relid, bool inh,
 							int natts, VacAttrStats **vacattrstats);
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
@@ -349,16 +358,11 @@ analyze_rel_internal(Oid relid, RangeVar *relation,
 			 onerel->rd_rel->relkind == RELKIND_MATVIEW ||
 			 onerel->rd_rel->relkind == RELKIND_DIRECTORY_TABLE)
 	{
-		/* Regular table, so we'll use the regular row acquisition function */
-		if (onerel->rd_tableam)
-			acquirefunc = onerel->rd_tableam->acquire_sample_rows;
-
 		/*
-		 * If the TableAmRoutine's acquire_sample_rows if NULL, we use
-		 * acquire_sample_rows as default.
+		 * Use gp_acquire_sample_rows_func for all relations; it dispatches
+		 * to the segments where appropriate and routes AO/CO tables to the
+		 * table AM's relation_acquire_sample_rows() callback.
 		 */
-		if (acquirefunc == NULL)
-			acquirefunc = acquire_sample_rows;
+		acquirefunc = gp_acquire_sample_rows_func;
 
 		/* Also get regular table's size */
 		relpages = AcquireNumberOfBlocks(onerel);
@@ -1608,6 +1612,32 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr, int elevel)
 	return stats;
 }
 
+/*
+ * GPDB: If we are the dispatcher, then issue analyze on the segments and
+ * collect the statistics from them. Otherwise, route AO/CO tables to the
+ * table AM sampling callback, and fall back to the local
+ * acquire_sample_rows() for everything else.
+ */
+static int
+gp_acquire_sample_rows_func(Relation onerel, int elevel,
+							HeapTuple *rows, int targrows,
+							double *totalrows, double *totaldeadrows)
+{
+	if (Gp_role == GP_ROLE_DISPATCH &&
+		onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy))
+	{
+		/* Fetch sample from the segments. */
+		return acquire_sample_rows_dispatcher(onerel, false, elevel,
+											  rows, targrows,
+											  totalrows, totaldeadrows);
+	}
+
+	if (RelationIsAppendOptimized(onerel))
+		return table_relation_acquire_sample_rows(onerel, elevel, rows,
+												  targrows, totalrows, totaldeadrows);
+
+	return acquire_sample_rows(onerel, elevel, rows,
+							   targrows, totalrows, totaldeadrows);
+}
+
 /*
  * acquire_sample_rows -- acquire a random sample of rows from the table
  *
@@ -1640,14 +1670,8 @@ examine_attribute(Relation onerel, int attnum, Node *index_expr, int elevel)
 * unbiased estimates of the average numbers of live and dead rows per
 * block. The previous sampling method put too much credence in the row
 * density near the start of the table.
- *
- * The returned list of tuples is in order by physical position in the table.
- * (We will rely on this later to derive correlation estimates.)
- *
- * GPDB: If we are the dispatcher, then issue analyze on the segments and
- * collect the statistics from them.
 */
-int
+static int
 acquire_sample_rows(Relation onerel, int elevel,
 					HeapTuple *rows, int targrows,
 					double *totalrows, double *totaldeadrows)
@@ -1673,27 +1697,23 @@ acquire_sample_rows(Relation onerel, int elevel,
 
 	Assert(targrows > 0);
 
-	if (Gp_role == GP_ROLE_DISPATCH &&
-		onerel->rd_cdbpolicy && !GpPolicyIsEntry(onerel->rd_cdbpolicy))
-	{
-		/* Fetch sample from the segments. */
-		return acquire_sample_rows_dispatcher(onerel, false, elevel,
-											  rows, targrows,
-											  totalrows, totaldeadrows);
-	}
-
 	/*
-	 * GPDB: Analyze does make a lot of assumptions regarding the file layout of a
-	 * relation. These assumptions are heap specific and do not hold for AO/AOCO
-	 * relations. In the case of AO/AOCO, what is actually needed and used instead
-	 * of number of blocks, is number of tuples.
-	 *
-	 * GPDB_12_MERGE_FIXME: BlockNumber is uint32 and Number of tuples is uint64.
-	 * That means that after row number UINT_MAX we will never analyze the table.
+	 * GPDB: Legacy analyze makes a lot of assumptions regarding the file
+	 * layout of a relation. These assumptions are heap specific and do not
+	 * hold for AO/AOCO relations. In the case of AO/AOCO, what is actually
+	 * needed and used, instead of the number of blocks, is the number of
+	 * tuples. Moreover, BlockNumber is uint32 while the number of tuples is
+	 * uint64, which means that after row number UINT_MAX we would never
+	 * analyze the table.
+	 *
+	 * We introduced a tuple-based sampling approach for AO/CO tables to
+	 * address the above problems; all of the corresponding logic was moved
+	 * out of here into table_relation_acquire_sample_rows(). So we leave an
+	 * assertion here to ensure the relation is not an AO/CO table.
 	 */
-	if (RelationIsNonblockRelation(onerel))
+	Assert(!RelationIsAppendOptimized(onerel));
+	if (RelationIsPax(onerel))
 	{
-		/* AO/CO/PAX use non-fixed block layout */
+		/* PAX uses a non-fixed block layout */
 		BlockNumber pages;
 		double		tuples;
 		double		allvisfrac;
@@ -1897,13 +1917,13 @@ acquire_sample_rows(Relation onerel, int elevel,
 	 * Emit some interesting relation info
 	 */
 	ereport(elevel,
-			(errmsg("\"%s\": scanned %d of %u pages, "
-					"containing %.0f live rows and %.0f dead rows; "
-					"%d rows in sample, %.0f estimated total rows",
-					RelationGetRelationName(onerel),
-					bs.m, totalblocks,
-					liverows, deadrows,
-					numrows, *totalrows)));
+		(errmsg("\"%s\": scanned %d of %u pages, "
+				"containing %.0f live rows and %.0f dead rows; "
+				"%d rows in sample, %.0f estimated total rows",
+				RelationGetRelationName(onerel),
+				bs.m, totalblocks,
+				liverows, deadrows,
+				numrows, *totalrows)));
 
 	return numrows;
 }
@@ -1918,17 +1938,12 @@ compare_rows(const void *a, const void *b)
 	HeapTuple	hb = *(const HeapTuple *) b;
 
 	/*
-	 * GPDB_12_MERGE_FIXME: For AO/AOCO tables, blocknumber does not have a
-	 * meaning and is not set. The current implementation of analyze makes
-	 * assumptions about the file layout which do not hold for these two cases.
-	 * The compare function should maintain the row order as consrtucted, hence
-	 * return 0;
-	 *
+	 * For AO/AOCO tables, blocknumber does not have a meaning and is not set.
+	 * We keep this workaround here so that legacy AO/CO analyze still works.
 	 * There should be no apparent and measurable perfomance hit from calling
 	 * this function.
-	 *
-	 * One possible proper fix is to refactor analyze to use the tableam api and
-	 * this sorting should move to the specific implementation.
+	 *
+	 * AO/CO fast analyze no longer reaches this code.
 */
 	if (!BlockNumberIsValid(ItemPointerGetBlockNumberNoCheck(&ha->t_self)))
 		return 0;
@@ -2051,16 +2066,8 @@ acquire_inherited_sample_rows(Relation onerel, int elevel,
 				 childrel->rd_rel->relkind == RELKIND_MATVIEW ||
 				 childrel->rd_rel->relkind == RELKIND_DIRECTORY_TABLE)
 		{
-			/* Regular table, so use the regular row acquisition function */
-			if (childrel->rd_tableam)
-				acquirefunc = childrel->rd_tableam->acquire_sample_rows;
-
-			/*
-			 * If the TableAmRoutine's acquire_sample_rows if NULL, we use
-			 * acquire_sample_rows as default.
-			 */
-			if (acquirefunc == NULL)
-				acquirefunc = acquire_sample_rows;
+			/* use gp_acquire_sample_rows_func for all relation kinds */
+			acquirefunc = gp_acquire_sample_rows_func;
 
 			relpages = AcquireNumberOfBlocks(childrel);
 		}
diff --git a/src/backend/commands/analyzefuncs.c b/src/backend/commands/analyzefuncs.c
index 8c9fd56199d..b97bbee048c 100644
--- a/src/backend/commands/analyzefuncs.c
+++ b/src/backend/commands/analyzefuncs.c
@@ -27,6 +27,7 @@
 #include "miscadmin.h"
 #include "funcapi.h"
 #include "utils/syscache.h"
+#include "utils/faultinjector.h"
 
 /**
 * Statistics related parameters.
@@ -303,6 +304,8 @@ gp_acquire_sample_rows(PG_FUNCTION_ARGS)
 
 		ctx->index++;
 
+		SIMPLE_FAULT_INJECTOR("returned_sample_row");
+
 		SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(res));
 	}
 	else if (!ctx->summary_sent)
diff --git a/src/backend/utils/datumstream/datumstream.c b/src/backend/utils/datumstream/datumstream.c
index 42f374d1498..53addc99a95 100644
--- a/src/backend/utils/datumstream/datumstream.c
+++ b/src/backend/utils/datumstream/datumstream.c
@@ -876,6 +876,7 @@ datumstreamread_close_file(DatumStreamRead * ds)
 {
 	AppendOnlyStorageRead_CloseFile(&ds->ao_read);
 
+	ds->blockRowCount = 0;
 	ds->need_close_file = false;
 }
 
@@ -1129,7 +1130,7 @@ datumstreamwrite_lob(DatumStreamWrite * acc,
 	return varLen;
 }
 
-static bool
+bool
 datumstreamread_block_info(DatumStreamRead * acc)
 {
 	bool		readOK = false;
diff --git a/src/backend/utils/misc/sampling.c b/src/backend/utils/misc/sampling.c
index 40e47487c21..58361eb2bff 100644
--- a/src/backend/utils/misc/sampling.c
+++ b/src/backend/utils/misc/sampling.c
@@ -116,7 +116,11 @@ BlockSampler_Next(BlockSampler bs)
 }
 
 /*
- * This is a 64 bit version of BlockSampler.
+ * This is used for sampling AO/CO row numbers, in the flattened
+ * row number space, across all segfile tuple counts. 64 bits is
+ * used for simplicity and is sufficient to hold a maximum tuple
+ * count of (AOTupleId_MaxSegmentFileNum * AOTupleId_MaxRowNum),
+ * which can be represented with 48 bits.
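+ * (AOTupleId_MaxSegmentFileNum is below 2^7 and AOTupleId_MaxRowNum is
+ * below 2^40, so the product stays under 2^47.)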
 *
 * The code is same as BlockSampler except replacing
 * int type of variables with int64, which is intended
 *
 */
 void
 RowSampler_Init(RowSampler rs, int64 nobjects, int64 samplesize,
-				long randseed)
+				long randseed)
 {
 	rs->N = nobjects;			/* measured table size */
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 61948090e22..4a877e857b1 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -751,6 +751,13 @@ typedef struct TableAmRoutine
 											  double *liverows,
 											  double *deadrows,
 											  TupleTableSlot *slot);
+
+	int			(*relation_acquire_sample_rows) (Relation onerel,
+												 int elevel,
+												 HeapTuple *rows,
+												 int targrows,
+												 double *totalrows,
+												 double *totaldeadrows);
 
 	/* see table_index_build_range_scan for reference about parameters */
 	double		(*index_build_range_scan) (Relation table_rel,
@@ -948,15 +955,6 @@ typedef struct TableAmRoutine
 										   struct SampleScanState *scanstate,
 										   TupleTableSlot *slot);
 
-	/*
-	 * Like scan_sample_next_block and scan_sample_next_tuple, this callback
-	 * is also used to sample tuple rows. As different storage maybe need to
-	 * use different acquire sample rows process, we extend an interface to
-	 * achieve this requirement.
-	 */
-	int			(*acquire_sample_rows) (Relation onerel, int elevel,
-										HeapTuple *rows, int targrows,
-										double *totalrows, double *totaldeadrows);
 
 	/*
 	 * This callback is used to parse reloptions for relation/matview/toast.
 	 */
@@ -1903,6 +1901,38 @@ table_relation_vacuum(Relation rel, struct VacuumParams *params,
 	rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
 }
 
+/*
+ * GPDB: Interface to acquire sample rows from a given relation (currently
+ * AO/CO).
+ *
+ * Selected rows are returned in the caller-allocated array rows[], which
+ * must have space to hold at least targrows entries.
+ *
+ * The actual number of rows selected is returned as the function result.
+ * We also estimate the total numbers of live and dead rows in the table,
+ * and return them into *totalrows and *totaldeadrows, respectively.
+ *
+ * The returned list of tuples is in order by physical position in the table.
+ * (We will rely on this later to derive correlation estimates.)
+ *
+ * Note: this interface is not used by upstream code.
+ * The upstream interface (implemented by heap) uses a 2-stage sampling method
+ * using table_scan_analyze_next_block() and table_scan_analyze_next_tuple().
+ * See acquire_sample_rows(). Since this upstream method does not suit AO/CO
+ * tables, we have created the relation_acquire_sample_rows() interface.
+ *
+ * Note for future merges:
+ * We have to keep this interface consistent with acquire_sample_rows().
+ */
+static inline int
+table_relation_acquire_sample_rows(Relation rel, int elevel, HeapTuple *rows,
+								   int targrows, double *totalrows, double *totaldeadrows)
+{
+	return rel->rd_tableam->relation_acquire_sample_rows(rel, elevel, rows,
+														 targrows, totalrows,
+														 totaldeadrows);
+}
+
 /*
 * Prepare to analyze block `blockno` of `scan`. The scan needs to have been
 * started with table_beginscan_analyze(). Note that this routine might
diff --git a/src/include/cdb/cdbaocsam.h b/src/include/cdb/cdbaocsam.h
index b4a4653bfd0..ab990e2bf01 100644
--- a/src/include/cdb/cdbaocsam.h
+++ b/src/include/cdb/cdbaocsam.h
@@ -105,6 +105,57 @@ enum AOCSScanDescIdentifier
 	AOCSBITMAPSCANDATA	/* am private */
 };
 
+/*
+ * Used to fetch individual tuples, specified by TID, from append-only
+ * relations using the AO Block Directory.
+ */
+typedef struct AOCSFetchDescData
+{
+	Relation	relation;
+	Snapshot	appendOnlyMetaDataSnapshot;
+
+	/*
+	 * Snapshot to use for non-metadata operations.
+	 * Usually snapshot = appendOnlyMetaDataSnapshot, but they
+	 * differ e.g. if gp_select_invisible is set.
+	 */
+	Snapshot	snapshot;
+
+	MemoryContext initContext;
+
+	int			totalSegfiles;
+	struct AOCSFileSegInfo **segmentFileInfo;
+
+	/*
+	 * Array containing the maximum row number in each aoseg (to be consulted
+	 * during fetch). This is a sparse array as not all segments are involved
+	 * in a scan. Sparse entries are marked with InvalidAORowNum.
+	 *
+	 * Note:
+	 * If we have no updates and deletes, the total_tupcount is equal to the
+	 * maximum row number. But after some updates and deletes, the maximum row
+	 * number is always much bigger than total_tupcount, so this carries the
+	 * last sequence from gp_fastsequence.
+	 */
+	int64		lastSequence[AOTupleId_MultiplierSegmentFileNum];
+
+	char	   *segmentFileName;
+	int			segmentFileNameMaxLen;
+	char	   *basepath;
+
+	AppendOnlyBlockDirectory blockDirectory;
+
+	DatumStreamFetchDesc *datumStreamFetchDesc;
+
+	int64		skipBlockCount;
+
+	AppendOnlyVisimap visibilityMap;
+
+	Oid			segrelid;
+} AOCSFetchDescData;
+
+typedef AOCSFetchDescData *AOCSFetchDesc;
+
 /*
 * Used for scan of appendoptimized column oriented relations, should be used in
 * the tableam api related code and under it.
@@ -118,19 +169,41 @@ typedef struct AOCSScanDescData
 
 	/* synthetic system attributes */
 	ItemPointerData cdb_fake_ctid;
-	int64		total_row;
-	int64		segrowsprocessed;
 
 	/*
-	 * Only used by `analyze`
+	 * Used by `analyze`
 	 */
-	int64		nextTupleId;
-	int64		targetTupleId;
 
 	/*
-	 * Only used by `sample scan`
-	 */
+	 * targrow: the output of the row-based sampler (Algorithm S); denotes a
+	 * row number (starting from 0) in the flattened row number space that is
+	 * the target of a sample.
+	 * In other words, if seg0 has rownums [1, 100] and seg1 has rownums
+	 * [1, 200], then targrow = 150 refers to seg1's rownum = 51.
+	 */
+	int64		targrow;
+
+	/*
+	 * segfirstrow: points to the next starting row, used to measure the
+	 * distance to `targrow`
+	 */
+	int64		segfirstrow;
+
+	/*
+	 * segrowsprocessed: tracks the rows processed within the current segfile.
+	 * Be sure to update it accordingly whenever "segfirstrow" is updated.
+	 */
+	int64		segrowsprocessed;
+
+	AOBlkDirScan blkdirscan;
+	AOCSFetchDesc aocsfetch;
+	bool	   *proj;
+
+	/*
+	 * Only used by `sample scan`
+	 */
 	int64		fetchTupleId;
+	int64		nextTupleId;
 	int64		totalTuples;
 
 	/*
 	 * Part of the struct to be used only inside aocsam.c
@@ -212,57 +285,6 @@ typedef struct AOCSScanDescData
 
 typedef AOCSScanDescData *AOCSScanDesc;
 
-/*
- * Used for fetch individual tuples from specified by TID of append only relations
- * using the AO Block Directory.
- */
-typedef struct AOCSFetchDescData
-{
-	Relation	relation;
-	Snapshot	appendOnlyMetaDataSnapshot;
-
-	/*
-	 * Snapshot to use for non-metadata operations.
-	 * Usually snapshot = appendOnlyMetaDataSnapshot, but they
-	 * differ e.g. if gp_select_invisible is set.
-	 */
-	Snapshot	snapshot;
-
-	MemoryContext initContext;
-
-	int			totalSegfiles;
-	struct AOCSFileSegInfo **segmentFileInfo;
-
-	/*
-	 * Array containing the maximum row number in each aoseg (to be consulted
-	 * during fetch). This is a sparse array as not all segments are involved
-	 * in a scan. Sparse entries are marked with InvalidAORowNum.
-	 *
-	 * Note:
-	 * If we have no updates and deletes, the total_tupcount is equal to the
-	 * number is always much bigger than total_tupcount, so this carries the
-	 * last sequence from gp_fastsequence.
-	 */
-	int64		lastSequence[AOTupleId_MultiplierSegmentFileNum];
-
-	char	   *segmentFileName;
-	int			segmentFileNameMaxLen;
-	char	   *basepath;
-
-	AppendOnlyBlockDirectory blockDirectory;
-
-	DatumStreamFetchDesc *datumStreamFetchDesc;
-
-	int64		skipBlockCount;
-
-	AppendOnlyVisimap visibilityMap;
-
-	Oid			segrelid;
-} AOCSFetchDescData;
-
-typedef AOCSFetchDescData *AOCSFetchDesc;
-
 /*
  * AOCSDeleteDescData is used for delete data from AOCS relations.
  * It serves an equivalent purpose as AppendOnlyScanDescData
@@ -397,6 +419,8 @@ extern void aocs_addcol_emptyvpe(
 extern void aocs_addcol_setfirstrownum(AOCSAddColumnDesc desc,
 		int64 firstRowNum);
 
+extern bool aocs_get_target_tuple(AOCSScanDesc aoscan, int64 targrow, TupleTableSlot *slot);
+
 extern void aoco_dml_init(Relation relation, CmdType operation);
 extern void aoco_dml_finish(Relation relation, CmdType operation);
 
@@ -423,5 +447,21 @@ AOCSScanDesc_UpdateTotalBytesRead(AOCSScanDesc scan, AttrNumber attno)
 		scan->totalBytesRead += scan->columnScanInfo.ds[attno]->ao_read.current.uncompressedLen;
 }
 
+static inline int64
+AOCSScanDesc_TotalTupCount(AOCSScanDesc scan)
+{
+	Assert(scan != NULL);
+
+	int64		totalrows = 0;
+	AOCSFileSegInfo **seginfo = scan->seginfo;
+
+	for (int i = 0; i < scan->total_seg; i++)
+	{
+		if (seginfo[i]->state != AOSEG_STATE_AWAITING_DROP)
+			totalrows += seginfo[i]->total_tupcount;
+	}
+
+	return totalrows;
+}
 #endif   /* AOCSAM_H */
diff --git a/src/include/cdb/cdbappendonlyam.h b/src/include/cdb/cdbappendonlyam.h
index 125c1de4d23..58fc77d8f5c 100644
--- a/src/include/cdb/cdbappendonlyam.h
+++ b/src/include/cdb/cdbappendonlyam.h
@@ -141,6 +141,7 @@ typedef struct AppendOnlyExecutorReadBlock
 	int			segmentFileNum;
 	int64		totalRowsScanned;
+	int64		blockRowsProcessed;
 
 	int64		blockFirstRowNum;
 	int64		headerOffsetInFile;
@@ -236,10 +237,31 @@ typedef struct AppendOnlyScanDescData
 	AppendOnlyVisimap visibilityMap;
 
 	/*
-	 * Only used by `analyze`
+	 * used by `analyze`
 	 */
-	int64		nextTupleId;
-	int64		targetTupleId;
+
+	/*
+	 * targrow: the output of the row-based sampler (Algorithm S); denotes a
+	 * row number in the flattened row-number space that is the target of a
+	 * sample (numbering starts from 0).
+	 * For example, with seg0 rownums [1, 100] and seg1 rownums [1, 200],
+	 * targrow = 150 refers to seg1's rownum = 51.
+	 */
+	int64		targrow;
+
+	/*
+	 * segfirstrow: the flattened row number at which the current candidate
+	 * segfile starts; used to check the distance to `targrow`.
+	 */
+	int64		segfirstrow;
+
+	/*
+	 * segrowsprocessed: tracks the number of rows already processed within
+	 * the current segfile. It must be updated whenever "segfirstrow" is
+	 * updated.
+	 */
+	int64		segrowsprocessed;
+
+	AOBlkDirScan blkdirscan;
 
 	/* scan current state */
 	BlockNumber rs_nblocks;	/* current block */
@@ -454,6 +476,9 @@ extern bool appendonly_getnextslot(TableScanDesc scan,
 								   ScanDirection direction,
 								   TupleTableSlot *slot);
 extern uint32 appendonly_scan_flags(Relation relation);
+extern bool appendonly_get_target_tuple(AppendOnlyScanDesc aoscan,
+										int64 targrow,
+										TupleTableSlot *slot);
 extern AppendOnlyFetchDesc appendonly_fetch_init(
 	Relation relation,
 	Snapshot snapshot,
@@ -514,4 +539,21 @@ AppendOnlyScanDesc_UpdateTotalBytesRead(AppendOnlyScanDesc scan)
 		scan->totalBytesRead += scan->storageRead.current.uncompressedLen;
 }
 
+static inline int64
+AppendOnlyScanDesc_TotalTupCount(AppendOnlyScanDesc scan)
+{
+	Assert(scan != NULL);
+
+	int64		totalrows = 0;
+	FileSegInfo **seginfo = scan->aos_segfile_arr;
+
+	for (int i = 0; i < scan->aos_total_segfiles; i++)
+	{
+		if (seginfo[i]->state != AOSEG_STATE_AWAITING_DROP)
+			totalrows += seginfo[i]->total_tupcount;
+	}
+
+	return totalrows;
+}
+
 #endif   /* CDBAPPENDONLYAM_H */
diff --git a/src/include/cdb/cdbappendonlyblockdirectory.h b/src/include/cdb/cdbappendonlyblockdirectory.h
index 425cb9f7312..f90f7849fe0 100644
--- a/src/include/cdb/cdbappendonlyblockdirectory.h
+++ b/src/include/cdb/cdbappendonlyblockdirectory.h
@@ -95,6 +95,8 @@ typedef struct MinipagePerColumnGroup
 #define IsMinipageFull(minipagePerColumnGroup) \
 	((minipagePerColumnGroup)->numMinipageEntries == (uint32) gp_blockdirectory_minipage_size)
 
+#define InvalidEntryNum (-1)
+
 /*
  * Define a structure for the append-only relation block directory.
 */
@@ -135,6 +137,14 @@ typedef struct AppendOnlyBlockDirectory
 	ScanKey		scanKeys;
 	StrategyNumber *strategyNumbers;
 
+	/*
+	 * Minipage entry number, for caching purposes.
+	 *
+	 * XXX: scenarios which call AppendOnlyBlockDirectory_GetEntry()
+	 * may need to consider using this cache.
+	 */
+	int			cached_mpentry_num;
+
 } AppendOnlyBlockDirectory;
 
@@ -169,10 +179,16 @@ typedef struct AOFetchSegmentFile
 	int64		logicalEof;
 } AOFetchSegmentFile;
 
-typedef struct AppendOnlyBlockDirectorySeqScan {
-	AppendOnlyBlockDirectory blkdir;
-	SysScanDesc sysScan;
-} AppendOnlyBlockDirectorySeqScan;
+/*
+ * Tracks block directory scan state for block-directory based ANALYZE.
+ */
+typedef struct AOBlkDirScanData
+{
+	AppendOnlyBlockDirectory *blkdir;
+	SysScanDesc sysscan;
+	int			segno;
+	int			colgroupno;
+} AOBlkDirScanData, *AOBlkDirScan;
 
 extern void AppendOnlyBlockDirectoryEntry_GetBeginRange(
 	AppendOnlyBlockDirectoryEntry *directoryEntry,
@@ -190,6 +206,12 @@ extern bool AppendOnlyBlockDirectory_GetEntry(
 	AOTupleId *aoTupleId,
 	int columnGroupNo,
 	AppendOnlyBlockDirectoryEntry *directoryEntry);
+extern int64 AOBlkDirScan_GetRowNum(
+	AOBlkDirScan blkdirscan,
+	int targsegno,
+	int colgroupno,
+	int64 targrow,
+	int64 *startrow);
 extern bool AppendOnlyBlockDirectory_CoversTuple(
 	AppendOnlyBlockDirectory *blockDirectory,
 	AOTupleId *aoTupleId);
@@ -243,6 +265,8 @@ extern void AppendOnlyBlockDirectory_DeleteSegmentFile(
 	int columnGroupNo);
 extern void AppendOnlyBlockDirectory_End_forUniqueChecks(
 	AppendOnlyBlockDirectory *blockDirectory);
+extern void AppendOnlyBlockDirectory_End_forSearch_InSequence(
+	AOBlkDirScan seqscan);
 extern void AppendOnlyBlockDirectory_InsertPlaceholder(AppendOnlyBlockDirectory *blockDirectory,
 													   int64 firstRowNum,
@@ -322,4 +346,35 @@ copy_out_minipage(MinipagePerColumnGroup *minipageInfo,
 	minipageInfo->numMinipageEntries = minipageInfo->minipage->nEntry;
 }
 
+static inline void
+AOBlkDirScan_Init(AOBlkDirScan blkdirscan,
+				  AppendOnlyBlockDirectory *blkdir)
+{
+	blkdirscan->blkdir = blkdir;
+	blkdirscan->sysscan = NULL;
+	blkdirscan->segno = -1;
+	blkdirscan->colgroupno = 0;
+}
+
+/* should be called before fetch_finish() */
+static inline void
+AOBlkDirScan_Finish(AOBlkDirScan blkdirscan)
+{
+	/*
+	 * Make sure blkdir hasn't been destroyed by fetch_finish() yet;
+	 * otherwise systable_endscan_ordered() would crash, because sysscan
+	 * holds the blkdir relation, which would already have been freed.
+	 */
+	Assert(blkdirscan->blkdir != NULL);
+
+	if (blkdirscan->sysscan != NULL)
+	{
+		systable_endscan_ordered(blkdirscan->sysscan);
+		blkdirscan->sysscan = NULL;
+	}
+	blkdirscan->segno = -1;
+	blkdirscan->colgroupno = 0;
+	blkdirscan->blkdir = NULL;
+}
+
 #endif
diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 87f19d4e087..8e99c8fd70c 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -434,13 +434,6 @@ extern double anl_random_fract(void);
 extern double anl_init_selection_state(int n);
 extern double anl_get_next_S(double t, int n, double *stateptr);
 
-extern int acquire_sample_rows(Relation onerel, int elevel,
-								HeapTuple *rows, int targrows,
-								double *totalrows, double *totaldeadrows);
-extern int acquire_inherited_sample_rows(Relation onerel, int elevel,
-										  HeapTuple *rows, int targrows,
-										  double *totalrows, double *totaldeadrows);
-
 /* in commands/analyzefuncs.c */
 extern Datum gp_acquire_sample_rows(PG_FUNCTION_ARGS);
 extern Datum gp_acquire_correlations(PG_FUNCTION_ARGS);
diff --git a/src/include/utils/datumstream.h b/src/include/utils/datumstream.h
index 5210159d8b6..278579fb60b 100644
--- a/src/include/utils/datumstream.h
+++ b/src/include/utils/datumstream.h
@@ -93,6 +93,7 @@ typedef struct DatumStreamRead
 	int64		blockFirstRowNum;
 	int64		blockFileOffset;
 	int			blockRowCount;
+	int			blockRowsProcessed;
 
 	AppendOnlyStorageRead ao_read;
 
@@ -322,8 +323,9 @@ extern bool datumstreamread_find_block(DatumStreamRead * datumStream,
 extern void *datumstreamread_get_upgrade_space(DatumStreamRead *datumStream,
 											   size_t len);
 
+extern bool datumstreamread_block_info(DatumStreamRead * acc);
 /*
- * MPP-17061: make sure datumstream_read_block_info was called first for the CO block
+ * MPP-17061: make sure datumstreamread_block_info was
called first for the CO block * before calling datumstreamread_block_content. */ extern void datumstreamread_block_content(DatumStreamRead * acc); diff --git a/src/include/utils/sampling.h b/src/include/utils/sampling.h index 9536b07432c..fa3c9aa2ff8 100644 --- a/src/include/utils/sampling.h +++ b/src/include/utils/sampling.h @@ -42,7 +42,7 @@ extern BlockNumber BlockSampler_Init(BlockSampler bs, BlockNumber nblocks, extern bool BlockSampler_HasMore(BlockSampler bs); extern BlockNumber BlockSampler_Next(BlockSampler bs); -/* 64 bit version of BlockSampler */ +/* 64 bit version of BlockSampler (used for sampling AO/CO table rows) */ typedef struct { int64 N; /* number of objects, known in advance */ diff --git a/src/test/isolation2/expected/ao_blkdir.out b/src/test/isolation2/expected/ao_blkdir.out index f202d8ec4da..0a1eb468592 100644 --- a/src/test/isolation2/expected/ao_blkdir.out +++ b/src/test/isolation2/expected/ao_blkdir.out @@ -319,6 +319,155 @@ COMMIT DROP TABLE ao_blkdir_test; DROP +-- Test `tupcount` in pg_aoseg == sum of number of `row_count` across all +-- aoblkdir entries for each segno. Test with commits, aborts and deletes. + +-- Case1: without VACUUM ANALYZE +CREATE TABLE ao_blkdir_test_rowcount(i int, j int) USING ao_row DISTRIBUTED BY (j); +CREATE +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +4: BEGIN; +BEGIN +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i; +INSERT 20 +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i; +INSERT 30 +3: ABORT; +ABORT +3: BEGIN; +BEGIN +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 40) i; +INSERT 40 +4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i; +INSERT 50 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +3: COMMIT; +COMMIT +4: COMMIT; +COMMIT +DELETE FROM ao_blkdir_test_rowcount WHERE j = 7; +DELETE 50 + +CREATE INDEX ao_blkdir_test_rowcount_idx ON ao_blkdir_test_rowcount(i); +CREATE + +SELECT segno, sum(row_count) AS totalrows FROM (SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno; + segno | totalrows +-------+----------- + 1 | 10 + 2 | 20 + 3 | 40 + 4 | 50 +(4 rows) +SELECT segno, sum(tupcount) AS totalrows FROM gp_toolkit.__gp_aoseg('ao_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno; + segno | totalrows +-------+----------- + 1 | 10 + 2 | 20 + 3 | 40 + 4 | 50 +(4 rows) + +-- Case2: with VACUUM ANALYZE +DROP TABLE ao_blkdir_test_rowcount; +DROP +CREATE TABLE ao_blkdir_test_rowcount(i int, j int) USING ao_row DISTRIBUTED BY (j); +CREATE +CREATE INDEX ao_blkdir_test_rowcount_idx ON ao_blkdir_test_rowcount(i); +CREATE +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +4: BEGIN; +BEGIN +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM ao_blkdir_test_rowcount; +INSERT 10 +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM ao_blkdir_test_rowcount; +INSERT 20 +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i; +INSERT 20 +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM ao_blkdir_test_rowcount; +INSERT 20 +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM ao_blkdir_test_rowcount; +INSERT 40 +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM 
generate_series(1, 30) i;
+INSERT 30
+3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM ao_blkdir_test_rowcount;
+INSERT 30
+3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM ao_blkdir_test_rowcount;
+INSERT 60
+4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i;
+INSERT 50
+4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM ao_blkdir_test_rowcount;
+INSERT 50
+4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM ao_blkdir_test_rowcount;
+INSERT 100
+1: COMMIT;
+COMMIT
+2: COMMIT;
+COMMIT
+3: ABORT;
+ABORT
+4: COMMIT;
+COMMIT
+
+DELETE FROM ao_blkdir_test_rowcount WHERE j = 7;
+DELETE 200
+VACUUM ANALYZE ao_blkdir_test_rowcount;
+VACUUM
+
+SELECT segno, sum(row_count) AS totalrows FROM (SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno;
+ segno | totalrows
+-------+-----------
+ 1     | 40
+ 2     | 80
+(2 rows)
+SELECT segno, sum(tupcount) AS totalrows FROM gp_toolkit.__gp_aoseg('ao_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno;
+ segno | totalrows
+-------+-----------
+ 1     | 40
+ 2     | 80
+ 3     | 0
+ 4     | 0
+(4 rows)
+
+UPDATE ao_blkdir_test_rowcount SET i = i + 1;
+UPDATE 120
+VACUUM ANALYZE ao_blkdir_test_rowcount;
+VACUUM
+
+SELECT segno, sum(row_count) AS totalrows FROM (SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno;
+ segno | totalrows
+-------+-----------
+ 3     | 120
+(1 row)
+SELECT segno, sum(tupcount) AS totalrows FROM gp_toolkit.__gp_aoseg('ao_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno;
+ segno | totalrows
+-------+-----------
+ 1     | 0
+ 2     | 0
+ 3     | 120
+ 4     | 0
+(4 rows)
+
+DROP TABLE ao_blkdir_test_rowcount;
+DROP
+
 --------------------------------------------------------------------------------
 -- AOCO tables
 --------------------------------------------------------------------------------
@@ -825,3 +974,172 @@ DETAIL:  Key (i)=(2) already exists.
 
 DROP TABLE aoco_blkdir_test;
 DROP
+
+-- Test `tupcount` in pg_ao(cs)seg == sum of number of `row_count` across all
+-- aoblkdir entries for each <segno, columngroup_no>. Test with commits, aborts
+-- and deletes.
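
The invariant exercised by these rowcount tests is what block-directory based ANALYZE relies on: for every <segno, columngroup_no> pair, the row_count values of all aoblkdir entries must add up to the segment's tupcount. A minimal C sketch of that check, with hypothetical inputs (the tests here verify the same thing in SQL through the gp_toolkit views):

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch only: do the block directory row counts for one
     * <segno, columngroup_no> add up to the aoseg tupcount?
     */
    static bool
    blkdir_rowcounts_match_tupcount(const int64_t *entry_row_counts,
                                    int nentries, int64_t aoseg_tupcount)
    {
        int64_t total = 0;

        for (int i = 0; i < nentries; i++)
            total += entry_row_counts[i];

        return total == aoseg_tupcount;
    }
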
+ +-- Case1: without VACUUM ANALYZE +CREATE TABLE aoco_blkdir_test_rowcount(i int, j int) USING ao_column DISTRIBUTED BY (j); +CREATE +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +4: BEGIN; +BEGIN +1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i; +INSERT 20 +3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i; +INSERT 30 +3: ABORT; +ABORT +3: BEGIN; +BEGIN +3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 40) i; +INSERT 40 +4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i; +INSERT 50 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +3: COMMIT; +COMMIT +4: COMMIT; +COMMIT +DELETE FROM aoco_blkdir_test_rowcount WHERE j = 7; +DELETE 50 + +CREATE INDEX aoco_blkdir_test_rowcount_idx ON aoco_blkdir_test_rowcount(i); +CREATE + +SELECT segno, columngroup_no, sum(row_count) AS totalrows FROM (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno, columngroup_no; + segno | columngroup_no | totalrows +-------+----------------+----------- + 1 | 0 | 10 + 1 | 1 | 10 + 2 | 0 | 20 + 2 | 1 | 20 + 3 | 0 | 40 + 3 | 1 | 40 + 4 | 0 | 50 + 4 | 1 | 50 +(8 rows) +SELECT segno, column_num, sum(tupcount) AS totalrows FROM gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno, column_num; + segno | column_num | totalrows +-------+------------+----------- + 1 | 0 | 10 + 1 | 1 | 10 + 2 | 0 | 20 + 2 | 1 | 20 + 3 | 0 | 40 + 3 | 1 | 40 + 4 | 0 | 50 + 4 | 1 | 50 +(8 rows) + +-- Case2: with VACUUM ANALYZE +DROP TABLE aoco_blkdir_test_rowcount; +DROP +CREATE TABLE aoco_blkdir_test_rowcount(i int, j int) USING ao_column DISTRIBUTED BY (j); +CREATE +CREATE INDEX aoco_blkdir_test_rowcount_idx ON aoco_blkdir_test_rowcount(i); +CREATE +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +4: BEGIN; +BEGIN +1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM aoco_blkdir_test_rowcount; +INSERT 10 +1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM aoco_blkdir_test_rowcount; +INSERT 20 +2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i; +INSERT 20 +2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM aoco_blkdir_test_rowcount; +INSERT 20 +2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM aoco_blkdir_test_rowcount; +INSERT 40 +3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i; +INSERT 30 +3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM aoco_blkdir_test_rowcount; +INSERT 30 +3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM aoco_blkdir_test_rowcount; +INSERT 60 +4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i; +INSERT 50 +4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM aoco_blkdir_test_rowcount; +INSERT 50 +4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM aoco_blkdir_test_rowcount; +INSERT 100 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +3: ABORT; +ABORT +4: COMMIT; +COMMIT + +DELETE FROM aoco_blkdir_test_rowcount WHERE j = 7; +DELETE 200 +VACUUM ANALYZE aoco_blkdir_test_rowcount; +VACUUM + +SELECT segno, columngroup_no, sum(row_count) AS totalrows FROM (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).* 
FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno, columngroup_no; + segno | columngroup_no | totalrows +-------+----------------+----------- + 1 | 0 | 40 + 1 | 1 | 40 + 2 | 0 | 80 + 2 | 1 | 80 +(4 rows) +SELECT segno, column_num, sum(tupcount) AS totalrows FROM gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno, column_num; + segno | column_num | totalrows +-------+------------+----------- + 1 | 0 | 40 + 1 | 1 | 40 + 2 | 0 | 80 + 2 | 1 | 80 + 3 | 0 | 0 + 3 | 1 | 0 + 4 | 0 | 0 + 4 | 1 | 0 +(8 rows) + +UPDATE aoco_blkdir_test_rowcount SET i = i + 1; +UPDATE 120 +VACUUM ANALYZE aoco_blkdir_test_rowcount; +VACUUM + +SELECT segno, columngroup_no, sum(row_count) AS totalrows FROM (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno, columngroup_no; + segno | columngroup_no | totalrows +-------+----------------+----------- + 3 | 0 | 120 + 3 | 1 | 120 +(2 rows) +SELECT segno, column_num, sum(tupcount) AS totalrows FROM gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno, column_num; + segno | column_num | totalrows +-------+------------+----------- + 1 | 0 | 0 + 1 | 1 | 0 + 2 | 0 | 0 + 2 | 1 | 0 + 3 | 0 | 120 + 3 | 1 | 120 + 4 | 0 | 0 + 4 | 1 | 0 +(8 rows) + +DROP TABLE aoco_blkdir_test_rowcount; +DROP diff --git a/src/test/isolation2/input/uao/fast_analyze.source b/src/test/isolation2/input/uao/fast_analyze.source new file mode 100644 index 00000000000..962ba600f75 --- /dev/null +++ b/src/test/isolation2/input/uao/fast_analyze.source @@ -0,0 +1,349 @@ +-- +-- Test AO/CO sampling method. +-- +-- These tests ensure that we achieve our ANALYZE targets for AO/CO tables. +-- +CREATE TABLE fast_analyze_@amname@_1(i int, j int) USING @amname@ DISTRIBUTED BY (j); + +-- Stats target info shows that we will sample 300 * (100) rows. +SHOW default_statistics_target; +SELECT attstattarget FROM pg_attribute + WHERE attrelid = 'fast_analyze_@amname@_1'::regclass AND attname IN ('i', 'j'); + +-------------------------------------------------------------------------------- +-- Scenario 1: +-- We have MORE than 300 * default_statistics_target = 30k rows for a 2 int table, +-- spread across 3 segments, with no aborted rows [2 subcases -> blkdir and +-- non-blkdir]. +-- Expectation: We have collected 30k live rows. +-------------------------------------------------------------------------------- + +-- (a) Without blkdir subcase + +-- Insert 10.5k rows in each QE. +1: BEGIN; +2: BEGIN; +3: BEGIN; +1: INSERT INTO fast_analyze_@amname@_1 SELECT i, 2 FROM generate_series(1, 10500) i; +2: INSERT INTO fast_analyze_@amname@_1 SELECT i, 1 FROM generate_series(1, 10500) i; +3: INSERT INTO fast_analyze_@amname@_1 SELECT i, 5 FROM generate_series(1, 10500) i; +1: COMMIT; +2: COMMIT; +3: COMMIT; + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_1; + +-- We have sampled 10k live rows. 
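
Each of those samples is a 0-based target in the flattened row-number space described by the header comments in this patch, mapped back to a (segfile, rownum) pair. A minimal sketch of that mapping, assuming per-segfile tupcounts are known up front; the names are illustrative, not the patch's actual functions:

    #include <stdbool.h>
    #include <stdint.h>

    typedef struct
    {
        int     segidx;         /* index into the segfile array */
        int64_t rownum;         /* 1-based row number within that segfile */
    } FlatRowPos;

    /* Map a 0-based flattened row number to (segfile, rownum). */
    static bool
    flatrow_to_segpos(const int64_t *tupcounts, int nsegs,
                      int64_t targrow, FlatRowPos *pos)
    {
        int64_t segfirstrow = 0;    /* flattened number of the current segfile's first row */

        for (int i = 0; i < nsegs; i++)
        {
            if (targrow < segfirstrow + tupcounts[i])
            {
                pos->segidx = i;
                pos->rownum = targrow - segfirstrow + 1;
                return true;
            }
            segfirstrow += tupcounts[i];
        }
        return false;               /* targrow lies beyond the relation */
    }

With tupcounts {100, 200}, targrow = 150 lands in segfile 1 at rownum 51, matching the example in the header comments.
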
+SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_1(i); + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_1; + +-- We have sampled 10k live rows from each QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-------------------------------------------------------------------------------- +-- Scenario 2: +-- We have LESS than 300 * default_statistics_target = 30k rows for a 2 int table, +-- spread across 3 segments, with no aborted rows [2 subcases -> blkdir and +-- non-blkdir]. +-- Expectation: We have collected number of live rows = total tupcount of table. +-------------------------------------------------------------------------------- + +CREATE TABLE fast_analyze_@amname@_2(i int, j int) USING @amname@ DISTRIBUTED BY (j); + +-- (a) Without blkdir subcase + +-- Insert 10 rows in each QE. +1: BEGIN; +2: BEGIN; +3: BEGIN; +1: INSERT INTO fast_analyze_@amname@_2 SELECT i, 2 FROM generate_series(1, 10) i; +2: INSERT INTO fast_analyze_@amname@_2 SELECT i, 1 FROM generate_series(1, 10) i; +3: INSERT INTO fast_analyze_@amname@_2 SELECT i, 5 FROM generate_series(1, 10) i; +1: COMMIT; +2: COMMIT; +3: COMMIT; + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_2; + +-- We have sampled 10 live rows from each QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_2(i); + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_2; + +-- We have sampled 10 live rows from each QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-------------------------------------------------------------------------------- +-- Scenario 3: +-- We have ALL aborted rows [2 subcases -> blkdir and non-blkdir]. +-- Expectation: We have not sampled any live rows. +-------------------------------------------------------------------------------- + +CREATE TABLE fast_analyze_@amname@_3(i int, j int) USING @amname@ DISTRIBUTED BY (j); + +-- (a) Without blkdir subcase + +-- Insert 10 rows in each QE. 
+1: BEGIN; +2: BEGIN; +3: BEGIN; +1: INSERT INTO fast_analyze_@amname@_3 SELECT i, 2 FROM generate_series(1, 10) i; +2: INSERT INTO fast_analyze_@amname@_3 SELECT i, 1 FROM generate_series(1, 10) i; +3: INSERT INTO fast_analyze_@amname@_3 SELECT i, 5 FROM generate_series(1, 10) i; +1: ABORT; +2: ABORT; +3: ABORT; + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_3; + +-- We have not sampled any live rows on any QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_3(i); + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_3; + +-- We have not sampled any live rows on any QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-------------------------------------------------------------------------------- +-- Scenario 4: +-- We have ALL deleted rows [2 subcases -> blkdir and non-blkdir]. +-- Expectation: We have not collected any live rows. +-------------------------------------------------------------------------------- + +CREATE TABLE fast_analyze_@amname@_4(i int, j int) USING @amname@ DISTRIBUTED BY (j); + +-- (a) Without blkdir subcase + +-- Insert 10 rows in each QE. +1: BEGIN; +2: BEGIN; +3: BEGIN; +1: INSERT INTO fast_analyze_@amname@_4 SELECT i, 2 FROM generate_series(1, 10) i; +2: INSERT INTO fast_analyze_@amname@_4 SELECT i, 1 FROM generate_series(1, 10) i; +3: INSERT INTO fast_analyze_@amname@_4 SELECT i, 5 FROM generate_series(1, 10) i; +1: COMMIT; +2: COMMIT; +3: COMMIT; +-- Delete all rows. +DELETE FROM fast_analyze_@amname@_4; +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_4; + +-- We have not sampled any live rows on any QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_4(i); + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +ANALYZE fast_analyze_@amname@_4; + +-- We have not sampled any live rows on any QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) + FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + +DROP TABLE fast_analyze_@amname@_1; +DROP TABLE fast_analyze_@amname@_2; +DROP TABLE fast_analyze_@amname@_3; +DROP TABLE fast_analyze_@amname@_4; + +-- +-- The following tests ensure fast analyze function to work +-- with multi-segfiles tables under concurrent inserts. 
+-- + +create table analyze_@amname@ (id int, a int, b inet, c inet) using @amname@ with (compresstype=zlib, compresslevel=3); + +insert into analyze_@amname@ select 2, i, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' || + (i%255)::text))::inet, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' || + (i%255)::text))::inet from generate_series(1,30000)i; + +insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +-- test ANALYZE after concurrent inserts commit + +1: begin; +1: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +2: begin; +2: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +3: begin; +3: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +4: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +5: analyze analyze_@amname@; + +1: commit; +2: commit; +3: abort; + +1: analyze analyze_@amname@; + +-- test aoblkdir based ANALYZE + +create index on analyze_@amname@(id); + +1: begin; +1: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +2: begin; +2: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +3: begin; +3: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +4: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; + +5: analyze analyze_@amname@; + +1: commit; +2: commit; +3: abort; + +1: analyze analyze_@amname@; + +drop table analyze_@amname@; + +-- test more data and stability, note, it could take a little long time + +create table analyze_@amname@_2 (id int, a int, b inet, c inet) using @amname@ with (compresstype=zlib, compresslevel=3); +insert into analyze_@amname@_2 select 2, i, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' || + (i%255)::text))::inet, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' 
|| + (i%255)::text))::inet from generate_series(1,1000)i; + +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +1: begin; +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +1: commit; + +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; + +1: begin; +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +1: abort; + +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; + +-- test with aoblkdir + +create index on analyze_@amname@_2(a); + +1: begin; +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +1: commit; + +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; + +1: begin; +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; + +1: abort; + +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; +1: analyze analyze_@amname@_2; + +drop table analyze_@amname@_2; diff --git a/src/test/isolation2/isolation2_schedule b/src/test/isolation2/isolation2_schedule index 8931e46a5fc..aec0005e8d8 100644 --- a/src/test/isolation2/isolation2_schedule +++ b/src/test/isolation2/isolation2_schedule @@ -148,6 +148,7 @@ test: uao/limit_indexscan_inits_row test: uao/test_pg_appendonly_version_row # Refer to the case comment for why it is commented out. # test: uao/bad_buffer_on_temp_ao_row +test: uao/fast_analyze_row test: uao/create_index_allows_readonly_row test: reorganize_after_ao_vacuum_skip_drop truncate_after_ao_vacuum_skip_drop mark_all_aoseg_await_drop @@ -209,6 +210,7 @@ test: uao/test_pg_appendonly_version_column # Refer to the case comment for why it is commented out. # test: uao/bad_buffer_on_temp_ao_column test: uao/create_index_allows_readonly_column +test: uao/fast_analyze_column # this case contains fault injection, must be put in a separate test group test: terminate_in_gang_creation diff --git a/src/test/isolation2/output/uao/fast_analyze.source b/src/test/isolation2/output/uao/fast_analyze.source new file mode 100644 index 00000000000..7f6a133e18b --- /dev/null +++ b/src/test/isolation2/output/uao/fast_analyze.source @@ -0,0 +1,632 @@ +-- +-- Test AO/CO sampling method. +-- +-- These tests ensure that we achieve our ANALYZE targets for AO/CO tables. +-- +CREATE TABLE fast_analyze_@amname@_1(i int, j int) USING @amname@ DISTRIBUTED BY (j); +CREATE + +-- Stats target info shows that we will sample 300 * (100) rows. 
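
That is, targrows = 300 * default_statistics_target = 300 * 100 = 30000 rows per relation. The row sampler named in this patch's comments, Algorithm S, is Knuth's sequential selection sampling: scan the N candidate rows once and keep each with probability (picks remaining)/(rows remaining), which selects exactly targrows of them whenever N >= targrows. A self-contained sketch of the technique, not the patch's RowSampler code (which keeps int64 state and its own seed):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Knuth's Algorithm S: emit n uniformly chosen row numbers out of N,
     * in increasing order. Sketch only.
     */
    static void
    sample_rows(int64_t N, int64_t n)
    {
        int64_t picked = 0;

        for (int64_t row = 0; row < N && picked < n; row++)
        {
            double  u = (double) rand() / ((double) RAND_MAX + 1.0);

            /* keep this row with probability (n - picked) / (N - row) */
            if ((double) (N - row) * u < (double) (n - picked))
            {
                printf("targrow %lld\n", (long long) row);
                picked++;
            }
        }
    }
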
+SHOW default_statistics_target; + default_statistics_target +--------------------------- + 100 +(1 row) +SELECT attstattarget FROM pg_attribute WHERE attrelid = 'fast_analyze_@amname@_1'::regclass AND attname IN ('i', 'j'); + attstattarget +--------------- + -1 + -1 +(2 rows) + +-------------------------------------------------------------------------------- +-- Scenario 1: +-- We have MORE than 300 * default_statistics_target = 30k rows for a 2 int table, +-- spread across 3 segments, with no aborted rows [2 subcases -> blkdir and +-- non-blkdir]. +-- Expectation: We have collected 30k live rows. +-------------------------------------------------------------------------------- + +-- (a) Without blkdir subcase + +-- Insert 10.5k rows in each QE. +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +1: INSERT INTO fast_analyze_@amname@_1 SELECT i, 2 FROM generate_series(1, 10500) i; +INSERT 10500 +2: INSERT INTO fast_analyze_@amname@_1 SELECT i, 1 FROM generate_series(1, 10500) i; +INSERT 10500 +3: INSERT INTO fast_analyze_@amname@_1 SELECT i, 5 FROM generate_series(1, 10500) i; +INSERT 10500 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +3: COMMIT; +COMMIT + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_1; +ANALYZE + +-- We have sampled 10k live rows. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10000' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10000' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10000' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_1(i); +CREATE + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_1; +ANALYZE + +-- We have sampled 10k live rows from each QE. 
+SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10000' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10000' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10000' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-------------------------------------------------------------------------------- +-- Scenario 2: +-- We have LESS than 300 * default_statistics_target = 30k rows for a 2 int table, +-- spread across 3 segments, with no aborted rows [2 subcases -> blkdir and +-- non-blkdir]. +-- Expectation: We have collected number of live rows = total tupcount of table. +-------------------------------------------------------------------------------- + +CREATE TABLE fast_analyze_@amname@_2(i int, j int) USING @amname@ DISTRIBUTED BY (j); +CREATE + +-- (a) Without blkdir subcase + +-- Insert 10 rows in each QE. +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +1: INSERT INTO fast_analyze_@amname@_2 SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +2: INSERT INTO fast_analyze_@amname@_2 SELECT i, 1 FROM generate_series(1, 10) i; +INSERT 10 +3: INSERT INTO fast_analyze_@amname@_2 SELECT i, 5 FROM generate_series(1, 10) i; +INSERT 10 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +3: COMMIT; +COMMIT + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_2; +ANALYZE + +-- We have sampled 10 live rows from each QE. 
+SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_2(i); +CREATE + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_2; +ANALYZE + +-- We have sampled 10 live rows from each QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'triggered' num times hit:'10' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-------------------------------------------------------------------------------- +-- Scenario 3: +-- We have ALL aborted rows [2 subcases -> blkdir and non-blkdir]. +-- Expectation: We have not sampled any live rows. +-------------------------------------------------------------------------------- + +CREATE TABLE fast_analyze_@amname@_3(i int, j int) USING @amname@ DISTRIBUTED BY (j); +CREATE + +-- (a) Without blkdir subcase + +-- Insert 10 rows in each QE. 
+1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +1: INSERT INTO fast_analyze_@amname@_3 SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +2: INSERT INTO fast_analyze_@amname@_3 SELECT i, 1 FROM generate_series(1, 10) i; +INSERT 10 +3: INSERT INTO fast_analyze_@amname@_3 SELECT i, 5 FROM generate_series(1, 10) i; +INSERT 10 +1: ABORT; +ABORT +2: ABORT; +ABORT +3: ABORT; +ABORT + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_3; +ANALYZE + +-- We have not sampled any live rows on any QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_3(i); +CREATE + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_3; +ANALYZE + +-- We have not sampled any live rows on any QE. 
+SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-------------------------------------------------------------------------------- +-- Scenario 4: +-- We have ALL deleted rows [2 subcases -> blkdir and non-blkdir]. +-- Expectation: We have not collected any live rows. +-------------------------------------------------------------------------------- + +CREATE TABLE fast_analyze_@amname@_4(i int, j int) USING @amname@ DISTRIBUTED BY (j); +CREATE + +-- (a) Without blkdir subcase + +-- Insert 10 rows in each QE. +1: BEGIN; +BEGIN +2: BEGIN; +BEGIN +3: BEGIN; +BEGIN +1: INSERT INTO fast_analyze_@amname@_4 SELECT i, 2 FROM generate_series(1, 10) i; +INSERT 10 +2: INSERT INTO fast_analyze_@amname@_4 SELECT i, 1 FROM generate_series(1, 10) i; +INSERT 10 +3: INSERT INTO fast_analyze_@amname@_4 SELECT i, 5 FROM generate_series(1, 10) i; +INSERT 10 +1: COMMIT; +COMMIT +2: COMMIT; +COMMIT +3: COMMIT; +COMMIT +-- Delete all rows. +DELETE FROM fast_analyze_@amname@_4; +DELETE 30 +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_4; +ANALYZE + +-- We have not sampled any live rows on any QE. 
+SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +-- (b) With blkdir subcase + +CREATE INDEX ON fast_analyze_@amname@_4(i); +CREATE + +SELECT gp_inject_fault_infinite('returned_sample_row', 'skip', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault_infinite +-------------------------- + Success: + Success: + Success: +(3 rows) + +ANALYZE fast_analyze_@amname@_4; +ANALYZE + +-- We have not sampled any live rows on any QE. +SELECT gp_inject_fault('returned_sample_row', 'status', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + + Success: fault name:'returned_sample_row' fault type:'skip' ddl statement:'' database name:'' table name:'' start occurrence:'1' end occurrence:'-1' extra arg:'0' fault injection state:'set' num times hit:'0' + +(3 rows) + +SELECT gp_inject_fault('returned_sample_row', 'reset', dbid) FROM gp_segment_configuration WHERE content != -1 AND role = 'p'; + gp_inject_fault +----------------- + Success: + Success: + Success: +(3 rows) + +DROP TABLE fast_analyze_@amname@_1; +DROP +DROP TABLE fast_analyze_@amname@_2; +DROP +DROP TABLE fast_analyze_@amname@_3; +DROP +DROP TABLE fast_analyze_@amname@_4; +DROP + +-- +-- The following tests ensure fast analyze function to work +-- with multi-segfiles tables under concurrent inserts. +-- + +create table analyze_@amname@ (id int, a int, b inet, c inet) using @amname@ with (compresstype=zlib, compresslevel=3); +CREATE + +insert into analyze_@amname@ select 2, i, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text))::inet, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' 
|| (i%255)::text))::inet from generate_series(1,30000)i; +INSERT 30000 + +insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 +insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +-- test ANALYZE after concurrent inserts commit + +1: begin; +BEGIN +1: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +2: begin; +BEGIN +2: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +3: begin; +BEGIN +3: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +4: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +5: analyze analyze_@amname@; +ANALYZE + +1: commit; +COMMIT +2: commit; +COMMIT +3: abort; +ABORT + +1: analyze analyze_@amname@; +ANALYZE + +-- test aoblkdir based ANALYZE + +create index on analyze_@amname@(id); +CREATE + +1: begin; +BEGIN +1: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +2: begin; +BEGIN +2: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +3: begin; +BEGIN +3: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +4: insert into analyze_@amname@ select * from analyze_@amname@ limit 1000; +INSERT 1000 + +5: analyze analyze_@amname@; +ANALYZE + +1: commit; +COMMIT +2: commit; +COMMIT +3: abort; +ABORT + +1: analyze analyze_@amname@; +ANALYZE + +drop table analyze_@amname@; +DROP + +-- test more data and stability, note, it could take a little long time + +create table analyze_@amname@_2 (id int, a int, b inet, c inet) using @amname@ with (compresstype=zlib, compresslevel=3); +CREATE +insert into analyze_@amname@_2 select 2, i, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text))::inet, (select ((i%255)::text || '.' || (i%255)::text || '.' || (i%255)::text || '.' 
|| (i%255)::text))::inet from generate_series(1,1000)i; +INSERT 1000 + +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 1000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 2000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 4000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 8000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 16000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 32000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 64000 +insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 128000 + +1: begin; +BEGIN +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 256000 + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 256000 + +1: commit; +COMMIT + +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE + +1: begin; +BEGIN +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 768000 + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 768000 + +1: abort; +ABORT + +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE + +-- test with aoblkdir + +create index on analyze_@amname@_2(a); +CREATE + +1: begin; +BEGIN +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 1536000 + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 1536000 + +1: commit; +COMMIT + +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE + +1: begin; +BEGIN +1: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 4608000 + +2: insert into analyze_@amname@_2 select * from analyze_@amname@_2; +INSERT 4608000 + +1: abort; +ABORT + +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE +1: analyze analyze_@amname@_2; +ANALYZE + +drop table analyze_@amname@_2; +DROP diff --git a/src/test/isolation2/sql/ao_blkdir.sql b/src/test/isolation2/sql/ao_blkdir.sql index cd8f1b662d0..cff529876c1 100644 --- a/src/test/isolation2/sql/ao_blkdir.sql +++ b/src/test/isolation2/sql/ao_blkdir.sql @@ -96,6 +96,81 @@ WHERE gp_segment_id = 0 ORDER BY 1,2,3,4,5; DROP TABLE ao_blkdir_test; +-- Test `tupcount` in pg_aoseg == sum of number of `row_count` across all +-- aoblkdir entries for each segno. Test with commits, aborts and deletes. 
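
This rowcount invariant is also what lets the ANALYZE path translate a flattened target row into a block directory entry: walking a segfile's entries in order and accumulating row_count finds the entry that covers the target, and the rownum inside it falls out by offset. An illustrative sketch against a hypothetical in-memory entry array (the actual AOBlkDirScan_GetRowNum() walks aoblkdir minipages through an ordered systable scan):

    #include <stdint.h>

    typedef struct
    {
        int64_t firstRowNum;    /* first rownum covered by this entry */
        int64_t rowCount;       /* number of rows covered by this entry */
    } BlkDirEntrySketch;

    /*
     * Sketch only: return the rownum for the 0-based flattened row
     * `targrow`; *startrow holds the flattened number of the first row
     * covered by entries[] and is advanced past the entries consumed.
     * Returns -1 when the entries do not cover targrow.
     */
    static int64_t
    rownum_for_targrow(const BlkDirEntrySketch *entries, int nentries,
                       int64_t targrow, int64_t *startrow)
    {
        for (int i = 0; i < nentries; i++)
        {
            if (targrow < *startrow + entries[i].rowCount)
                return entries[i].firstRowNum + (targrow - *startrow);

            *startrow += entries[i].rowCount;
        }
        return -1;
    }
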
+ +-- Case1: without VACUUM ANALYZE +CREATE TABLE ao_blkdir_test_rowcount(i int, j int) USING ao_row DISTRIBUTED BY (j); +1: BEGIN; +2: BEGIN; +3: BEGIN; +4: BEGIN; +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i; +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i; +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i; +3: ABORT; +3: BEGIN; +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 40) i; +4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i; +1: COMMIT; +2: COMMIT; +3: COMMIT; +4: COMMIT; +DELETE FROM ao_blkdir_test_rowcount WHERE j = 7; + +CREATE INDEX ao_blkdir_test_rowcount_idx ON ao_blkdir_test_rowcount(i); + +SELECT segno, sum(row_count) AS totalrows FROM + (SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno; +SELECT segno, sum(tupcount) AS totalrows FROM + gp_toolkit.__gp_aoseg('ao_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno; + +-- Case2: with VACUUM ANALYZE +DROP TABLE ao_blkdir_test_rowcount; +CREATE TABLE ao_blkdir_test_rowcount(i int, j int) USING ao_row DISTRIBUTED BY (j); +CREATE INDEX ao_blkdir_test_rowcount_idx ON ao_blkdir_test_rowcount(i); +1: BEGIN; +2: BEGIN; +3: BEGIN; +4: BEGIN; +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i; +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM ao_blkdir_test_rowcount; +1: INSERT INTO ao_blkdir_test_rowcount SELECT i, 2 FROM ao_blkdir_test_rowcount; +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i; +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM ao_blkdir_test_rowcount; +2: INSERT INTO ao_blkdir_test_rowcount SELECT i, 3 FROM ao_blkdir_test_rowcount; +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i; +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM ao_blkdir_test_rowcount; +3: INSERT INTO ao_blkdir_test_rowcount SELECT i, 4 FROM ao_blkdir_test_rowcount; +4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i; +4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM ao_blkdir_test_rowcount; +4: INSERT INTO ao_blkdir_test_rowcount SELECT i, 7 FROM ao_blkdir_test_rowcount; +1: COMMIT; +2: COMMIT; +3: ABORT; +4: COMMIT; + +DELETE FROM ao_blkdir_test_rowcount WHERE j = 7; +VACUUM ANALYZE ao_blkdir_test_rowcount; + +SELECT segno, sum(row_count) AS totalrows FROM + (SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno; +SELECT segno, sum(tupcount) AS totalrows FROM + gp_toolkit.__gp_aoseg('ao_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno; + +UPDATE ao_blkdir_test_rowcount SET i = i + 1; +VACUUM ANALYZE ao_blkdir_test_rowcount; + +SELECT segno, sum(row_count) AS totalrows FROM + (SELECT (gp_toolkit.__gp_aoblkdir('ao_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno; +SELECT segno, sum(tupcount) AS totalrows FROM + gp_toolkit.__gp_aoseg('ao_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno; + +DROP TABLE ao_blkdir_test_rowcount; + -------------------------------------------------------------------------------- -- AOCO tables -------------------------------------------------------------------------------- @@ 
-203,3 +278,79 @@ FROM gp_segment_configuration WHERE role = 'p' AND content = 0;
 4: INSERT INTO aoco_blkdir_test VALUES (2, 2);
 DROP TABLE aoco_blkdir_test;
+
+-- Test `tupcount` in pg_ao(cs)seg == sum of `row_count` across all
+-- aoblkdir entries for each (segno, columngroup_no) pair. Test with commits,
+-- aborts and deletes.
+
+-- Case1: without VACUUM ANALYZE
+CREATE TABLE aoco_blkdir_test_rowcount(i int, j int) USING ao_column DISTRIBUTED BY (j);
+1: BEGIN;
+2: BEGIN;
+3: BEGIN;
+4: BEGIN;
+1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i;
+2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i;
+3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i;
+3: ABORT;
+3: BEGIN;
+3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 40) i;
+4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i;
+1: COMMIT;
+2: COMMIT;
+3: COMMIT;
+4: COMMIT;
+DELETE FROM aoco_blkdir_test_rowcount WHERE j = 7;
+
+CREATE INDEX aoco_blkdir_test_rowcount_idx ON aoco_blkdir_test_rowcount(i);
+
+SELECT segno, columngroup_no, sum(row_count) AS totalrows FROM
+  (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id')
+   WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno, columngroup_no;
+SELECT segno, column_num, sum(tupcount) AS totalrows FROM
+  gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno, column_num;
+
+-- Case2: with VACUUM ANALYZE
+DROP TABLE aoco_blkdir_test_rowcount;
+CREATE TABLE aoco_blkdir_test_rowcount(i int, j int) USING ao_column DISTRIBUTED BY (j);
+CREATE INDEX aoco_blkdir_test_rowcount_idx ON aoco_blkdir_test_rowcount(i);
+1: BEGIN;
+2: BEGIN;
+3: BEGIN;
+4: BEGIN;
+1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM generate_series(1, 10) i;
+1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM aoco_blkdir_test_rowcount;
+1: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 2 FROM aoco_blkdir_test_rowcount;
+2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM generate_series(1, 20) i;
+2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM aoco_blkdir_test_rowcount;
+2: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 3 FROM aoco_blkdir_test_rowcount;
+3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM generate_series(1, 30) i;
+3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM aoco_blkdir_test_rowcount;
+3: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 4 FROM aoco_blkdir_test_rowcount;
+4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM generate_series(1, 50) i;
+4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM aoco_blkdir_test_rowcount;
+4: INSERT INTO aoco_blkdir_test_rowcount SELECT i, 7 FROM aoco_blkdir_test_rowcount;
+1: COMMIT;
+2: COMMIT;
+3: ABORT;
+4: COMMIT;
+
+DELETE FROM aoco_blkdir_test_rowcount WHERE j = 7;
+VACUUM ANALYZE aoco_blkdir_test_rowcount;
+
+SELECT segno, columngroup_no, sum(row_count) AS totalrows FROM
+  (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id')
+   WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno, columngroup_no;
+SELECT segno, column_num, sum(tupcount) AS totalrows FROM
+  gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno, column_num;
+
+UPDATE aoco_blkdir_test_rowcount SET i = i + 1;
+VACUUM ANALYZE aoco_blkdir_test_rowcount;
+
+SELECT segno, columngroup_no, sum(row_count) AS totalrows
FROM + (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).* FROM gp_dist_random('gp_id') + WHERE gp_segment_id = 0)s GROUP BY segno, columngroup_no ORDER BY segno, columngroup_no; +SELECT segno, column_num, sum(tupcount) AS totalrows FROM + gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount') WHERE segment_id = 0 GROUP BY segno, column_num; + +DROP TABLE aoco_blkdir_test_rowcount;
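+
+-- For reference, the per-column form of the invariant verified above can be
+-- phrased as a single self-checking query (a sketch only, not executed by
+-- this test; it would have to run before the DROP TABLE above, and should
+-- return zero rows when pg_aocsseg and the block directory agree for every
+-- (segno, column) pair):
+--
+--   SELECT b.segno, b.columngroup_no, b.blkdir_rows, s.seg_rows
+--   FROM (SELECT segno, columngroup_no, sum(row_count) AS blkdir_rows
+--           FROM (SELECT (gp_toolkit.__gp_aoblkdir('aoco_blkdir_test_rowcount')).*
+--                   FROM gp_dist_random('gp_id') WHERE gp_segment_id = 0) d
+--          GROUP BY segno, columngroup_no) b
+--   JOIN (SELECT segno, column_num, sum(tupcount) AS seg_rows
+--           FROM gp_toolkit.__gp_aocsseg('aoco_blkdir_test_rowcount')
+--          WHERE segment_id = 0 GROUP BY segno, column_num) s
+--     ON b.segno = s.segno AND b.columngroup_no = s.column_num
+--   WHERE b.blkdir_rows <> s.seg_rows;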