timescaledb/tsl/src/hypercore/hypercore_handler.c
Alexander Kuzmenkov fc827c154a
Prepare for clang-tidy-18 (#7213)
Mostly the complaints about implicit casts of multi-level pointers. Not
enabling it yet in the CI because there are some complicated warnings
left.

Loader changes are cosmetic.
2024-11-18 13:06:52 +00:00

/*
* This file and its contents are licensed under the Timescale License.
* Please see the included NOTICE for copyright information and
* LICENSE-TIMESCALE for a copy of the license.
*/
#include <postgres.h>
#include <access/attnum.h>
#include <access/heapam.h>
#include <access/hio.h>
#include <access/rewriteheap.h>
#include <access/sdir.h>
#include <access/skey.h>
#include <access/tableam.h>
#include <access/transam.h>
#include <access/xact.h>
#include <catalog/heap.h>
#include <catalog/index.h>
#include <catalog/pg_attribute.h>
#include <catalog/storage.h>
#include <commands/progress.h>
#include <commands/vacuum.h>
#include <common/relpath.h>
#include <executor/tuptable.h>
#include <nodes/bitmapset.h>
#include <nodes/execnodes.h>
#include <nodes/makefuncs.h>
#include <nodes/nodes.h>
#include <nodes/parsenodes.h>
#include <nodes/plannodes.h>
#include <optimizer/optimizer.h>
#include <optimizer/pathnode.h>
#include <optimizer/plancat.h>
#include <parser/parsetree.h>
#include <pgstat.h>
#include <postgres_ext.h>
#include <storage/block.h>
#include <storage/buf.h>
#include <storage/bufmgr.h>
#include <storage/itemptr.h>
#include <storage/lmgr.h>
#include <storage/lockdefs.h>
#include <storage/off.h>
#include <storage/procarray.h>
#include <utils/builtins.h>
#include <utils/elog.h>
#include <utils/hsearch.h>
#include <utils/lsyscache.h>
#include <utils/memutils.h>
#include <utils/palloc.h>
#include <utils/rel.h>
#include <utils/sampling.h>
#include <utils/syscache.h>
#include <utils/tuplesort.h>
#include <utils/typcache.h>
#include <math.h>
#include "arrow_array.h"
#include "arrow_cache.h"
#include "arrow_tts.h"
#include "compression/api.h"
#include "compression/compression.h"
#include "compression/create.h"
#include "debug_assert.h"
#include "guc.h"
#include "hypercore_handler.h"
#include "process_utility.h"
#include "relstats.h"
#include "trigger.h"
#include "ts_catalog/array_utils.h"
#include "ts_catalog/catalog.h"
#include "ts_catalog/compression_chunk_size.h"
#include "ts_catalog/compression_settings.h"
#if PG17_GE
#include "import/analyze.h"
#endif
static const TableAmRoutine hypercore_methods;
static void convert_to_hypercore_finish(Oid relid);
static List *partially_compressed_relids = NIL; /* Relids that need to have
* updated status set at end of
* transaction */
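/* Total allocation size for a HypercoreInfo, including the per-column
* compression settings array for a relation with `natts` attributes. */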
#define HYPERCORE_AM_INFO_SIZE(natts) \
(sizeof(HypercoreInfo) + (sizeof(ColumnCompressionSettings) * (natts)))
static int32
get_chunk_id_from_relid(Oid relid)
{
int32 chunk_id;
Oid nspid = get_rel_namespace(relid);
const char *schema = get_namespace_name(nspid);
const char *relname = get_rel_name(relid);
ts_chunk_get_id(schema, relname, &chunk_id, false);
return chunk_id;
}
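/*
* Return the relid of the compressed chunk associated with the given chunk
* relid, or InvalidOid if it cannot be resolved.
*/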
static Oid
chunk_get_compressed_chunk_relid(Oid relid)
{
FormData_chunk fd;
if (!ts_chunk_simple_scan_by_reloid(relid, &fd, true))
return InvalidOid;
return ts_chunk_get_relid(fd.compressed_chunk_id, true);
}
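/*
* Temporarily switch a hypercore relation to the regular heap table access
* method.
*
* Returns the original (hypercore) AM routine so that the caller can restore
* it when done. Typical usage, as seen throughout this file (illustrative
* sketch; `some_callback` stands in for any heap AM callback):
*
*   const TableAmRoutine *oldtam = switch_to_heapam(rel);
*   rel->rd_tableam->some_callback(rel, ...);
*   rel->rd_tableam = oldtam;
*/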
static const TableAmRoutine *
switch_to_heapam(Relation rel)
{
const TableAmRoutine *tableam = rel->rd_tableam;
Assert(tableam == hypercore_routine());
rel->rd_tableam = GetHeapamTableAmRoutine();
return tableam;
}
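/*
* Create the "vacuum proxy" index on the compressed chunk.
*
* The index uses the hypercore_proxy index access method over the compressed
* count metadata column and acts as a proxy for vacuum-related operations on
* the compressed relation.
*/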
static void
create_proxy_vacuum_index(Relation rel, Oid compressed_relid)
{
Oid compressed_namespaceid = get_rel_namespace(compressed_relid);
char *compressed_namespace = get_namespace_name(compressed_namespaceid);
char *compressed_relname = get_rel_name(compressed_relid);
IndexElem elem = {
.type = T_IndexElem,
.name = COMPRESSION_COLUMN_METADATA_COUNT_NAME,
.indexcolname = NULL,
};
IndexStmt stmt = {
.type = T_IndexStmt,
.accessMethod = "hypercore_proxy",
.idxcomment = "Hypercore vacuum proxy index",
.idxname = psprintf("%s_ts_hypercore_proxy_idx", compressed_relname),
.indexParams = list_make1(&elem),
.relation = makeRangeVar(compressed_namespace, compressed_relname, -1),
};
DefineIndexCompat(compressed_relid,
&stmt,
InvalidOid,
InvalidOid,
InvalidOid,
-1,
false,
false,
false,
false,
true);
}
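/*
* Record compression size statistics for a chunk by inserting an entry in
* the compression_chunk_size catalog with the relation sizes before and
* after compression, along with row counts.
*/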
static void
create_compression_relation_size_stats(int32 chunk_id, Oid relid, int32 compress_chunk_id,
Oid compress_relid, RelationSize *before_size,
int64 num_rows_pre, int64 num_rows_post,
int64 num_rows_frozen)
{
RelationSize after_size = ts_relation_size_impl(compress_relid);
compression_chunk_size_catalog_insert(chunk_id,
before_size,
compress_chunk_id,
&after_size,
num_rows_pre,
num_rows_post,
num_rows_frozen);
}
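/*
* Build the HypercoreInfo for a relation.
*
* The info is allocated in CacheMemoryContext so that it can be stored in
* rel->rd_amcache. If the chunk has no compressed chunk yet, one is created
* here (optionally with constraints, triggers, the vacuum proxy index, and
* size stats). Finally, the per-column compression settings (segmentby and
* orderby membership and the corresponding compressed-relation attribute
* numbers) are filled in.
*/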
static HypercoreInfo *
lazy_build_hypercore_info_cache(Relation rel, bool create_chunk_constraints,
bool *compressed_relation_created)
{
Assert(OidIsValid(rel->rd_id) && !ts_is_hypertable(rel->rd_id));
HypercoreInfo *hsinfo;
CompressionSettings *settings;
TupleDesc tupdesc = RelationGetDescr(rel);
/* Anything put in rel->rd_amcache must be a single memory chunk
* palloc'd in CacheMemoryContext since PostgreSQL expects to be able
* to free it with a single pfree(). */
hsinfo = MemoryContextAllocZero(CacheMemoryContext, HYPERCORE_AM_INFO_SIZE(tupdesc->natts));
hsinfo->relation_id = get_chunk_id_from_relid(rel->rd_id);
hsinfo->compressed_relid = InvalidOid;
hsinfo->num_columns = tupdesc->natts;
hsinfo->hypertable_id = ts_chunk_get_hypertable_id_by_reloid(rel->rd_id);
FormData_chunk form = ts_chunk_get_formdata(hsinfo->relation_id);
hsinfo->compressed_relation_id = form.compressed_chunk_id;
/* Create the compressed chunk if it does not already exist and, if
* requested, report whether it was created. */
if (compressed_relation_created)
*compressed_relation_created = (hsinfo->compressed_relation_id == INVALID_CHUNK_ID);
if (hsinfo->compressed_relation_id == INVALID_CHUNK_ID)
{
/* Consider making it simpler to create the compressed table by just
* treating it as a normal side-relation with no strong connection to the
* original chunk. We do not need constraints, foreign keys, or any other
* things on this table since it never participates in any plans. */
Chunk *chunk = ts_chunk_get_by_relid(rel->rd_id, true);
Hypertable *ht = ts_hypertable_get_by_id(chunk->fd.hypertable_id);
Hypertable *ht_compressed = ts_hypertable_get_by_id(ht->fd.compressed_hypertable_id);
if (NULL == ht_compressed)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("hypertable \"%s\" is missing compression settings",
NameStr(ht->fd.table_name)),
errhint("Enable compression on the hypertable.")));
Chunk *c_chunk = create_compress_chunk(ht_compressed, chunk, InvalidOid);
hsinfo->compressed_relation_id = c_chunk->fd.id;
ts_chunk_set_compressed_chunk(chunk, c_chunk->fd.id);
if (create_chunk_constraints)
{
ts_chunk_constraints_create(ht_compressed, c_chunk);
ts_trigger_create_all_on_chunk(c_chunk);
create_proxy_vacuum_index(rel, c_chunk->table_id);
RelationSize before_size = ts_relation_size_impl(RelationGetRelid(rel));
create_compression_relation_size_stats(hsinfo->relation_id,
RelationGetRelid(rel),
hsinfo->compressed_relation_id,
c_chunk->table_id,
&before_size,
0,
0,
0);
}
}
hsinfo->compressed_relid = ts_chunk_get_relid(hsinfo->compressed_relation_id, false);
hsinfo->count_cattno =
get_attnum(hsinfo->compressed_relid, COMPRESSION_COLUMN_METADATA_COUNT_NAME);
Assert(hsinfo->compressed_relation_id > 0 && OidIsValid(hsinfo->compressed_relid));
Assert(hsinfo->count_cattno != InvalidAttrNumber);
settings = ts_compression_settings_get(hsinfo->compressed_relid);
Ensure(settings,
"no compression settings for relation %s",
get_rel_name(RelationGetRelid(rel)));
for (int i = 0; i < hsinfo->num_columns; i++)
{
const Form_pg_attribute attr = &tupdesc->attrs[i];
ColumnCompressionSettings *colsettings = &hsinfo->columns[i];
if (attr->attisdropped)
{
colsettings->attnum = InvalidAttrNumber;
colsettings->cattnum = InvalidAttrNumber;
colsettings->is_dropped = true;
continue;
}
const char *attname = NameStr(attr->attname);
int segmentby_pos = ts_array_position(settings->fd.segmentby, attname);
int orderby_pos = ts_array_position(settings->fd.orderby, attname);
namestrcpy(&colsettings->attname, attname);
colsettings->attnum = attr->attnum;
colsettings->typid = attr->atttypid;
colsettings->is_segmentby = segmentby_pos > 0;
colsettings->is_orderby = orderby_pos > 0;
if (OidIsValid(hsinfo->compressed_relid))
colsettings->cattnum = get_attnum(hsinfo->compressed_relid, attname);
else
colsettings->cattnum = InvalidAttrNumber;
}
Ensure(hsinfo->relation_id > 0, "invalid chunk ID");
return hsinfo;
}
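/*
* Get the HypercoreInfo for a relation, building and caching it in
* rel->rd_amcache on first access.
*/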
HypercoreInfo *
RelationGetHypercoreInfo(Relation rel)
{
/*coverity[tainted_data_downcast : FALSE]*/
HypercoreInfo *info = rel->rd_amcache;
if (NULL == info)
info = rel->rd_amcache = lazy_build_hypercore_info_cache(rel, true, NULL);
Assert(info && OidIsValid(info->compressed_relid));
return info;
}
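/*
* Build bitmapsets of the (non-compressed) attribute numbers for the
* segmentby and orderby columns.
*/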
static void
build_segment_and_orderby_bms(const HypercoreInfo *hsinfo, Bitmapset **segmentby,
Bitmapset **orderby)
{
*segmentby = NULL;
*orderby = NULL;
for (int i = 0; i < hsinfo->num_columns; i++)
{
const ColumnCompressionSettings *colsettings = &hsinfo->columns[i];
if (colsettings->is_segmentby)
*segmentby = bms_add_member(*segmentby, colsettings->attnum);
if (colsettings->is_orderby)
*orderby = bms_add_member(*orderby, colsettings->attnum);
}
}
/* ------------------------------------------------------------------------
* Slot related callbacks for Hypercore
* ------------------------------------------------------------------------
*/
static const TupleTableSlotOps *
hypercore_slot_callbacks(Relation relation)
{
return &TTSOpsArrowTuple;
}
#define FEATURE_NOT_SUPPORTED \
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("%s not supported", __func__)))
#define pgstat_count_hypercore_scan(rel) pgstat_count_heap_scan(rel)
#define pgstat_count_hypercore_getnext(rel) pgstat_count_heap_getnext(rel)
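/* Parallel scan descriptor for Hypercore, holding one block-based parallel
* scan descriptor for each of the non-compressed and compressed relations. */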
typedef struct HypercoreParallelScanDescData
{
ParallelBlockTableScanDescData pscandesc;
ParallelBlockTableScanDescData cpscandesc;
} HypercoreParallelScanDescData;
typedef struct HypercoreParallelScanDescData *HypercoreParallelScanDesc;
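/* State machine for the two-relation scan: a scan normally starts on the
* compressed relation and switches to the non-compressed relation when the
* former is exhausted. */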
typedef enum HypercoreScanState
{
HYPERCORE_SCAN_START = 0,
HYPERCORE_SCAN_COMPRESSED = HYPERCORE_SCAN_START,
HYPERCORE_SCAN_NON_COMPRESSED = 1,
HYPERCORE_SCAN_DONE = 2,
} HypercoreScanState;
const char *scan_state_name[] = {
[HYPERCORE_SCAN_COMPRESSED] = "COMPRESSED",
[HYPERCORE_SCAN_NON_COMPRESSED] = "NON_COMPRESSED",
[HYPERCORE_SCAN_DONE] = "DONE",
};
typedef struct HypercoreScanDescData
{
TableScanDescData rs_base;
TableScanDesc uscan_desc; /* scan descriptor for non-compressed relation */
Relation compressed_rel;
TableScanDesc cscan_desc; /* scan descriptor for compressed relation */
int64 returned_noncompressed_count;
int64 returned_compressed_count;
int32 compressed_row_count;
HypercoreScanState hs_scan_state;
bool reset;
#if PG17_GE
/* These fields are only used for ANALYZE */
ReadStream *canalyze_read_stream;
ReadStream *uanalyze_read_stream;
#endif
} HypercoreScanDescData;
typedef struct HypercoreScanDescData *HypercoreScanDesc;
static bool hypercore_getnextslot_noncompressed(HypercoreScanDesc scan, ScanDirection direction,
TupleTableSlot *slot);
static bool hypercore_getnextslot_compressed(HypercoreScanDesc scan, ScanDirection direction,
TupleTableSlot *slot);
#if PG17_GE
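/*
* Compute the number of target rows to sample for ANALYZE, based on the
* minimum row requirements of the per-attribute statistics targets.
*/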
static int
compute_targrows(Relation rel)
{
MemoryContext analyze_context =
AllocSetContextCreate(CurrentMemoryContext, "Hypercore Analyze", ALLOCSET_DEFAULT_SIZES);
VacAttrStats **vacattrstats;
int attr_cnt = hypercore_analyze_compute_vacattrstats(rel, &vacattrstats, analyze_context);
int targrows = 100;
for (int i = 0; i < attr_cnt; i++)
{
if (targrows < vacattrstats[i]->minrows)
{
targrows = vacattrstats[i]->minrows;
}
}
MemoryContextDelete(analyze_context);
return targrows;
}
#endif
/*
* Initialization common for beginscan and rescan.
*/
static void
initscan(HypercoreScanDesc scan, ScanKey keys, int nkeys)
{
int nvalidkeys = 0;
/*
* Translate any scankeys to the corresponding scankeys on the compressed
* relation.
*
* It is only possible to use scankeys in the following two cases:
*
* 1. The scankey is for a segment_by column
* 2. The scankey is for a column that has min/max metadata (i.e., order_by column).
*
* TODO: Implement support for (2) above, which involves transforming a
* scankey to the corresponding min/max scankeys.
*/
if (NULL != keys && nkeys > 0)
{
const HypercoreInfo *hsinfo = RelationGetHypercoreInfo(scan->rs_base.rs_rd);
for (int i = 0; i < nkeys; i++)
{
const ScanKey key = &keys[i];
for (int j = 0; j < hsinfo->num_columns; j++)
{
const ColumnCompressionSettings *column = &hsinfo->columns[j];
if (column->is_segmentby && key->sk_attno == column->attnum)
{
scan->rs_base.rs_key[nvalidkeys] = *key;
/* Remap the attribute number to the corresponding
* compressed rel attribute number */
scan->rs_base.rs_key[nvalidkeys].sk_attno = column->cattnum;
nvalidkeys++;
break;
}
}
}
}
scan->rs_base.rs_nkeys = nvalidkeys;
/* Use the TableScanDescData's scankeys to store the transformed compression scan keys */
if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
pgstat_count_hypercore_scan(scan->rs_base.rs_rd);
}
#ifdef TS_DEBUG
static const char *
get_scan_type(uint32 flags)
{
if (flags & SO_TYPE_TIDSCAN)
return "TID";
#if PG14_GE
if (flags & SO_TYPE_TIDRANGESCAN)
return "TID range";
#endif
if (flags & SO_TYPE_BITMAPSCAN)
return "bitmap";
if (flags & SO_TYPE_SAMPLESCAN)
return "sample";
if (flags & SO_TYPE_ANALYZE)
return "analyze";
if (flags & SO_TYPE_SEQSCAN)
return "sequence";
return "unknown";
}
#endif
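/*
* Begin a scan of a hypercore relation.
*
* This starts two sub-scans: one of the non-compressed relation (via the
* heap AM) and one of the compressed relation. Any scankeys are remapped to
* the compressed relation in initscan(). For parallel scans, the two
* sub-scans use separate parallel scan descriptors from the
* HypercoreParallelScanDescData.
*/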
static TableScanDesc
hypercore_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey keys,
ParallelTableScanDesc parallel_scan, uint32 flags)
{
HypercoreScanDesc scan;
HypercoreParallelScanDesc cpscan = (HypercoreParallelScanDesc) parallel_scan;
RelationIncrementReferenceCount(relation);
TS_DEBUG_LOG("starting %s scan of relation %s parallel_scan=%p",
get_scan_type(flags),
RelationGetRelationName(relation),
parallel_scan);
scan = palloc0(sizeof(HypercoreScanDescData));
scan->rs_base.rs_rd = relation;
scan->rs_base.rs_snapshot = snapshot;
scan->rs_base.rs_nkeys = nkeys;
scan->rs_base.rs_key = nkeys > 0 ? palloc0(sizeof(ScanKeyData) * nkeys) : NULL;
scan->rs_base.rs_flags = flags;
scan->rs_base.rs_parallel = parallel_scan;
scan->returned_noncompressed_count = 0;
scan->returned_compressed_count = 0;
scan->compressed_row_count = 0;
scan->reset = true;
if (ts_is_hypertable(relation->rd_id))
{
/* If this is a hypertable, there is nothing for us to scan */
scan->hs_scan_state = HYPERCORE_SCAN_DONE;
return &scan->rs_base;
}
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(relation);
scan->compressed_rel = table_open(hsinfo->compressed_relid, AccessShareLock);
if ((ts_guc_enable_transparent_decompression == 2) ||
(keys && keys->sk_flags & SK_NO_COMPRESSED))
{
/*
* Don't read compressed data if transparent decompression is enabled
* or it is requested by the scan.
*
* Transparent decompression reads compressed data itself, directly
* from the compressed chunk, so avoid reading it again here.
*/
scan->hs_scan_state = HYPERCORE_SCAN_NON_COMPRESSED;
}
initscan(scan, keys, nkeys);
ParallelTableScanDesc ptscan =
parallel_scan ? (ParallelTableScanDesc) &cpscan->pscandesc : NULL;
const TableAmRoutine *oldtam = switch_to_heapam(relation);
scan->uscan_desc =
relation->rd_tableam->scan_begin(relation, snapshot, nkeys, keys, ptscan, flags);
relation->rd_tableam = oldtam;
if (parallel_scan)
{
/* Parallel workers use a serialized snapshot that they get from the
* coordinator. The snapshot will be marked as a temp snapshot so that
* endscan() knows to deregister it. However, if we pass the snapshot
* to both sub-scans marked as a temp snapshot it will be deregistered
* twice. Therefore remove the temp flag for the second scan. */
flags &= ~SO_TEMP_SNAPSHOT;
}
ParallelTableScanDesc cptscan =
parallel_scan ? (ParallelTableScanDesc) &cpscan->cpscandesc : NULL;
scan->cscan_desc = scan->compressed_rel->rd_tableam->scan_begin(scan->compressed_rel,
snapshot,
scan->rs_base.rs_nkeys,
scan->rs_base.rs_key,
cptscan,
flags);
return &scan->rs_base;
}
static void
hypercore_rescan(TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat,
bool allow_sync, bool allow_pagemode)
{
HypercoreScanDesc scan = (HypercoreScanDesc) sscan;
initscan(scan, key, scan->rs_base.rs_nkeys);
scan->reset = true;
scan->hs_scan_state = HYPERCORE_SCAN_START;
table_rescan(scan->cscan_desc, key);
Relation relation = scan->uscan_desc->rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(relation);
relation->rd_tableam
->scan_rescan(scan->uscan_desc, key, set_params, allow_strat, allow_sync, allow_pagemode);
relation->rd_tableam = oldtam;
}
static void
hypercore_endscan(TableScanDesc sscan)
{
HypercoreScanDesc scan = (HypercoreScanDesc) sscan;
RelationDecrementReferenceCount(sscan->rs_rd);
if (scan->cscan_desc)
table_endscan(scan->cscan_desc);
if (scan->compressed_rel)
table_close(scan->compressed_rel, AccessShareLock);
#if PG17_GE
if (scan->canalyze_read_stream)
read_stream_end(scan->canalyze_read_stream);
if (scan->uanalyze_read_stream)
read_stream_end(scan->uanalyze_read_stream);
#endif
Relation relation = sscan->rs_rd;
if (scan->uscan_desc)
{
const TableAmRoutine *oldtam = switch_to_heapam(relation);
relation->rd_tableam->scan_end(scan->uscan_desc);
relation->rd_tableam = oldtam;
}
TS_DEBUG_LOG("scanned " INT64_FORMAT " tuples (" INT64_FORMAT " compressed, " INT64_FORMAT
" noncompressed) in rel %s",
scan->returned_compressed_count + scan->returned_noncompressed_count,
scan->returned_compressed_count,
scan->returned_noncompressed_count,
RelationGetRelationName(sscan->rs_rd));
if (scan->rs_base.rs_key)
pfree(scan->rs_base.rs_key);
pfree(scan);
}
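/*
* Get the next tuple in the scan.
*
* First consume any remaining rows in the Arrow slot's current compressed
* tuple; once it is exhausted, fetch the next tuple from whichever relation
* the scan state currently points at.
*/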
static bool
hypercore_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
{
if (arrow_slot_try_getnext(slot, direction))
{
slot->tts_tableOid = RelationGetRelid(sscan->rs_rd);
return true;
}
HypercoreScanDesc scan = (HypercoreScanDesc) sscan;
TS_DEBUG_LOG("relid: %d, relation: %s, reset: %s, scan_state: %s",
sscan->rs_rd->rd_id,
get_rel_name(sscan->rs_rd->rd_id),
yes_no(scan->reset),
scan_state_name[scan->hs_scan_state]);
switch (scan->hs_scan_state)
{
case HYPERCORE_SCAN_DONE:
return false; /* Nothing more to scan */
case HYPERCORE_SCAN_NON_COMPRESSED:
return hypercore_getnextslot_noncompressed(scan, direction, slot);
case HYPERCORE_SCAN_COMPRESSED:
return hypercore_getnextslot_compressed(scan, direction, slot);
}
return false; /* To keep compiler happy */
}
static bool
hypercore_getnextslot_noncompressed(HypercoreScanDesc scan, ScanDirection direction,
TupleTableSlot *slot)
{
TupleTableSlot *child_slot = arrow_slot_get_noncompressed_slot(slot);
Relation relation = scan->rs_base.rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(relation);
bool result = relation->rd_tableam->scan_getnextslot(scan->uscan_desc, direction, child_slot);
relation->rd_tableam = oldtam;
if (result)
{
scan->returned_noncompressed_count++;
slot->tts_tableOid = RelationGetRelid(relation);
ExecStoreArrowTuple(slot, InvalidTupleIndex);
}
else if (direction == BackwardScanDirection)
{
scan->hs_scan_state = HYPERCORE_SCAN_COMPRESSED;
return hypercore_getnextslot(&scan->rs_base, direction, slot);
}
return result;
}
static bool
should_read_new_compressed_slot(TupleTableSlot *slot, ScanDirection direction)
{
/* Scans are never invoked with NoMovementScanDirection */
Assert(direction != NoMovementScanDirection);
/* A slot can be empty if we just started the scan (or moved back to the
* start due to a backward scan) */
if (TTS_EMPTY(slot))
return true;
if (direction == ForwardScanDirection)
{
if (arrow_slot_is_last(slot) || arrow_slot_is_consumed(slot))
return true;
}
else if (direction == BackwardScanDirection)
{
/* Check if the backward scan has reached the start of the slot's values */
if (arrow_slot_row_index(slot) <= 1)
return true;
}
return false;
}
static bool
hypercore_getnextslot_compressed(HypercoreScanDesc scan, ScanDirection direction,
TupleTableSlot *slot)
{
TupleTableSlot *child_slot =
arrow_slot_get_compressed_slot(slot, RelationGetDescr(scan->compressed_rel));
if (scan->reset || should_read_new_compressed_slot(slot, direction))
{
scan->reset = false;
if (!table_scan_getnextslot(scan->cscan_desc, direction, child_slot))
{
ExecClearTuple(slot);
if (direction == ForwardScanDirection)
{
scan->hs_scan_state = HYPERCORE_SCAN_NON_COMPRESSED;
return hypercore_getnextslot(&scan->rs_base, direction, slot);
}
else
{
Assert(direction == BackwardScanDirection);
return false;
}
}
Assert(ItemPointerIsValid(&child_slot->tts_tid));
ExecStoreArrowTuple(slot, direction == ForwardScanDirection ? 1 : MaxTupleIndex);
scan->compressed_row_count = arrow_slot_total_row_count(slot);
}
else if (direction == ForwardScanDirection)
{
ExecStoreNextArrowTuple(slot);
}
else
{
Assert(direction == BackwardScanDirection);
ExecStorePreviousArrowTuple(slot);
}
slot->tts_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
scan->returned_compressed_count++;
pgstat_count_hypercore_getnext(scan->rs_base.rs_rd);
return true;
}
static Size
hypercore_parallelscan_estimate(Relation rel)
{
return sizeof(HypercoreParallelScanDescData);
}
/*
* Initialize ParallelTableScanDesc for a parallel scan of this relation.
* `pscan` will be sized according to parallelscan_estimate() for the same
* relation.
*/
static Size
hypercore_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan)
{
HypercoreParallelScanDesc cpscan = (HypercoreParallelScanDesc) pscan;
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
table_block_parallelscan_initialize(rel, (ParallelTableScanDesc) &cpscan->pscandesc);
rel->rd_tableam = oldtam;
Relation crel = table_open(hsinfo->compressed_relid, AccessShareLock);
table_block_parallelscan_initialize(crel, (ParallelTableScanDesc) &cpscan->cpscandesc);
table_close(crel, NoLock);
return sizeof(HypercoreParallelScanDescData);
}
/*
* Reinitialize `pscan` for a new scan. `rel` will be the same relation as
* when `pscan` was initialized by parallelscan_initialize.
*/
static void
hypercore_parallelscan_reinitialize(Relation rel, ParallelTableScanDesc pscan)
{
HypercoreParallelScanDesc cpscan = (HypercoreParallelScanDesc) pscan;
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
table_block_parallelscan_reinitialize(rel, (ParallelTableScanDesc) &cpscan->pscandesc);
rel->rd_tableam = oldtam;
Relation crel = table_open(hsinfo->compressed_relid, AccessShareLock);
table_block_parallelscan_reinitialize(crel, (ParallelTableScanDesc) &cpscan->cpscandesc);
table_close(crel, NoLock);
}
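/*
* A hypercore TID either references a row in the non-compressed relation
* directly, or is a "compressed TID" that encodes both the TID of a
* compressed (segment) tuple and the index of a row within that segment.
* hypercore_tid_encode() and hypercore_tid_decode() convert between the two
* forms, and is_compressed_tid() tells them apart.
*/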
static void
hypercore_get_latest_tid(TableScanDesc sscan, ItemPointer tid)
{
HypercoreScanDesc scan = (HypercoreScanDesc) sscan;
if (is_compressed_tid(tid))
{
ItemPointerData decoded_tid;
uint16 tuple_index = hypercore_tid_decode(&decoded_tid, tid);
const Relation rel = scan->cscan_desc->rs_rd;
rel->rd_tableam->tuple_get_latest_tid(scan->cscan_desc, &decoded_tid);
hypercore_tid_encode(tid, &decoded_tid, tuple_index);
}
else
{
const Relation rel = scan->uscan_desc->rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(rel);
rel->rd_tableam->tuple_get_latest_tid(scan->uscan_desc, tid);
rel->rd_tableam = oldtam;
}
}
static void
hypercore_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid,
int options, BulkInsertStateData *bistate)
{
/* Inserts only supported in non-compressed relation, so simply forward to the heap AM */
const TableAmRoutine *oldtam = switch_to_heapam(relation);
relation->rd_tableam->multi_insert(relation, slots, ntuples, cid, options, bistate);
relation->rd_tableam = oldtam;
MemoryContext oldmcxt = MemoryContextSwitchTo(CurTransactionContext);
partially_compressed_relids =
list_append_unique_oid(partially_compressed_relids, RelationGetRelid(relation));
MemoryContextSwitchTo(oldmcxt);
}
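/* Tri-state value caching whether the current index scan uses an index over
* only segmentby columns. See is_segmentby_index_scan(). */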
enum SegmentbyIndexStatus
{
SEGMENTBY_INDEX_UNKNOWN = -1,
SEGMENTBY_INDEX_FALSE = 0,
SEGMENTBY_INDEX_TRUE = 1,
};
typedef struct IndexFetchComprData
{
IndexFetchTableData h_base; /* AM independent part of the descriptor */
IndexFetchTableData *compr_hscan;
IndexFetchTableData *uncompr_hscan;
Relation compr_rel;
ItemPointerData tid;
int64 num_decompressions;
uint64 return_count;
enum SegmentbyIndexStatus segindex;
bool call_again; /* Used to remember the previous value of call_again in
* index_fetch_tuple */
bool internal_call_again; /* Call again passed on to compressed heap */
} IndexFetchComprData;
/* ------------------------------------------------------------------------
* Index Scan Callbacks for Hypercore
* ------------------------------------------------------------------------
*/
static IndexFetchTableData *
hypercore_index_fetch_begin(Relation rel)
{
IndexFetchComprData *cscan = palloc0(sizeof(IndexFetchComprData));
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
Relation crel = table_open(hsinfo->compressed_relid, AccessShareLock);
cscan->segindex = SEGMENTBY_INDEX_UNKNOWN;
cscan->return_count = 0;
cscan->h_base.rel = rel;
cscan->compr_rel = crel;
cscan->compr_hscan = crel->rd_tableam->index_fetch_begin(crel);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
cscan->uncompr_hscan = rel->rd_tableam->index_fetch_begin(rel);
rel->rd_tableam = oldtam;
ItemPointerSetInvalid(&cscan->tid);
return &cscan->h_base;
}
static void
hypercore_index_fetch_reset(IndexFetchTableData *scan)
{
IndexFetchComprData *cscan = (IndexFetchComprData *) scan;
Relation rel = scan->rel;
/* There is no need to reset segindex since there is no change in indexes
* used for the index scan when resetting the scan, but we need to reset
* the tid since we are restarting an index scan. */
ItemPointerSetInvalid(&cscan->tid);
cscan->compr_rel->rd_tableam->index_fetch_reset(cscan->compr_hscan);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
rel->rd_tableam->index_fetch_reset(cscan->uncompr_hscan);
rel->rd_tableam = oldtam;
}
static void
hypercore_index_fetch_end(IndexFetchTableData *scan)
{
IndexFetchComprData *cscan = (IndexFetchComprData *) scan;
Relation rel = scan->rel;
Relation crel = cscan->compr_rel;
crel->rd_tableam->index_fetch_end(cscan->compr_hscan);
table_close(crel, AccessShareLock);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
rel->rd_tableam->index_fetch_end(cscan->uncompr_hscan);
rel->rd_tableam = oldtam;
pfree(cscan);
}
/*
* Check if the index scan is only on segmentby columns.
*
* To identify a segmentby index scan (an index scan only using segmentby
* columns), it is necessary to know the columns (attributes) indexed by the
* index. Unfortunately, the TAM does not have access to information about the
* index being scanned, so this information is instead captured at the start
* of the scan (using the executor start hook) and stored in the
* ArrowTupleTableSlot.
*
* For EXPLAINs (without ANALYZE), the index attributes in the slot might not
* be set because the index is never really opened. For such a case, when
* nothing is actually scanned, it is OK to return "false" even though the
* query is using a segmentby index.
*
* Since the columns scanned by an index scan do not change during a scan,
* we cache this information to avoid re-computing it each time.
*/
static inline bool
is_segmentby_index_scan(IndexFetchComprData *cscan, TupleTableSlot *slot)
{
enum SegmentbyIndexStatus segindex = cscan->segindex;
if (segindex == SEGMENTBY_INDEX_UNKNOWN)
{
ArrowTupleTableSlot *aslot = (ArrowTupleTableSlot *) slot;
const HypercoreInfo *hsinfo = RelationGetHypercoreInfo(cscan->h_base.rel);
int16 attno = -1;
if (bms_is_empty(aslot->index_attrs))
segindex = SEGMENTBY_INDEX_FALSE;
else
{
/* True unless we discover that some attribute in the index is not a
* segmentby column */
segindex = SEGMENTBY_INDEX_TRUE;
while ((attno = bms_next_member(aslot->index_attrs, attno)) >= 0)
if (!hsinfo->columns[AttrNumberGetAttrOffset(attno)].is_segmentby)
{
segindex = SEGMENTBY_INDEX_FALSE;
break;
}
}
cscan->segindex = segindex;
}
/* To avoid a warning, we compare with the enum value. */
Assert(segindex == SEGMENTBY_INDEX_TRUE || segindex == SEGMENTBY_INDEX_FALSE);
return (segindex == SEGMENTBY_INDEX_TRUE);
}
/*
* Return tuple for given TID via index scan.
*
* An index scan calls this function to fetch the "heap" tuple with the given
* TID from the index.
*
* The TID points to a tuple either in the regular (non-compressed) or the
* compressed relation. The data is fetched from the identified relation.
*
* If the index only indexes segmentby column(s), the index is itself
* "compressed" and there is only one TID per compressed segment/tuple. In
* that case, the "call_again" parameter is used to make sure the index scan
* calls this function until all the rows in a compressed tuple are
* returned. This "unwrapping" only happens in the case of segmentby indexes.
*/
static bool
hypercore_index_fetch_tuple(struct IndexFetchTableData *scan, ItemPointer tid, Snapshot snapshot,
TupleTableSlot *slot, bool *call_again, bool *all_dead)
{
IndexFetchComprData *cscan = (IndexFetchComprData *) scan;
TupleTableSlot *child_slot;
Relation rel = scan->rel;
Relation crel = cscan->compr_rel;
ItemPointerData decoded_tid;
if (!is_compressed_tid(tid))
{
child_slot = arrow_slot_get_noncompressed_slot(slot);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
bool result = rel->rd_tableam->index_fetch_tuple(cscan->uncompr_hscan,
tid,
snapshot,
child_slot,
call_again,
all_dead);
rel->rd_tableam = oldtam;
if (result)
{
slot->tts_tableOid = RelationGetRelid(scan->rel);
ExecStoreArrowTuple(slot, InvalidTupleIndex);
}
cscan->return_count++;
return result;
}
/* Compressed tuples are not visible through this TAM when transparent
* decompression is enabled, since DecompressChunk already scans that
* data. */
if (ts_guc_enable_transparent_decompression == 2)
return false;
bool is_segmentby_index = is_segmentby_index_scan(cscan, slot);
/* Fast path for segmentby index scans. If the compressed tuple is still
* being consumed, just increment the tuple index and return. */
if (is_segmentby_index && cscan->call_again)
{
ExecStoreNextArrowTuple(slot);
slot->tts_tableOid = RelationGetRelid(scan->rel);
cscan->call_again = !arrow_slot_is_last(slot);
*call_again = cscan->call_again || cscan->internal_call_again;
cscan->return_count++;
return true;
}
/* Recreate the original TID for the compressed table */
uint16 tuple_index = hypercore_tid_decode(&decoded_tid, tid);
Assert(tuple_index != InvalidTupleIndex);
child_slot = arrow_slot_get_compressed_slot(slot, RelationGetDescr(cscan->compr_rel));
/*
* Avoid decompression if the new TID from the index points to the same
* compressed tuple as the previous call to this function.
*
* There are cases, however, where the index scan jumps between the same
* compressed tuples to get the right order, which will lead to
* decompressing the same compressed tuple multiple times. This happens,
* for example, when there's a segmentby column and orderby on
* time. Returning data in time order requires interleaving rows from two
* or more compressed tuples with different segmentby values. It is
* possible to optimize that case further by retaining a window/cache of
* decompressed tuples, keyed on TID.
*/
if (!TTS_EMPTY(child_slot) && !TTS_EMPTY(slot) && ItemPointerIsValid(&cscan->tid) &&
ItemPointerEquals(&cscan->tid, &decoded_tid))
{
/* Still in the same compressed tuple, so just update tuple index and
* return the same Arrow slot */
ExecStoreArrowTuple(slot, tuple_index);
slot->tts_tableOid = RelationGetRelid(scan->rel);
cscan->return_count++;
return true;
}
bool result = crel->rd_tableam->index_fetch_tuple(cscan->compr_hscan,
&decoded_tid,
snapshot,
child_slot,
&cscan->internal_call_again,
all_dead);
if (result)
{
slot->tts_tableOid = RelationGetRelid(scan->rel);
ExecStoreArrowTuple(slot, tuple_index);
/* Save the current compressed TID */
ItemPointerCopy(&decoded_tid, &cscan->tid);
cscan->num_decompressions++;
if (is_segmentby_index)
{
Assert(tuple_index == 1);
cscan->call_again = !arrow_slot_is_last(slot);
*call_again = cscan->call_again || cscan->internal_call_again;
}
cscan->return_count++;
}
return result;
}
/* ------------------------------------------------------------------------
* Callbacks for non-modifying operations on individual tuples for Hypercore
* ------------------------------------------------------------------------
*/
static bool
hypercore_fetch_row_version(Relation relation, ItemPointer tid, Snapshot snapshot,
TupleTableSlot *slot)
{
bool result;
uint16 tuple_index = InvalidTupleIndex;
if (!is_compressed_tid(tid))
{
/*
* For non-compressed tuples, we fetch the tuple and copy it into the
* destination slot.
*
* We need to have a new slot for the call since the heap AM expects a
* BufferHeap TTS and we cannot pass down our Arrow TTS.
*/
TupleTableSlot *child_slot = arrow_slot_get_noncompressed_slot(slot);
const TableAmRoutine *oldtam = switch_to_heapam(relation);
result = relation->rd_tableam->tuple_fetch_row_version(relation, tid, snapshot, child_slot);
relation->rd_tableam = oldtam;
}
else
{
ItemPointerData decoded_tid;
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(relation);
Relation child_rel = table_open(hsinfo->compressed_relid, AccessShareLock);
TupleTableSlot *child_slot =
arrow_slot_get_compressed_slot(slot, RelationGetDescr(child_rel));
tuple_index = hypercore_tid_decode(&decoded_tid, tid);
result = table_tuple_fetch_row_version(child_rel, &decoded_tid, snapshot, child_slot);
table_close(child_rel, NoLock);
}
if (result)
{
slot->tts_tableOid = RelationGetRelid(relation);
ExecStoreArrowTuple(slot, tuple_index);
}
return result;
}
static bool
hypercore_tuple_tid_valid(TableScanDesc scan, ItemPointer tid)
{
HypercoreScanDescData *cscan = (HypercoreScanDescData *) scan;
ItemPointerData ctid;
if (!is_compressed_tid(tid))
{
Relation rel = scan->rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(rel);
bool valid = rel->rd_tableam->tuple_tid_valid(cscan->uscan_desc, tid);
rel->rd_tableam = oldtam;
return valid;
}
(void) hypercore_tid_decode(&ctid, tid);
return cscan->compressed_rel->rd_tableam->tuple_tid_valid(cscan->cscan_desc, &ctid);
}
static bool
hypercore_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot, Snapshot snapshot)
{
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
bool result;
if (is_compressed_tid(&slot->tts_tid))
{
Relation crel = table_open(hsinfo->compressed_relid, AccessShareLock);
TupleTableSlot *child_slot = arrow_slot_get_compressed_slot(slot, NULL);
result = crel->rd_tableam->tuple_satisfies_snapshot(crel, child_slot, snapshot);
table_close(crel, AccessShareLock);
}
else
{
TupleTableSlot *child_slot = arrow_slot_get_noncompressed_slot(slot);
const TableAmRoutine *oldtam = switch_to_heapam(rel);
result = rel->rd_tableam->tuple_satisfies_snapshot(rel, child_slot, snapshot);
rel->rd_tableam = oldtam;
}
return result;
}
/*
* Determine which index tuples are safe to delete.
*
* The Index AM asks the Table AM about which given index tuples (as
* referenced by TID) are safe to delete. Given that the array of TIDs to
* delete ("delTIDs") may reference either the compressed or non-compressed
* relation within Hypercore, it is necessary to split the information in the
* TM_IndexDeleteOp in two: one for each relation. Then the operation can be
* relayed to the standard heapAM method to do the heavy lifting for each
* relation.
*
* In order to call the heapAM method on the compressed relation, it is
* necessary to first "decode" the compressed TIDs to "normal" TIDs that
* reference compressed tuples. A complication, however, is that multiple
* distinct "compressed" TIDs may decode to the same TID, i.e., they reference
* the same compressed tuple in the TAM's compressed relation, and the heapAM
* method for index_delete_tuples() expects only unique TIDs. Therefore, it is
* necessary to deduplicate TIDs before calling the heapAM method on the
* compressed relation and then restore the result array of decoded delTIDs
* after the method returns. Note that the returned delTID array might be
* smaller than the input delTID array since only the TIDs that are safe to
* delete should remain. Thus, if a decoded TID is not safe to delete, then
* all compressed TIDs that reference that compressed tuple are also not safe
* to delete.
*/
static TransactionId
hypercore_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
{
TM_IndexDeleteOp noncompr_delstate = *delstate;
TM_IndexDeleteOp compr_delstate = *delstate;
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
/* Hash table setup for TID deduplication */
typedef struct TidEntry
{
ItemPointerData tid;
List *tuple_indexes;
List *status_indexes;
} TidEntry;
struct HASHCTL hctl = {
.keysize = sizeof(ItemPointerData),
.entrysize = sizeof(TidEntry),
.hcxt = CurrentMemoryContext,
};
unsigned int total_knowndeletable_compressed = 0;
unsigned int total_knowndeletable_non_compressed = 0;
/*
* Setup separate TM_IndexDeleteOPs for the compressed and non-compressed
* relations. Note that it is OK to reference the original status array
* because it is accessed via the "id" index in the TM_IndexDelete struct,
* so it doesn't need the same length and order as the deltids array. This
* is because the deltids array is going to be sorted during processing
* anyway so the "same-array-index" mappings for the status and deltids
* arrays will be lost in any case.
*/
noncompr_delstate.deltids = palloc(sizeof(TM_IndexDelete) * delstate->ndeltids);
noncompr_delstate.ndeltids = 0;
compr_delstate.deltids = palloc(sizeof(TM_IndexDelete) * delstate->ndeltids);
compr_delstate.ndeltids = 0;
/* Hash table to deduplicate compressed TIDs that point to the same
* compressed tuple */
HTAB *tidhash = hash_create("IndexDelete deduplication",
delstate->ndeltids,
&hctl,
HASH_ELEM | HASH_CONTEXT | HASH_BLOBS);
/*
* Stage 1: preparation.
*
* Split the deltids array based on the two relations and deduplicate
* compressed TIDs at the same time. When deduplicating, it is necessary
* to "remember" the lost information when decoding (e.g., index into a
* compressed tuple).
*/
for (int i = 0; i < delstate->ndeltids; i++)
{
const TM_IndexDelete *deltid = &delstate->deltids[i];
const TM_IndexStatus *status = &delstate->status[deltid->id];
/* If this is a compressed TID, decode and deduplicate
* first. Otherwise just add to the non-compressed deltids array */
if (is_compressed_tid(&deltid->tid))
{
ItemPointerData decoded_tid;
bool found;
TidEntry *tidentry;
uint16 tuple_index;
tuple_index = hypercore_tid_decode(&decoded_tid, &deltid->tid);
tidentry = hash_search(tidhash, &decoded_tid, HASH_ENTER, &found);
if (status->knowndeletable)
total_knowndeletable_compressed++;
if (!found)
{
/* Add to compressed IndexDelete array */
TM_IndexDelete *deltid_compr = &compr_delstate.deltids[compr_delstate.ndeltids];
deltid_compr->id = deltid->id;
ItemPointerCopy(&decoded_tid, &deltid_compr->tid);
/* Remember the information for the compressed TID so that the
* deltids array can be restored later */
tidentry->tuple_indexes = list_make1_int(tuple_index);
tidentry->status_indexes = list_make1_int(deltid->id);
compr_delstate.ndeltids++;
}
else
{
/* Duplicate TID, so just append info that needs to be remembered */
tidentry->tuple_indexes = lappend_int(tidentry->tuple_indexes, tuple_index);
tidentry->status_indexes = lappend_int(tidentry->status_indexes, deltid->id);
}
}
else
{
TM_IndexDelete *deltid_noncompr =
&noncompr_delstate.deltids[noncompr_delstate.ndeltids];
*deltid_noncompr = *deltid;
noncompr_delstate.ndeltids++;
if (status->knowndeletable)
total_knowndeletable_non_compressed++;
}
}
Assert((total_knowndeletable_non_compressed + total_knowndeletable_compressed) > 0 ||
delstate->bottomup);
/*
* Stage 2: call heapAM method for each relation and recreate the deltids
* array with the result.
*
* The heapAM method implements various assumptions and asserts around the
* contents of the deltids array depending on whether the index AM is
* doing simple index tuple deletion or bottom up deletion (as indicated
* by delstate->bottomup). For example, in the simple index deletion case,
* it seems the deltids array should have at least one known
* deletable entry or otherwise the heapAM might prune the array to zero
* length which leads to an assertion failure because it can only be zero
* length in the bottomup case. Since we split the original deltids array
* across the compressed and non-compressed relations, we might end up in
* a situation where we call one relation without any knowndeletable TIDs
* in the simple deletion case, leading to an assertion
* failure. Therefore, only call heapAM if there is at least one
* knowndeletable or we are doing bottomup deletion.
*
* Note, also, that the function should return the latestRemovedXid
* transaction ID, so we need to remember the XID from each call and then
* return the later of the two.
*/
TransactionId xid_noncompr = InvalidTransactionId;
TransactionId xid_compr = InvalidTransactionId;
/* Reset the deltids array before recreating it with the result */
delstate->ndeltids = 0;
if (noncompr_delstate.ndeltids > 0 &&
(total_knowndeletable_non_compressed > 0 || delstate->bottomup))
{
const TableAmRoutine *oldtam = switch_to_heapam(rel);
xid_noncompr = rel->rd_tableam->index_delete_tuples(rel, &noncompr_delstate);
rel->rd_tableam = oldtam;
memcpy(delstate->deltids,
noncompr_delstate.deltids,
noncompr_delstate.ndeltids * sizeof(TM_IndexDelete));
delstate->ndeltids = noncompr_delstate.ndeltids;
}
if (compr_delstate.ndeltids > 0 && (total_knowndeletable_compressed > 0 || delstate->bottomup))
{
/* Assume RowExclusivelock since this involves deleting tuples */
Relation compr_rel = table_open(hsinfo->compressed_relid, RowExclusiveLock);
xid_compr = compr_rel->rd_tableam->index_delete_tuples(compr_rel, &compr_delstate);
for (int i = 0; i < compr_delstate.ndeltids; i++)
{
const TM_IndexDelete *deltid_compr = &compr_delstate.deltids[i];
const TM_IndexStatus *status_compr = &delstate->status[deltid_compr->id];
ListCell *lc_id, *lc_tupindex;
TidEntry *tidentry;
bool found;
tidentry = hash_search(tidhash, &deltid_compr->tid, HASH_FIND, &found);
Assert(found);
forboth (lc_id, tidentry->status_indexes, lc_tupindex, tidentry->tuple_indexes)
{
int id = lfirst_int(lc_id);
uint16 tuple_index = lfirst_int(lc_tupindex);
TM_IndexDelete *deltid = &delstate->deltids[delstate->ndeltids];
TM_IndexStatus *status = &delstate->status[deltid->id];
deltid->id = id;
/* Assume that all index tuples pointing to the same compressed heap
* tuple are deletable if one is deletable. Otherwise leave the status
* unchanged. */
if (status_compr->knowndeletable)
status->knowndeletable = true;
hypercore_tid_encode(&deltid->tid, &deltid_compr->tid, tuple_index);
delstate->ndeltids++;
}
}
table_close(compr_rel, NoLock);
}
hash_destroy(tidhash);
pfree(compr_delstate.deltids);
pfree(noncompr_delstate.deltids);
#ifdef USE_ASSERT_CHECKING
do
{
int ndeletable = 0;
for (int i = 0; i < delstate->ndeltids; i++)
{
const TM_IndexDelete *deltid = &delstate->deltids[i];
const TM_IndexStatus *status = &delstate->status[deltid->id];
if (status->knowndeletable)
ndeletable++;
}
Assert(ndeletable > 0 || delstate->ndeltids == 0);
} while (0);
#endif
/* Return the latestRemovedXid. TransactionIdFollows can handle
* InvalidTransactionId. */
return TransactionIdFollows(xid_noncompr, xid_compr) ? xid_noncompr : xid_compr;
}
/* ----------------------------------------------------------------------------
* Functions for manipulations of physical tuples for Hypercore.
* ----------------------------------------------------------------------------
*/
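/*
* State used when converting a table to the hypercore TAM. Inserted tuples
* are collected into a tuplesort and processed when the conversion finishes
* (see convert_to_hypercore_finish()); if no tuplesort is set, the data is
* inserted directly into the non-compressed heap.
*/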
typedef struct ConversionState
{
Oid relid;
RelationSize before_size;
Tuplesortstate *tuplesortstate;
} ConversionState;
static ConversionState *conversionstate = NULL;
static void
hypercore_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid, int options,
BulkInsertStateData *bistate)
{
if (conversionstate)
{
if (conversionstate->tuplesortstate)
{
tuplesort_puttupleslot(conversionstate->tuplesortstate, slot);
return;
}
/* If no tuplesortstate is set, conversion is happening from legacy
* compression where a compressed relation already exists. Therefore,
* there is no need to recompress; just insert the non-compressed data
* into the new heap. */
}
const TableAmRoutine *oldtam = switch_to_heapam(relation);
relation->rd_tableam->tuple_insert(relation, slot, cid, options, bistate);
relation->rd_tableam = oldtam;
MemoryContext oldmcxt = MemoryContextSwitchTo(CurTransactionContext);
partially_compressed_relids =
list_append_unique_oid(partially_compressed_relids, RelationGetRelid(relation));
MemoryContextSwitchTo(oldmcxt);
}
static void
hypercore_tuple_insert_speculative(Relation relation, TupleTableSlot *slot, CommandId cid,
int options, BulkInsertStateData *bistate, uint32 specToken)
{
const TableAmRoutine *oldtam = switch_to_heapam(relation);
relation->rd_tableam
->tuple_insert_speculative(relation, slot, cid, options, bistate, specToken);
relation->rd_tableam = oldtam;
}
static void
hypercore_tuple_complete_speculative(Relation relation, TupleTableSlot *slot, uint32 specToken,
bool succeeded)
{
const TableAmRoutine *oldtam = switch_to_heapam(relation);
relation->rd_tableam->tuple_complete_speculative(relation, slot, specToken, succeeded);
relation->rd_tableam = oldtam;
}
/*
* WholeSegmentDeleteState is used to enforce the invariant that only whole
* compressed segments can be deleted. See the delete handler function below
* for more information.
*/
typedef struct WholeSegmentDeleteState
{
ItemPointerData ctid; /* Original TID of compressed tuple (decoded) */
CommandId cid; /* Command ID for the query doing the deletion */
int32 count; /* The number of values/rows in compressed tuple */
Bitmapset *tuple_indexes; /* The values/rows of the compressed tuple deleted so far */
MemoryContextCallback end_of_query_cb;
MemoryContext mcxt;
} WholeSegmentDeleteState;
static WholeSegmentDeleteState *delete_state = NULL;
static bool
whole_segment_delete_state_clear(void)
{
if (delete_state)
{
/* Only reset the global pointer to indicate this delete state is
* reset. The actual memory is freed when the PortalContext is
* reset */
delete_state = NULL;
return true;
}
return false;
}
#define RAISE_DELETION_ERROR() \
ereport(ERROR, \
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), \
errmsg("only whole-segment deletes are possible on compressed data"), \
errhint("Try deleting based on segment_by key.")));
/*
* Callback invoked at the end of a query (command).
*
* Ensure that the query only deleted whole segments of compressed
* data. Otherwise, raise an error.
*
* The callback is attached to the PortalContext memory context which is
* always cleared at the end of a query.
*/
static void
whole_segment_delete_callback(void *arg)
{
/* Clear delete state, but only raise error if we aren't already aborted */
if (whole_segment_delete_state_clear() && IsTransactionState())
RAISE_DELETION_ERROR();
}
/*
* Create a new delete state.
*
* Construct the delete state and tie it to the current query via the
* PortalContext's callback. This context is reset at the end of a query,
* which is a good point to check that delete invariants hold.
*/
static WholeSegmentDeleteState *
whole_segment_delete_state_create(const HypercoreInfo *hinfo, Relation crel, CommandId cid,
ItemPointer ctid)
{
WholeSegmentDeleteState *state;
HeapTupleData tp;
Page page;
BlockNumber block;
Buffer buffer;
ItemId lp;
bool isnull;
Datum d;
state = MemoryContextAllocZero(PortalContext, sizeof(WholeSegmentDeleteState));
state->mcxt = PortalContext;
state->end_of_query_cb.func = whole_segment_delete_callback;
ItemPointerCopy(ctid, &state->ctid);
state->cid = cid;
MemoryContextRegisterResetCallback(state->mcxt, &state->end_of_query_cb);
/* Need to construct a tuple in order to read out the "count" from the
* compressed segment */
block = ItemPointerGetBlockNumber(ctid);
buffer = ReadBuffer(crel, block);
page = BufferGetPage(buffer);
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
lp = PageGetItemId(page, ItemPointerGetOffsetNumber(ctid));
Assert(ItemIdIsNormal(lp));
tp.t_tableOid = RelationGetRelid(crel);
tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
tp.t_len = ItemIdGetLength(lp);
tp.t_self = *ctid;
d = heap_getattr(&tp, hinfo->count_cattno, RelationGetDescr(crel), &isnull);
state->count = DatumGetInt32(d);
UnlockReleaseBuffer(buffer);
return state;
}
static void
whole_segment_delete_state_add_row(WholeSegmentDeleteState *state, uint16 tuple_index)
{
MemoryContext oldmcxt = MemoryContextSwitchTo(state->mcxt);
state->tuple_indexes = bms_add_member(state->tuple_indexes, tuple_index);
MemoryContextSwitchTo(oldmcxt);
}
/*
* Check if a delete violates the "whole segment" invariant.
*
* The function will keep accumulating deleted TIDs as long as the following
* holds:
*
* 1. The delete is part of a segment that is the same segment as the previous delete.
* 2. The command ID is the same as the previous delete (i.e., we are still in the same query).
* 3. The segment still contains rows that haven't been deleted.
*
* The function raises an error if any of 1 or 2 above is violated.
*
* Returns true if the whole segment has been deleted, otherwise false.
*/
static bool
is_whole_segment_delete(const HypercoreInfo *hinfo, Relation crel, CommandId cid, ItemPointer ctid,
uint16 tuple_index)
{
if (delete_state == NULL)
delete_state = whole_segment_delete_state_create(hinfo, crel, cid, ctid);
/* Check if any invariant is violated */
if (delete_state->cid != cid || !ItemPointerEquals(&delete_state->ctid, ctid))
{
whole_segment_delete_state_clear();
RAISE_DELETION_ERROR();
}
whole_segment_delete_state_add_row(delete_state, tuple_index);
/* Check if the whole segment is deleted. If so, cleanup. */
bool is_whole_segment = bms_num_members(delete_state->tuple_indexes) == delete_state->count;
if (is_whole_segment)
whole_segment_delete_state_clear();
return is_whole_segment;
}
/*
* Delete handler function.
*
* The TAM delete handler is invoked for individual rows referenced by TID,
* and these TIDs can point to either non-compressed data or into a compressed
* segment tuple. For TIDs pointing to non-compressed data, the row can be
* deleted directly. However, a TID pointing into a compressed tuple cannot
* lead to a delete of the whole compressed tuple unless all the other
* rows in it are also being deleted.
*
* It is tempting to simply disallow deletes directly on compressed
* data. However, Hypercore needs to support such deletes in some cases, for
* example, to support foreign key cascading deletes.
*
* Fortunately, some deletes of compressed data can be supported as long as
* the delete involves all rows in a compressed segment.
*
* The WholeSegmentDeleteState is used to track that this invariant is not
* violated.
*/
static TM_Result
hypercore_tuple_delete(Relation relation, ItemPointer tid, CommandId cid, Snapshot snapshot,
Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
{
TM_Result result = TM_Ok;
if (is_compressed_tid(tid))
{
HypercoreInfo *caminfo = RelationGetHypercoreInfo(relation);
Relation crel = table_open(caminfo->compressed_relid, RowExclusiveLock);
ItemPointerData decoded_tid;
uint16 tuple_index = hypercore_tid_decode(&decoded_tid, tid);
/*
* It is only possible to delete the compressed segment if all rows in
* it are deleted.
*/
if (is_whole_segment_delete(caminfo, crel, cid, &decoded_tid, tuple_index))
{
result = crel->rd_tableam->tuple_delete(crel,
&decoded_tid,
cid,
snapshot,
crosscheck,
wait,
tmfd,
changingPart);
if (result == TM_SelfModified)
{
/* The compressed tuple was already deleted by other means in
* the same transaction. This can happen because compression
* DML implemented the optimization to delete whole compressed
* segments after whole-segment deletes were implemented in
* the TAM. Trying to delete again should not hurt, and if it
* is already deleted, we ignore it. */
result = TM_Ok;
}
}
table_close(crel, NoLock);
}
else
{
/* Just pass this on to regular heap AM */
const TableAmRoutine *oldtam = switch_to_heapam(relation);
result =
relation->rd_tableam
->tuple_delete(relation, tid, cid, snapshot, crosscheck, wait, tmfd, changingPart);
relation->rd_tableam = oldtam;
}
return result;
}
#if PG16_LT
typedef bool TU_UpdateIndexes;
#endif
static TM_Result
hypercore_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot, CommandId cid,
Snapshot snapshot, Snapshot crosscheck, bool wait, TM_FailureData *tmfd,
LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
{
if (!is_compressed_tid(otid))
{
/* Just pass this on to regular heap AM */
const TableAmRoutine *oldtam = switch_to_heapam(relation);
TM_Result result = relation->rd_tableam->tuple_update(relation,
otid,
slot,
cid,
snapshot,
crosscheck,
wait,
tmfd,
lockmode,
update_indexes);
relation->rd_tableam = oldtam;
return result;
}
/* This shouldn't happen because hypertable_modify should have
* decompressed the data to be updated already. It can happen, however, if
* UPDATE is run directly on a hypertable chunk, because that case isn't
* handled in the current code for DML on compressed chunks. */
elog(ERROR, "cannot update compressed tuple");
return TM_Ok;
}
static TM_Result
hypercore_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot,
CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, uint8 flags,
TM_FailureData *tmfd)
{
TM_Result result;
if (is_compressed_tid(tid))
{
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(relation);
/* SELECT FOR UPDATE takes RowShareLock, so assume this lockmode.
* Another option to consider is to take the same lock as currently held
* on the non-compressed relation. */
Relation crel = table_open(hsinfo->compressed_relid, RowShareLock);
TupleTableSlot *child_slot = arrow_slot_get_compressed_slot(slot, RelationGetDescr(crel));
ItemPointerData decoded_tid;
uint16 tuple_index = hypercore_tid_decode(&decoded_tid, tid);
result = crel->rd_tableam->tuple_lock(crel,
&decoded_tid,
snapshot,
child_slot,
cid,
mode,
wait_policy,
flags,
tmfd);
if (result == TM_Ok)
{
slot->tts_tableOid = RelationGetRelid(relation);
ExecStoreArrowTuple(slot, tuple_index);
}
table_close(crel, NoLock);
}
else
{
TupleTableSlot *child_slot = arrow_slot_get_noncompressed_slot(slot);
const TableAmRoutine *oldtam = switch_to_heapam(relation);
result = relation->rd_tableam->tuple_lock(relation,
tid,
snapshot,
child_slot,
cid,
mode,
wait_policy,
flags,
tmfd);
relation->rd_tableam = oldtam;
if (result == TM_Ok)
{
slot->tts_tableOid = RelationGetRelid(relation);
ExecStoreArrowTuple(slot, InvalidTupleIndex);
}
}
return result;
}
static void
hypercore_finish_bulk_insert(Relation rel, int options)
{
if (conversionstate)
convert_to_hypercore_finish(RelationGetRelid(rel));
}
/* ------------------------------------------------------------------------
* DDL related callbacks for Hypercore.
* ------------------------------------------------------------------------
*/
#if PG16_LT
/* Account for API differences in pre-PG16 versions */
typedef RelFileNode RelFileLocator;
#define relation_set_new_filelocator relation_set_new_filenode
#endif
static void
hypercore_relation_set_new_filelocator(Relation rel, const RelFileLocator *newrlocator,
char persistence, TransactionId *freezeXid,
MultiXactId *minmulti)
{
const TableAmRoutine *oldtam = switch_to_heapam(rel);
#if PG16_GE
rel->rd_tableam->relation_set_new_filelocator(rel,
newrlocator,
persistence,
freezeXid,
minmulti);
#else
rel->rd_tableam->relation_set_new_filenode(rel, newrlocator, persistence, freezeXid, minmulti);
#endif
rel->rd_tableam = oldtam;
/* If the chunk has a compressed chunk associated with it, then we need to
* change the rel file number for it as well. This can happen if you, for
* example, execute a transactional TRUNCATE. */
Oid compressed_relid = chunk_get_compressed_chunk_relid(RelationGetRelid(rel));
if (OidIsValid(compressed_relid))
{
Relation compressed_rel = table_open(compressed_relid, AccessExclusiveLock);
#if PG16_GE
RelationSetNewRelfilenumber(compressed_rel, compressed_rel->rd_rel->relpersistence);
#else
RelationSetNewRelfilenode(compressed_rel, compressed_rel->rd_rel->relpersistence);
#endif
table_close(compressed_rel, NoLock);
}
}
static void
hypercore_relation_nontransactional_truncate(Relation rel)
{
const TableAmRoutine *oldtam = switch_to_heapam(rel);
rel->rd_tableam->relation_nontransactional_truncate(rel);
rel->rd_tableam = oldtam;
Oid compressed_relid = chunk_get_compressed_chunk_relid(RelationGetRelid(rel));
if (OidIsValid(compressed_relid))
{
Relation crel = table_open(compressed_relid, AccessShareLock);
crel->rd_tableam->relation_nontransactional_truncate(crel);
table_close(crel, NoLock);
}
}
static void
hypercore_relation_copy_data(Relation rel, const RelFileLocator *newrlocator)
{
FEATURE_NOT_SUPPORTED;
}
static void
on_compression_progress(RowCompressor *rowcompress, uint64 ntuples)
{
pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN, ntuples);
}
/*
* Rewrite a relation and compress at the same time.
*
* Note that all tuples are frozen when compressed to make sure they are
* visible to concurrent transactions after the rewrite. This isn't MVCC
* compliant and does not work for isolation levels of repeatable read or
* higher. Ideally, we should check visibility of each original tuple that we
* roll up into a compressed tuple and transfer visibility information (XID)
* based on that, just like done in heap when it is using a rewrite state.
*/
static Oid
compress_and_swap_heap(Relation rel, Tuplesortstate *tuplesort, TransactionId *xid_cutoff,
MultiXactId *multi_cutoff)
{
const HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
TupleDesc tupdesc = RelationGetDescr(rel);
Oid old_compressed_relid = hsinfo->compressed_relid;
CompressionSettings *settings = ts_compression_settings_get(old_compressed_relid);
Relation old_compressed_rel = table_open(old_compressed_relid, AccessExclusiveLock);
#if PG15_GE
Oid accessMethod = old_compressed_rel->rd_rel->relam;
#endif
Oid tableSpace = old_compressed_rel->rd_rel->reltablespace;
char relpersistence = old_compressed_rel->rd_rel->relpersistence;
Oid new_compressed_relid = make_new_heap(old_compressed_relid,
tableSpace,
#if PG15_GE
accessMethod,
#endif
relpersistence,
AccessExclusiveLock);
Relation new_compressed_rel = table_open(new_compressed_relid, AccessExclusiveLock);
RowCompressor row_compressor;
double reltuples;
int32 relpages;
/* Initialize the compressor. */
row_compressor_init(settings,
&row_compressor,
rel,
new_compressed_rel,
RelationGetDescr(old_compressed_rel)->natts,
true /*need_bistate*/,
HEAP_INSERT_FROZEN);
row_compressor.on_flush = on_compression_progress;
row_compressor_append_sorted_rows(&row_compressor, tuplesort, tupdesc, old_compressed_rel);
reltuples = row_compressor.num_compressed_rows;
relpages = RelationGetNumberOfBlocks(new_compressed_rel);
row_compressor_close(&row_compressor);
table_close(new_compressed_rel, NoLock);
table_close(old_compressed_rel, NoLock);
/* Update stats for the compressed relation */
Relation relRelation = table_open(RelationRelationId, RowExclusiveLock);
HeapTuple reltup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(new_compressed_relid));
if (!HeapTupleIsValid(reltup))
elog(ERROR, "cache lookup failed for relation %u", new_compressed_relid);
Form_pg_class relform = (Form_pg_class) GETSTRUCT(reltup);
relform->relpages = relpages;
relform->reltuples = reltuples;
CatalogTupleUpdate(relRelation, &reltup->t_self, reltup);
/* Clean up. */
heap_freetuple(reltup);
table_close(relRelation, RowExclusiveLock);
/* Make the update visible */
CommandCounterIncrement();
/* Finish the heap swap for the compressed relation. Note that it is not
* possible to swap toast content since new tuples were generated via
* compression. */
finish_heap_swap(old_compressed_relid,
new_compressed_relid,
false /* is_system_catalog */,
false /* swap_toast_by_content */,
false,
true,
*xid_cutoff,
*multi_cutoff,
relpersistence);
return new_compressed_relid;
}
/*
* Rewrite/compress the relation for CLUSTER or VACUUM FULL.
*
* The copy_for_cluster() callback is called during a CLUSTER or VACUUM FULL,
* and performs a heap swap/rewrite. The code is based on the heap's
* copy_for_cluster(), with changes to handle two heaps and compressed tuples.
*
* For Hypercore, two heap swaps are performed: one on the non-compressed
* (user-visible) relation, which is managed by PostgreSQL and passed on to
* this callback, and one on the compressed relation that is implemented
* within the callback.
*
* The Hypercore implementation of copy_for_cluster() is similar to the one
* for Heap. However, instead of "rewriting" tuples into the new heap (while
* at the same time handling freezing and visibility), Hypercore will
* compress all the data and write it to a new compressed relation. Since the
* compression is based on the previous compression implementation, visibility
* of recently deleted tuples and freezing of tuples is not correctly handled,
* at least not for higher isolation levels than read committed. Changes to
* handle higher isolation levels should be considered in a future update of
* this code.
*
 * Among the things missing is the handling of recently dead tuples that need
 * to be transferred to the new heap since they might still be visible to some
 * ongoing transactions. PostgreSQL's heap implementation handles this via the
 * heap rewrite module. It should also be possible to write a compressed tuple
 * as frozen if all the rows it compresses are frozen.
*/
static void
hypercore_relation_copy_for_cluster(Relation OldHypercore, Relation NewCompression,
Relation OldIndex, bool use_sort, TransactionId OldestXmin,
TransactionId *xid_cutoff, MultiXactId *multi_cutoff,
double *num_tuples, double *tups_vacuumed,
double *tups_recently_dead)
{
const HypercoreInfo *hsinfo = RelationGetHypercoreInfo(OldHypercore);
HypercoreScanDesc cscan;
HeapScanDesc chscan;
HeapScanDesc uhscan;
Tuplesortstate *tuplesort;
TableScanDesc tscan;
TupleTableSlot *slot;
ArrowTupleTableSlot *aslot;
BufferHeapTupleTableSlot *hslot;
BlockNumber prev_cblock = InvalidBlockNumber;
BlockNumber startblock;
BlockNumber nblocks;
if (ts_is_hypertable(RelationGetRelid(OldHypercore)))
return;
/* Error out if this is a CLUSTER. It would be possible to CLUSTER only
	 * the non-compressed relation, but the utility of this is questionable as
* most of the data should be compressed (and ordered) anyway. */
if (OldIndex != NULL)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot cluster a hypercore table"),
errdetail("A hypercore table is already ordered by compression.")));
CompressionSettings *settings = ts_compression_settings_get(hsinfo->compressed_relid);
tuplesort = compression_create_tuplesort_state(settings, OldHypercore);
/* In scan-and-sort mode and also VACUUM FULL, set phase */
pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
/* This will scan via the Hypercore callbacks, getting tuples from both
* compressed and non-compressed relations */
tscan = table_beginscan(OldHypercore, SnapshotAny, 0, (ScanKey) NULL);
cscan = (HypercoreScanDesc) tscan;
chscan = (HeapScanDesc) cscan->cscan_desc;
uhscan = (HeapScanDesc) cscan->uscan_desc;
slot = table_slot_create(OldHypercore, NULL);
startblock = chscan->rs_startblock + uhscan->rs_startblock;
nblocks = chscan->rs_nblocks + uhscan->rs_nblocks;
/* Set total heap blocks */
pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS, nblocks);
aslot = (ArrowTupleTableSlot *) slot;
for (;;)
{
HeapTuple tuple;
Buffer buf;
bool isdead;
BlockNumber cblock;
CHECK_FOR_INTERRUPTS();
if (!table_scan_getnextslot(tscan, ForwardScanDirection, slot))
{
/*
* If the last pages of the scan were empty, we would go to
* the next phase while heap_blks_scanned != heap_blks_total.
* Instead, to ensure that heap_blks_scanned is equivalent to
* total_heap_blks after the table scan phase, this parameter
* is manually updated to the correct value when the table
* scan finishes.
*/
pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED, nblocks);
break;
}
/*
* In scan-and-sort mode and also VACUUM FULL, set heap blocks
* scanned
*
* Note that heapScan may start at an offset and wrap around, i.e.
* rs_startblock may be >0, and rs_cblock may end with a number
* below rs_startblock. To prevent showing this wraparound to the
* user, we offset rs_cblock by rs_startblock (modulo rs_nblocks).
*/
cblock = chscan->rs_cblock + uhscan->rs_cblock;
if (prev_cblock != cblock)
{
pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
((cblock + nblocks - startblock) % nblocks) + 1);
prev_cblock = cblock;
}
/* Get the actual tuple from the child slot (either compressed or
* non-compressed). The tuple has all the visibility information. */
tuple = ExecFetchSlotHeapTuple(aslot->child_slot, false, NULL);
hslot = (BufferHeapTupleTableSlot *) aslot->child_slot;
buf = hslot->buffer;
LockBuffer(buf, BUFFER_LOCK_SHARE);
switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
{
case HEAPTUPLE_DEAD:
/* Definitely dead */
isdead = true;
break;
case HEAPTUPLE_RECENTLY_DEAD:
/* Note: This case is treated as "dead" in Hypercore,
* although some of these tuples might still be visible to
* some transactions. For strict correctness, recently dead
* tuples should be transferred to the new heap if they are
* still visible to some transactions (e.g. under repeatable
* read). However, this is tricky since multiple rows with
				 * potentially different visibility are rolled up into one
				 * compressed row with a single visibility. */
isdead = true;
break;
case HEAPTUPLE_LIVE:
				/* Live, must copy it */
isdead = false;
break;
case HEAPTUPLE_INSERT_IN_PROGRESS:
/*
* Since we hold exclusive lock on the relation, normally the
* only way to see this is if it was inserted earlier in our
* own transaction. However, it can happen in system
* catalogs, since we tend to release write lock before commit
* there. Still, system catalogs don't use Hypercore.
*/
if (!TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
elog(WARNING,
"concurrent insert in progress within table \"%s\"",
RelationGetRelationName(OldHypercore));
/* treat as live */
isdead = false;
break;
case HEAPTUPLE_DELETE_IN_PROGRESS:
/*
* Similar situation to INSERT_IN_PROGRESS case.
*/
if (!TransactionIdIsCurrentTransactionId(
HeapTupleHeaderGetUpdateXid(tuple->t_data)))
elog(WARNING,
"concurrent delete in progress within table \"%s\"",
RelationGetRelationName(OldHypercore));
/* Note: This case is treated as "dead" in Hypercore,
* although this is "recently dead" in heap */
isdead = true;
break;
default:
elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
isdead = false; /* keep compiler quiet */
break;
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
if (isdead)
{
*tups_vacuumed += 1;
/* Skip whole segment if a dead compressed tuple */
if (arrow_slot_is_compressed(slot))
arrow_slot_mark_consumed(slot);
continue;
}
while (!arrow_slot_is_last(slot))
{
*num_tuples += 1;
tuplesort_puttupleslot(tuplesort, slot);
ExecStoreNextArrowTuple(slot);
}
*num_tuples += 1;
tuplesort_puttupleslot(tuplesort, slot);
/* Report increase in number of tuples scanned */
pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED, *num_tuples);
}
table_endscan(tscan);
ExecDropSingleTupleTableSlot(slot);
/* Report that we are now sorting tuples */
pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
/* Sort and recreate compressed relation */
tuplesort_performsort(tuplesort);
/* Report that we are now writing new heap */
pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE, PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
compress_and_swap_heap(OldHypercore, tuplesort, xid_cutoff, multi_cutoff);
tuplesort_end(tuplesort);
}
/*
* VACUUM (not VACUUM FULL).
*
* Vacuum the hypercore by calling vacuum on both the non-compressed and
* compressed relations.
*
* Indexes on a heap are normally vacuumed as part of vacuuming the
* heap. However, a hypercore index is defined on the non-compressed relation
* and contains tuples from both the non-compressed and compressed relations
* and therefore dead tuples vacuumed on the compressed relation won't be
* removed from a hypercore index by default. The vacuuming of dead
* compressed tuples from the hypercore index therefore requires special
* handling, which is triggered via a proxy index (hypercore_proxy) that relays the
 * cleanup to the "correct" hypercore indexes. (See hypercore_proxy.c)
*
* For future: It would make sense to (re-)compress all non-compressed data as
* part of vacuum since (re-)compression is a kind of cleanup but also leaves
* a lot of garbage.
*/
static void
hypercore_vacuum_rel(Relation rel, VacuumParams *params, BufferAccessStrategy bstrategy)
{
Oid relid = RelationGetRelid(rel);
HypercoreInfo *hsinfo;
RelStats relstats;
if (ts_is_hypertable(relid))
return;
relstats_fetch(relid, &relstats);
hsinfo = RelationGetHypercoreInfo(rel);
LOCKMODE lmode =
(params->options & VACOPT_FULL) ? AccessExclusiveLock : ShareUpdateExclusiveLock;
/* Vacuum the compressed relation */
Relation crel = vacuum_open_relation(hsinfo->compressed_relid,
NULL,
params->options,
params->log_min_duration >= 0,
lmode);
if (crel)
{
crel->rd_tableam->relation_vacuum(crel, params, bstrategy);
table_close(crel, NoLock);
}
/* Vacuum the non-compressed relation */
const TableAmRoutine *oldtam = switch_to_heapam(rel);
rel->rd_tableam->relation_vacuum(rel, params, bstrategy);
rel->rd_tableam = oldtam;
/* Unfortunately, relstats are currently incorrectly updated when
* vacuuming, because we vacuum the non-compressed rel separately, and
* last, and it will only update stats based on the data in that
 * table. Therefore, as a workaround, it is better to restore relstats to
 * what they were before vacuuming.
*/
relstats_update(relid, &relstats);
}
/*
* Analyze the next block with the given blockno.
*
* The underlying ANALYZE functionality that calls this function samples
* blocks in the relation. To be able to analyze all the blocks across both
* the non-compressed and the compressed relations, we need to make sure that
* both underlying relations are sampled.
*
* For versions before PG17, this function relies on the TAM giving the
* impression that the total number of blocks is the sum of compressed and
* non-compressed blocks. This is done by returning the sum of the total
* number of blocks across both relations in the relation_size() TAM callback.
*
* The non-compressed relation is sampled first, and, only once the blockno
* increases beyond the number of blocks in the non-compressed relation, the
* compressed relation is sampled.
*
* For versions starting with PG17 a new interface was introduced based on a
* ReadStream API, which allows blocks to be read from the relation using a
* dedicated set of functions.
*
* The ReadStream is usually set up in beginscan for the table access method,
 * but the ANALYZE command is an exception: it sets up the
 * ReadStream itself and uses that when scanning the blocks. The relation
* used is the default relation, which is the uncompressed relation, but we
* need a read stream for the compressed relation as well.
*
* When returning blocks, we can first sample the uncompressed relation and then
* continue with sampling the compressed relation when we have exhausted the
* uncompressed relation.
*/
#if PG17_LT
static bool
hypercore_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
BufferAccessStrategy bstrategy)
{
HypercoreScanDescData *cscan = (HypercoreScanDescData *) scan;
HeapScanDesc uhscan = (HeapScanDesc) cscan->uscan_desc;
/* If blockno is past the blocks in the non-compressed relation, we should
* analyze the compressed relation */
if (blockno >= uhscan->rs_nblocks)
{
/* Get the compressed rel blockno by subtracting the number of
* non-compressed blocks */
blockno -= uhscan->rs_nblocks;
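		/* For example, with 10 non-compressed blocks and 5 compressed
		 * blocks, an incoming blockno of 12 maps to block 2 of the
		 * compressed relation. */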
return cscan->compressed_rel->rd_tableam->scan_analyze_next_block(cscan->cscan_desc,
blockno,
bstrategy);
}
Relation rel = scan->rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(rel);
bool result = rel->rd_tableam->scan_analyze_next_block(cscan->uscan_desc, blockno, bstrategy);
rel->rd_tableam = oldtam;
return result;
}
#else
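/*
 * Set up a ReadStream for sampling blocks during ANALYZE (PG17+).
 *
 * A block sampler is initialized over all blocks of the given relation and
 * used as the block-number callback for the stream. Separate streams are
 * created for the non-compressed and the compressed relation (see
 * hypercore_scan_analyze_next_block() below).
 */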
static ReadStream *
hypercore_setup_read_stream(Relation rel, BufferAccessStrategy bstrategy)
{
Assert(rel != NULL);
BlockSampler block_sampler = palloc(sizeof(BlockSamplerData));
const BlockNumber totalblocks = RelationGetNumberOfBlocks(rel);
const uint32 randseed = pg_prng_uint32(&pg_global_prng_state);
const int targrows = compute_targrows(rel);
const BlockNumber nblocks = BlockSampler_Init(block_sampler, totalblocks, targrows, randseed);
pgstat_progress_update_param(PROGRESS_ANALYZE_BLOCKS_TOTAL, nblocks);
TS_DEBUG_LOG("set up ReadStream for %s (%d), filenode: %d, pages: %d",
RelationGetRelationName(rel),
RelationGetRelid(rel),
rel->rd_rel->relfilenode,
rel->rd_rel->relpages);
return read_stream_begin_relation(READ_STREAM_MAINTENANCE,
bstrategy,
rel,
MAIN_FORKNUM,
hypercore_block_sampling_read_stream_next,
block_sampler,
0);
}
static bool
hypercore_scan_analyze_next_block(TableScanDesc scan, ReadStream *stream)
{
HypercoreScanDescData *cscan = (HypercoreScanDescData *) scan;
HeapScanDesc uhscan = (HeapScanDesc) cscan->uscan_desc;
HeapScanDesc chscan = (HeapScanDesc) cscan->cscan_desc;
/* We do not analyze parent table of hypertables. There is no data there. */
if (ts_is_hypertable(scan->rs_rd->rd_id))
return false;
BufferAccessStrategy bstrategy;
BlockNumber blockno = read_stream_next_block(stream, &bstrategy);
TS_DEBUG_LOG("blockno %d, uhscan->rs_nblocks: %d, chscan->rs_nblocks: %d",
blockno,
uhscan->rs_nblocks,
chscan->rs_nblocks);
if (!cscan->canalyze_read_stream)
{
Assert(cscan->compressed_rel);
cscan->canalyze_read_stream = hypercore_setup_read_stream(cscan->compressed_rel, bstrategy);
}
if (!cscan->uanalyze_read_stream)
{
const TableAmRoutine *oldtam = switch_to_heapam(scan->rs_rd);
cscan->uanalyze_read_stream = hypercore_setup_read_stream(scan->rs_rd, bstrategy);
scan->rs_rd->rd_tableam = oldtam;
}
/*
	 * If the block number is above the number of blocks in the uncompressed
* relation, we need to fetch a block from the compressed relation.
*
* Note that we have a different readstream for the compressed relation,
* so we are not sampling the block that is provided to the function, but
	 * we sample the correct number of blocks for each relation.
*/
if (blockno >= uhscan->rs_nblocks)
{
TS_DEBUG_LOG("reading block %d from compressed relation", blockno);
return cscan->compressed_rel->rd_tableam
->scan_analyze_next_block(cscan->cscan_desc, cscan->canalyze_read_stream);
}
TS_DEBUG_LOG("reading block %d from non-compressed relation", blockno - chscan->rs_nblocks);
Assert(blockno < uhscan->rs_nblocks + chscan->rs_nblocks);
Relation rel = scan->rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(rel);
bool result =
rel->rd_tableam->scan_analyze_next_block(cscan->uscan_desc, cscan->uanalyze_read_stream);
rel->rd_tableam = oldtam;
return result;
}
#endif
/*
* Get the next tuple to sample during ANALYZE.
*
* Since the sampling happens across both the non-compressed and compressed
* relations, it is necessary to determine from which relation to return a
* tuple. This is driven by scan_analyze_next_block() above.
*
* When sampling from the compressed relation, a compressed segment is read
* and it is then necessary to return all tuples in the segment.
*
* NOTE: the function currently relies on heapAM's scan_analyze_next_tuple()
* to read compressed segments. This can lead to misrepresenting liverows and
* deadrows numbers since heap AM might skip tuples that are dead or
* concurrently inserted, but still count them in liverows or deadrows. Each
* compressed tuple represents many rows, but heapAM only counts each
* compressed tuple as one row. The only way to fix this is to either check
* the diff between the count before and after calling the heap AM function
* and then estimate the actual number of rows from that, or, reimplement the
* heapam_scan_analyze_next_tuple() function so that it can properly account
* for compressed tuples.
*/
static bool
hypercore_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, double *liverows,
double *deadrows, TupleTableSlot *slot)
{
HypercoreScanDescData *cscan = (HypercoreScanDescData *) scan;
HeapScanDesc chscan = (HeapScanDesc) cscan->cscan_desc;
uint16 tuple_index;
bool result;
/*
* Since non-compressed blocks are always sampled first, the current
* buffer for the compressed relation will be invalid until we reach the
* end of the non-compressed blocks.
*/
if (chscan->rs_cbuf != InvalidBuffer)
{
/* Keep on returning tuples from the compressed segment until it is
* consumed */
if (!TTS_EMPTY(slot))
{
tuple_index = arrow_slot_row_index(slot);
if (tuple_index != InvalidTupleIndex && !arrow_slot_is_last(slot))
{
ExecIncrArrowTuple(slot, 1);
*liverows += 1;
return true;
}
}
TupleTableSlot *child_slot =
arrow_slot_get_compressed_slot(slot, RelationGetDescr(cscan->compressed_rel));
result = cscan->compressed_rel->rd_tableam->scan_analyze_next_tuple(cscan->cscan_desc,
OldestXmin,
liverows,
deadrows,
child_slot);
/* Need to pick a row from the segment to sample. Might as well pick
* the first one, but might consider picking a random one. */
tuple_index = 1;
}
else
{
TupleTableSlot *child_slot = arrow_slot_get_noncompressed_slot(slot);
Relation rel = scan->rs_rd;
const TableAmRoutine *oldtam = switch_to_heapam(rel);
result = rel->rd_tableam->scan_analyze_next_tuple(cscan->uscan_desc,
OldestXmin,
liverows,
deadrows,
child_slot);
rel->rd_tableam = oldtam;
tuple_index = InvalidTupleIndex;
}
if (result)
{
slot->tts_tableOid = RelationGetRelid(scan->rs_rd);
ExecStoreArrowTuple(slot, tuple_index);
}
else
ExecClearTuple(slot);
return result;
}
typedef struct IndexBuildCallbackState
{
	/* Original callback and state */
IndexBuildCallback callback;
void *orig_state;
/* The table building an index over and original index info */
Relation rel;
IndexInfo *index_info;
/* Expression state and slot for predicate evaluation when building
* partial indexes */
EState *estate;
ExprContext *econtext;
ExprState *predicate;
TupleTableSlot *slot;
int num_non_index_predicates;
/* Information needed to process values from compressed data */
int16 tuple_index;
double ntuples;
Bitmapset *segmentby_cols;
Bitmapset *orderby_cols;
bool is_segmentby_index;
MemoryContext decompression_mcxt;
ArrowArray **arrow_columns;
} IndexBuildCallbackState;
/*
* Callback for index builds on compressed relation.
*
* See hypercore_index_build_range_scan() for general overview.
*
* When building an index, this function is called once for every compressed
* tuple. To build an index over the original (non-compressed) values, it is
* necessary to "unwrap" the compressed data. Therefore, the function calls
* the original index build callback once for every value in the compressed
* tuple.
*
* Note that, when the index covers only segmentby columns and the value is
* the same for all original rows in the segment, the index storage is
 * optimized to only index the compressed row and then unwrap it during
* scanning instead.
*/
static void
hypercore_index_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull,
bool tupleIsAlive, void *state)
{
IndexBuildCallbackState *icstate = state;
const TupleDesc tupdesc = RelationGetDescr(icstate->rel);
const Bitmapset *segmentby_cols = icstate->segmentby_cols;
/* We expect the compressed rel scan to produce a datum array that first
* includes the index columns, then any columns referenced in index
* predicates that are not index columns. */
const int natts = icstate->index_info->ii_NumIndexAttrs + icstate->num_non_index_predicates;
/* Read the actual number of rows in the compressed tuple from the count
	 * column. The count column is appended directly after the index and
	 * predicate attributes. */
const int32 num_actual_rows = DatumGetInt32(values[natts]);
int32 num_rows = num_actual_rows; /* Num rows to index. For segmentby
* indexes, we might change this from
* the actual number of rows to indexing
* only one row per segment. */
/* Update ntuples for accurate statistics. When building the index, the
* relation's reltuples is updated based on this count. */
if (tupleIsAlive)
icstate->ntuples += num_actual_rows;
/*
* Phase 1: Prepare to process the compressed segment.
*
* We need to figure out the number of rows in the segment, which is
* usually given by the "count" column (num_actual_rows). But for
* segmentby indexes, we only index whole segments (so num_rows = 1).
*
* For non-segmentby indexes, we need to go through all attribute values
* and decompress segments into multiple rows in columnar arrow array
* format.
*/
if (icstate->is_segmentby_index)
{
/* A segment index will index only the full segment. */
num_rows = 1;
#ifdef USE_ASSERT_CHECKING
/* A segment index can only index segmentby columns */
for (int i = 0; i < natts; i++)
{
const AttrNumber attno = icstate->index_info->ii_IndexAttrNumbers[i];
Assert(bms_is_member(attno, segmentby_cols));
}
#endif
}
else
{
for (int i = 0; i < natts; i++)
{
const AttrNumber attno = icstate->index_info->ii_IndexAttrNumbers[i];
if (bms_is_member(attno, segmentby_cols))
{
/*
				 * For a segmentby column, there is nothing to decompress;
				 * the datum in the values array is already the non-compressed value.
*/
}
else if (!isnull[i])
{
const Form_pg_attribute attr =
TupleDescAttr(tupdesc, AttrNumberGetAttrOffset(attno));
icstate->arrow_columns[i] = arrow_from_compressed(values[i],
attr->atttypid,
CurrentMemoryContext,
icstate->decompression_mcxt);
/* The number of elements in the arrow array should be the
* same as the number of rows in the segment (count
* column). */
Assert(num_rows == icstate->arrow_columns[i]->length);
}
else
{
icstate->arrow_columns[i] = NULL;
}
}
}
Assert((!icstate->is_segmentby_index && num_rows > 0) ||
(icstate->is_segmentby_index && num_rows == 1));
/*
* Phase 2: Loop over all "unwrapped" rows in the arrow arrays, build
* index tuples, and index them unless they fail predicate checks.
*/
/* Table slot for predicate checks. We need to re-create a slot in table
* format to be able to do predicate checks once we have decompressed the
* values. */
TupleTableSlot *slot = icstate->slot;
for (int rownum = 0; rownum < num_rows; rownum++)
{
/* The slot is a table slot, not index slot. But we only fill in the
* columns needed for the index and predicate checks. Therefore, make sure
* other columns are initialized to "null" */
memset(slot->tts_isnull, true, sizeof(bool) * slot->tts_tupleDescriptor->natts);
ExecClearTuple(slot);
for (int colnum = 0; colnum < natts; colnum++)
{
const AttrNumber attno = icstate->index_info->ii_IndexAttrNumbers[colnum];
if (bms_is_member(attno, segmentby_cols))
{
/* Segmentby columns are not compressed, so the datum in the
* values array is already set and valid */
}
else if (icstate->arrow_columns[colnum] != NULL)
{
const Form_pg_attribute attr =
TupleDescAttr(tupdesc, AttrNumberGetAttrOffset(attno));
NullableDatum datum = arrow_get_datum(icstate->arrow_columns[colnum],
attr->atttypid,
attr->attlen,
rownum);
values[colnum] = datum.value;
isnull[colnum] = datum.isnull;
}
else
{
/* No arrow array so all values are NULL */
values[colnum] = 0;
isnull[colnum] = true;
}
/* Fill in the values in the table slot for predicate checks */
slot->tts_values[AttrNumberGetAttrOffset(attno)] = values[colnum];
slot->tts_isnull[AttrNumberGetAttrOffset(attno)] = isnull[colnum];
}
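		/* Encode both the TID of the compressed tuple and the 1-based row
		 * number within the segment into the index TID, so that an index
		 * scan can later locate the correct row inside the segment. */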
ItemPointerData index_tid;
hypercore_tid_encode(&index_tid, tid, rownum + 1);
Assert(!icstate->is_segmentby_index || rownum == 0);
/*
* In a partial index, discard tuples that don't satisfy the
* predicate.
*/
if (icstate->predicate)
{
/* Mark the slot as valid */
ExecStoreVirtualTuple(slot);
if (!ExecQual(icstate->predicate, icstate->econtext))
continue;
}
icstate->callback(index, &index_tid, values, isnull, tupleIsAlive, icstate->orig_state);
}
}
/*
* Build an index over a Hypercore table.
*
* The task of this function is to scan all tuples in the table and then,
* after visibility and predicate checks, pass the tuple to the "index build
* callback" to have it indexed.
*
* Since a Hypercore table technically consists of two heaps: one
* non-compressed and one compressed, it is necessary to scan both of them. To
* avoid rewriting/copying the heap code, we make use of the heap AM's
* machinery. However, that comes with some complications when dealing with
* compressed tuples. To build an index over compressed tuples, we need to
* first decompress the segments into individual values. To make this work, we
* replace the given index build callback with our own, so that we can first
* decompress the data and then call the real index build callback.
*
* Partial indexes present an additional complication because every tuple
* scanned needs to be checked against the index predicate to know whether it
* should be part of the index or not. However, the index build callback only
* gets the values of the indexed columns, not the original table tuple. That
* won't work for predicates on non-indexed column. Therefore, before calling
* the heap AM machinery, we change the index definition so that also
* non-indexed predicate columns will be included in the values array passed
* on to the "our" index build callback. Then we can reconstruct a table tuple
* from those values in order to do the predicate check.
*/
static double
hypercore_index_build_range_scan(Relation relation, Relation indexRelation, IndexInfo *indexInfo,
bool allow_sync, bool anyvisible, bool progress,
BlockNumber start_blockno, BlockNumber numblocks,
IndexBuildCallback callback, void *callback_state,
TableScanDesc scan)
{
HypercoreInfo *hsinfo;
TransactionId OldestXmin;
bool need_unregister_snapshot = false;
Snapshot snapshot;
/*
* We can be called from ProcessUtility with a hypertable because we need
* to process all ALTER TABLE commands in the list to set options
* correctly for the hypertable.
*
* If we are called on a hypertable, we just skip scanning for tuples and
* say that the relation was empty.
*/
if (ts_is_hypertable(relation->rd_id))
return 0.0;
for (int i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
{
const AttrNumber attno = indexInfo->ii_IndexAttrNumbers[i];
/*
* User-defined attributes always have a positive attribute number (1
* or larger) and these are the only ones we support, so we check for
* that here and raise an error if it is not a user-defined attribute.
*/
if (!AttrNumberIsForUserDefinedAttr(attno))
{
/*
			 * If the attribute number is zero, it means that we have an
			 * expression index on this column and need to call the
* corresponding expression tree in ii_Expressions to compute the
* value to store in the index.
*
* If the attribute number is negative, it means that we have a
* reference to a system attribute (see sysattr.h), which we do
* not support either.
*/
if (attno == InvalidAttrNumber)
ereport(ERROR,
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("expression indexes not supported"));
else
ereport(ERROR,
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot index system columns"));
}
}
hsinfo = RelationGetHypercoreInfo(relation);
/*
* In accordance with the heapam implementation, setup the scan
* descriptor. Do it here instead of letting the heapam handler do it
* since we want a hypercore scan descriptor that includes the state for
* both the non-compressed and compressed relations.
*
* Prepare for scan of the base relation. In a normal index build, we use
* SnapshotAny because we must retrieve all tuples and do our own time
* qual checks (because we have to index RECENTLY_DEAD tuples). In a
* concurrent build, or during bootstrap, we take a regular MVCC snapshot
* and index whatever's live according to that.
*
* Hypercore is not used during bootstrap so skip that check.
*/
OldestXmin = InvalidTransactionId;
/* okay to ignore lazy VACUUMs here */
if (!indexInfo->ii_Concurrent)
{
#if PG14_LT
OldestXmin = GetOldestXmin(relation, PROCARRAY_FLAGS_VACUUM);
#else
OldestXmin = GetOldestNonRemovableTransactionId(relation);
#endif
}
if (!scan)
{
/*
* Serial index build.
*
* Must begin our own heap scan in this case. We may also need to
* register a snapshot whose lifetime is under our direct control.
*/
if (!TransactionIdIsValid(OldestXmin))
{
snapshot = RegisterSnapshot(GetTransactionSnapshot());
need_unregister_snapshot = true;
}
else
snapshot = SnapshotAny;
scan = table_beginscan_strat(relation, /* relation */
snapshot, /* snapshot */
0, /* number of keys */
NULL, /* scan key */
true, /* buffer access strategy OK */
allow_sync); /* syncscan OK? */
}
else
{
/*
* Parallel index build.
*
* Parallel case never registers/unregisters own snapshot. Snapshot
* is taken from parallel heap scan, and is SnapshotAny or an MVCC
* snapshot, based on same criteria as serial case.
*/
Assert(allow_sync);
snapshot = scan->rs_snapshot;
}
HypercoreScanDescData *hscan = (HypercoreScanDescData *) scan;
EState *estate = CreateExecutorState();
Relation crel = hscan->compressed_rel;
IndexBuildCallbackState icstate = {
.callback = callback,
.orig_state = callback_state,
.rel = relation,
.estate = estate,
.econtext = GetPerTupleExprContext(estate),
.slot = MakeSingleTupleTableSlot(RelationGetDescr(relation), &TTSOpsVirtual),
.index_info = indexInfo,
.tuple_index = -1,
.ntuples = 0,
.decompression_mcxt = AllocSetContextCreate(CurrentMemoryContext,
"bulk decompression",
/* minContextSize = */ 0,
/* initBlockSize = */ 64 * 1024,
/* maxBlockSize = */ 64 * 1024),
/* Allocate arrow array for all attributes in the relation although
* index might need only a subset. This is to accommodate any extra
* predicate attributes (see below). */
.arrow_columns =
(ArrowArray **) palloc(sizeof(ArrowArray *) * RelationGetDescr(relation)->natts),
.is_segmentby_index = true,
};
/* IndexInfo copy to use when processing compressed relation. It will be
* modified slightly since the compressed rel has different attribute
* number mappings. It is also not possible to do all index processing on
* compressed tuples, e.g., predicate checks (see below). */
IndexInfo compress_iinfo = *indexInfo;
build_segment_and_orderby_bms(hsinfo, &icstate.segmentby_cols, &icstate.orderby_cols);
/* Translate index attribute numbers for the compressed relation */
for (int i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
{
const AttrNumber attno = indexInfo->ii_IndexAttrNumbers[i];
const AttrNumber cattno = hsinfo->columns[AttrNumberGetAttrOffset(attno)].cattnum;
compress_iinfo.ii_IndexAttrNumbers[i] = cattno;
icstate.arrow_columns[i] = NULL;
/* If the indexed column is not a segmentby column, then this is not a
* segmentby index */
if (!bms_is_member(attno, icstate.segmentby_cols))
icstate.is_segmentby_index = false;
}
Assert(indexInfo->ii_NumIndexAttrs == compress_iinfo.ii_NumIndexAttrs);
/* If there are predicates, it's a partial index build. It is necessary to
* find any columns referenced in the predicates that are not included in
* the index. We need to make sure that the heap AM will include these
* columns when building an index tuple so that we can later do predicate
* checks on them. */
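	/* As a hypothetical example, for a partial index such as
	 *
	 *   CREATE INDEX ON metrics (device_id) WHERE temp > 10;
	 *
	 * the "temp" column is appended after the index attributes below so that
	 * the predicate can be evaluated on the reconstructed row in
	 * hypercore_index_build_callback(). */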
if (indexInfo->ii_Predicate != NIL)
{
const List *vars = pull_vars_of_level((Node *) indexInfo->ii_Predicate, 0);
ListCell *lc;
/* Check if the predicate attribute is already part of the index or
* not. If not, append it to the end of the index attributes. */
foreach (lc, vars)
{
const Var *v = lfirst_node(Var, lc);
bool found = false;
for (int i = 0; i < compress_iinfo.ii_NumIndexAttrs; i++)
{
AttrNumber attno = compress_iinfo.ii_IndexAttrNumbers[i];
if (v->varattno == attno)
{
found = true;
break;
}
}
if (!found)
{
/* Need to translate attribute number for compressed rel */
const int offset = AttrNumberGetAttrOffset(v->varattno);
const AttrNumber cattno = hsinfo->columns[offset].cattnum;
const int num_index_attrs =
compress_iinfo.ii_NumIndexAttrs + icstate.num_non_index_predicates;
Ensure(compress_iinfo.ii_NumIndexAttrs < INDEX_MAX_KEYS,
"too many predicate attributes in index");
/* If the predicate column is not part of the index, we need
* to include it in the index info passed to heap AM when
* scanning the compressed relation. */
compress_iinfo.ii_IndexAttrNumbers[num_index_attrs] = cattno;
/* We also add the attribute mapping to the original index
* info, but we don't increase indexInfo->ii_NumIndexAttrs
* because that will change the index definition. Instead we
* track the number of additional predicate attributes in
* icstate.num_non_index_predicates. */
const int natts = indexInfo->ii_NumIndexAttrs + icstate.num_non_index_predicates;
indexInfo->ii_IndexAttrNumbers[natts] = v->varattno;
icstate.num_non_index_predicates++;
}
}
/* Can't evaluate predicates on compressed tuples. This is done in
* hypercore_index_build_callback instead. */
compress_iinfo.ii_Predicate = NULL;
/* Set final number of index attributes. Includes original number of
* attributes plus the new predicate attributes */
compress_iinfo.ii_NumIndexAttrs =
compress_iinfo.ii_NumIndexAttrs + icstate.num_non_index_predicates;
/* Set up predicate evaluation, including the slot for econtext */
icstate.econtext->ecxt_scantuple = icstate.slot;
icstate.predicate = ExecPrepareQual(indexInfo->ii_Predicate, icstate.estate);
}
/* Make sure the count column is included last in the index tuple
* generated by the heap AM machinery. It is needed to know the
* uncompressed tuple count in case of building an index on the segmentby
* column. */
Ensure(compress_iinfo.ii_NumIndexAttrs < INDEX_MAX_KEYS,
"too many predicate attributes in index");
compress_iinfo.ii_IndexAttrNumbers[compress_iinfo.ii_NumIndexAttrs++] = hsinfo->count_cattno;
/* Call heap's index_build_range_scan() on the compressed relation. The
* custom callback we give it will "unwrap" the compressed segments into
* individual tuples. Therefore, we cannot use the tuple count returned by
* the function since it only represents the number of compressed
* tuples. Instead, tuples are counted in the callback state. */
crel->rd_tableam->index_build_range_scan(crel,
indexRelation,
&compress_iinfo,
allow_sync,
anyvisible,
progress,
start_blockno,
numblocks,
hypercore_index_build_callback,
&icstate,
hscan->cscan_desc);
/* Heap's index_build_range_scan() ended the scan, so set the scan
* descriptor to NULL here in order to not try to close it again in our
* own table_endscan(). */
hscan->cscan_desc = NULL;
FreeExecutorState(icstate.estate);
ExecDropSingleTupleTableSlot(icstate.slot);
MemoryContextDelete(icstate.decompression_mcxt);
pfree((void *) icstate.arrow_columns);
bms_free(icstate.segmentby_cols);
bms_free(icstate.orderby_cols);
const TableAmRoutine *oldtam = switch_to_heapam(relation);
double ntuples = relation->rd_tableam->index_build_range_scan(relation,
indexRelation,
indexInfo,
allow_sync,
anyvisible,
progress,
start_blockno,
numblocks,
callback,
callback_state,
hscan->uscan_desc);
/* Heap's index_build_range_scan() should have ended the scan, so set the
* scan descriptor to NULL here in order to not try to close it again in
* our own table_endscan(). */
hscan->uscan_desc = NULL;
relation->rd_tableam = oldtam;
table_endscan(scan);
if (need_unregister_snapshot)
UnregisterSnapshot(snapshot);
return icstate.ntuples + ntuples;
}
/*
* Validate index.
*
* Used for concurrent index builds.
*/
static void
hypercore_index_validate_scan(Relation compressionRelation, Relation indexRelation,
IndexInfo *indexInfo, Snapshot snapshot, ValidateIndexState *state)
{
FEATURE_NOT_SUPPORTED;
}
/* ------------------------------------------------------------------------
* Miscellaneous callbacks for the Hypercore
* ------------------------------------------------------------------------
*/
static bool
hypercore_relation_needs_toast_table(Relation rel)
{
return false;
}
static Oid
hypercore_relation_toast_am(Relation rel)
{
FEATURE_NOT_SUPPORTED;
return InvalidOid;
}
/* ------------------------------------------------------------------------
* Planner related callbacks for the Hypercore
* ------------------------------------------------------------------------
*/
/*
* Return the relation size in bytes.
*
* The relation size in bytes is computed from the number of blocks in the
* relation multiplied by the block size.
*
* However, since the compression TAM is a "meta" relation over separate
* non-compressed and compressed heaps, the total size is actually the sum of
* the number of blocks in both heaps.
*
* To get the true size of the TAM (non-compressed) relation, it is possible
* to use switch_to_heapam() and bypass the TAM callbacks.
*/
static uint64
hypercore_relation_size(Relation rel, ForkNumber forkNumber)
{
uint64 ubytes = table_block_relation_size(rel, forkNumber);
int32 hyper_id = ts_chunk_get_hypertable_id_by_reloid(rel->rd_id);
if (hyper_id == INVALID_HYPERTABLE_ID)
return ubytes;
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
/* For ANALYZE, need to return sum for both relations. */
Relation crel = try_relation_open(hsinfo->compressed_relid, AccessShareLock);
if (crel == NULL)
return ubytes;
uint64 cbytes = table_block_relation_size(crel, forkNumber);
relation_close(crel, NoLock);
return ubytes + cbytes;
}
#define HEAP_OVERHEAD_BYTES_PER_TUPLE (MAXALIGN(SizeofHeapTupleHeader) + sizeof(ItemIdData))
#define HEAP_USABLE_BYTES_PER_PAGE (BLCKSZ - SizeOfPageHeaderData)
/*
* Calculate fraction of visible pages.
*
* Same calculation as in PG's table_block_relation_estimate_size().
*/
static double
calc_allvisfrac(BlockNumber curpages, BlockNumber relallvisible)
{
double allvisfrac;
if (relallvisible == 0 || curpages <= 0)
allvisfrac = 0;
else if ((double) relallvisible >= curpages)
allvisfrac = 1;
else
allvisfrac = (double) relallvisible / curpages;
return allvisfrac;
}
/*
* Get the number of blocks on disk of a relation.
*
* Bypasses hypercore_relation_size()/RelationGetNumberOfBlocks(), which
* return the aggregate size (compressed + non-compressed).
*/
static BlockNumber
relation_number_of_disk_blocks(Relation rel)
{
uint64 szbytes = table_block_relation_size(rel, MAIN_FORKNUM);
return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
}
/*
* Estimate the size of a Hypercore relation.
*
* For "heap", PostgreSQL estimates the number of tuples based on the
* difference between the as-of-this-instant number of blocks on disk and the
* current pages in relstats (relpages). In other words, if there are more
* blocks on disk than pages according to relstats, the relation grew and the
* number of tuples can be extrapolated from the previous "tuple density" in
* relstats (reltuples / relpages).
*
* However, this extrapolation doesn't work well for a Hypercore since there
* are situations where a relation can shrink in terms of pages, but grow in
* terms of data. For example, simply compressing a hypercore (with no
 * previous compressed data) will shrink the number of blocks significantly
 * while the number of tuples stays the same. The standard PostgreSQL
* estimate will believe that a lot of data was deleted, thus vastly
* underestimating the number of tuples. Conversely, decompression will lead
* to overestimating since the number of pages increase drastically.
*
* Note that a hypercore stores the aggregate stats (compressed +
* non-compressed) in the non-compressed relation. So, reltuples is the actual
* number of tuples as of the last ANALYZE (or similar operation that updates
 * relstats). Therefore, when estimating tuples using the normal PG function,
* compute an "average" tuple that represents something in-between a
* non-compressed tuple and a compressed one, based on the fraction of
* compressed vs non-compressed pages. Once there's an estimation of the
* number of "average" tuples, multiply the fraction of compressed tuples with
* the target size of a compressed batch to get the final tuple count.
*
* An alternative approach could be to calculate each relation's estimate
* separately and then add the results. However, that requires having stats
* for each separate relation, but, currently, there are often no stats for
* the compressed relation (this could be fixed, though). However, even if
* there were stats for the compressed relation, those stats would only have
* an accurate compressed tuple count, and the actual number of tuples would
* have to be estimated from that.
*
* Another option is to store custom stats outside relstats where it is
* possible to maintain accurate tuple counts for each relation.
*
* However, until there's a better way to figure out whether data was actually
* added, removed, or stayed the same, it is better to just return the current
* stats, if they exist. Ideally, a hypercore should not be mutated often and
* be mostly (if not completely) compressed. When compressing or
* decompressing, relstats should also be updated. Therefore, the relstats
* should be quite accurate.
*/
static void
hypercore_relation_estimate_size(Relation rel, int32 *attr_widths, BlockNumber *pages,
double *tuples, double *allvisfrac)
{
/*
* We can be called from ProcessUtility with a hypertable because we need
* to process all ALTER TABLE commands in the list to set options
* correctly for the hypertable.
*
* If we are called on a hypertable, we just say that the hypertable does
* not have any pages or tuples.
*/
if (ts_is_hypertable(rel->rd_id))
{
*pages = 0;
*allvisfrac = 0;
*tuples = 0;
return;
}
const HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
const Form_pg_class form = RelationGetForm(rel);
Size overhead_bytes_per_tuples = HEAP_OVERHEAD_BYTES_PER_TUPLE;
Relation crel = table_open(hsinfo->compressed_relid, AccessShareLock);
BlockNumber nblocks = relation_number_of_disk_blocks(rel);
BlockNumber cnblocks = relation_number_of_disk_blocks(crel);
table_close(crel, AccessShareLock);
if (nblocks == 0 && cnblocks == 0)
{
*pages = 0;
*allvisfrac = 0;
*tuples = 0;
return;
}
double frac_noncompressed = 0;
if (form->reltuples >= 0)
{
/*
* There's stats, use it.
*/
*pages = form->relpages;
*tuples = form->reltuples;
*allvisfrac = calc_allvisfrac(nblocks + cnblocks, form->relallvisible);
TS_DEBUG_LOG("(stats) pages %u tuples %lf allvisfrac %f", *pages, *tuples, *allvisfrac);
return;
}
else if (nblocks == 0 && cnblocks > 0)
frac_noncompressed = 0;
else if (nblocks > 0 && cnblocks == 0)
frac_noncompressed = 1;
else
{
Assert(cnblocks != 0);
/* Try to figure out the fraction of data that is compressed vs
* non-compressed. */
frac_noncompressed = ((double) nblocks / (cnblocks * TARGET_COMPRESSED_BATCH_SIZE));
}
/* The overhead will be 0 for mostly compressed data, which is fine
* because compared to non-compressed data the overhead is negligible
* anyway. */
overhead_bytes_per_tuples = rint(HEAP_OVERHEAD_BYTES_PER_TUPLE * frac_noncompressed);
/*
* Compute an estimate based on the "aggregate" relation.
*
* Note that this function gets the number of blocks of the relation in
* order to extrapolate a new tuple count based on the "tuple
* density". This works for the hypercore relation because
* RelationGetNumberOfBlocks() returns the aggregate block count of both
* relations. Also note that using the attr_widths for the non-compressed
* rel won't be very representative for mostly compressed data. Should
* probably compute new "average" attr_widths based on the fraction. But
* that is left for the future.
*/
table_block_relation_estimate_size(rel,
attr_widths,
pages,
tuples,
allvisfrac,
overhead_bytes_per_tuples,
HEAP_USABLE_BYTES_PER_PAGE);
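	/* A sketch of the adjustment below, assuming TARGET_COMPRESSED_BATCH_SIZE
	 * is 1000: with frac_noncompressed = 0.5 and an "average" estimate of
	 * 2000 tuples, the final estimate becomes 2000 * 0.5 + 0.5 * 1000 = 1500. */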
*tuples =
(*tuples * frac_noncompressed) + ((1 - frac_noncompressed) * TARGET_COMPRESSED_BATCH_SIZE);
TS_DEBUG_LOG("(estimated) pages %u tuples %lf allvisfrac %f frac_noncompressed %lf",
*pages,
*tuples,
*allvisfrac,
frac_noncompressed);
}
static void
hypercore_fetch_toast_slice(Relation toastrel, Oid valueid, int32 attrsize, int32 sliceoffset,
int32 slicelength, struct varlena *result)
{
FEATURE_NOT_SUPPORTED;
}
/* ------------------------------------------------------------------------
* Executor related callbacks for the Hypercore
* ------------------------------------------------------------------------
*/
static bool
hypercore_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
{
FEATURE_NOT_SUPPORTED;
return false;
}
static bool
hypercore_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
TupleTableSlot *slot)
{
FEATURE_NOT_SUPPORTED;
return false;
}
/*
* Convert a table to Hypercore.
*
* Need to setup the conversion state used to compress the data.
*/
static void
convert_to_hypercore(Oid relid)
{
Relation relation = table_open(relid, AccessShareLock);
bool compress_chunk_created;
HypercoreInfo *hsinfo = lazy_build_hypercore_info_cache(relation,
false /* create constraints */,
&compress_chunk_created);
if (!compress_chunk_created)
{
/* A compressed relation already exists, so converting from legacy
* compression. It is only necessary to create the proxy vacuum
* index. */
create_proxy_vacuum_index(relation, hsinfo->compressed_relid);
table_close(relation, AccessShareLock);
return;
}
MemoryContext oldcxt = MemoryContextSwitchTo(CacheMemoryContext);
ConversionState *state = palloc0(sizeof(ConversionState));
CompressionSettings *settings = ts_compression_settings_get(hsinfo->compressed_relid);
state->before_size = ts_relation_size_impl(relid);
state->tuplesortstate = compression_create_tuplesort_state(settings, relation);
Assert(state->tuplesortstate);
state->relid = relid;
conversionstate = state;
MemoryContextSwitchTo(oldcxt);
table_close(relation, AccessShareLock);
}
/*
* List of relation IDs used to clean up the compressed relation when
* converting from Hypercore to another TAM (typically heap).
*/
static List *cleanup_relids = NIL;
static void
cleanup_compression_relations(void)
{
if (cleanup_relids != NIL)
{
ListCell *lc;
foreach (lc, cleanup_relids)
{
Oid relid = lfirst_oid(lc);
Chunk *chunk = ts_chunk_get_by_relid(relid, true);
Chunk *compress_chunk = ts_chunk_get_by_id(chunk->fd.compressed_chunk_id, false);
ts_chunk_clear_compressed_chunk(chunk);
if (compress_chunk)
ts_chunk_drop(compress_chunk, DROP_RESTRICT, -1);
}
list_free(cleanup_relids);
cleanup_relids = NIL;
}
}
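/*
 * Transaction callback for Hypercore.
 *
 * At pre-commit, chunks recorded in partially_compressed_relids are marked as
 * partially compressed. The callback also resets conversion and cleanup state
 * at the end of the transaction, including on abort.
 */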
void
hypercore_xact_event(XactEvent event, void *arg)
{
switch (event)
{
case XACT_EVENT_PRE_COMMIT:
{
ListCell *lc;
/* Check for relations that might now be partially compressed and
* update their status */
foreach (lc, partially_compressed_relids)
{
Oid relid = lfirst_oid(lc);
Relation rel = table_open(relid, AccessShareLock);
/* Calling RelationGetHypercoreInfo() here will create the
* compressed relation if not already created. */
HypercoreInfo *hsinfo = RelationGetHypercoreInfo(rel);
Ensure(OidIsValid(hsinfo->compressed_relid),
"hypercore \"%s\" has no compressed data relation",
get_rel_name(relid));
Chunk *chunk = ts_chunk_get_by_relid(relid, true);
ts_chunk_set_partial(chunk);
table_close(rel, NoLock);
}
break;
}
default:
break;
}
if (partially_compressed_relids != NIL)
{
list_free(partially_compressed_relids);
partially_compressed_relids = NIL;
}
/*
* Cleanup in case of aborted transaction. Need not explicitly check for
* abort since the states should only exist if it is an abort.
*/
if (cleanup_relids != NIL)
{
list_free(cleanup_relids);
cleanup_relids = NIL;
}
if (conversionstate)
{
if (conversionstate->tuplesortstate)
tuplesort_end(conversionstate->tuplesortstate);
pfree(conversionstate);
conversionstate = NULL;
}
}
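/*
 * Finish the conversion to Hypercore started in convert_to_hypercore().
 *
 * Sort the tuples buffered during the conversion, compress them into the
 * compressed chunk, create constraints, triggers and the proxy vacuum index,
 * and record compression size statistics.
 */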
static void
convert_to_hypercore_finish(Oid relid)
{
if (!conversionstate)
{
/* Without a tuple sort state, conversion happens from legacy
* compression where a compressed relation (chunk) already
* exists. There's nothing more to do. */
return;
}
Chunk *chunk = ts_chunk_get_by_relid(conversionstate->relid, true);
Relation relation = table_open(conversionstate->relid, AccessShareLock);
TupleDesc tupdesc = RelationGetDescr(relation);
if (!chunk)
elog(ERROR, "could not find uncompressed chunk for relation %s", get_rel_name(relid));
Hypertable *ht = ts_hypertable_get_by_id(chunk->fd.hypertable_id);
Hypertable *ht_compressed = ts_hypertable_get_by_id(ht->fd.compressed_hypertable_id);
tuplesort_performsort(conversionstate->tuplesortstate);
/*
	 * The compressed chunk should have been created in
	 * convert_to_hypercore() if it didn't already exist.
*/
Chunk *c_chunk = ts_chunk_get_by_id(chunk->fd.compressed_chunk_id, true);
Relation compressed_rel = table_open(c_chunk->table_id, RowExclusiveLock);
CompressionSettings *settings = ts_compression_settings_get(RelationGetRelid(compressed_rel));
RowCompressor row_compressor;
row_compressor_init(settings,
&row_compressor,
relation,
compressed_rel,
RelationGetDescr(compressed_rel)->natts,
true /*need_bistate*/,
HEAP_INSERT_FROZEN);
row_compressor_append_sorted_rows(&row_compressor,
conversionstate->tuplesortstate,
tupdesc,
compressed_rel);
row_compressor_close(&row_compressor);
tuplesort_end(conversionstate->tuplesortstate);
conversionstate->tuplesortstate = NULL;
/* Copy chunk constraints (including fkey) to compressed chunk.
* Do this after compressing the chunk to avoid holding strong, unnecessary locks on the
* referenced table during compression.
*/
ts_chunk_constraints_create(ht_compressed, c_chunk);
ts_trigger_create_all_on_chunk(c_chunk);
create_proxy_vacuum_index(relation, RelationGetRelid(compressed_rel));
table_close(relation, NoLock);
table_close(compressed_rel, NoLock);
/* Update compression statistics */
create_compression_relation_size_stats(chunk->fd.id,
chunk->table_id,
c_chunk->fd.id,
c_chunk->table_id,
&conversionstate->before_size,
row_compressor.rowcnt_pre_compression,
row_compressor.num_compressed_rows,
row_compressor.num_compressed_rows);
conversionstate = NULL;
}
/*
* Convert the chunk away from Hypercore to another table access method.
* When this happens it is necessary to cleanup metadata.
*/
static void
convert_from_hypercore(Oid relid)
{
int32 chunk_id = get_chunk_id_from_relid(relid);
ts_compression_chunk_size_delete(chunk_id);
/* Need to truncate the compressed relation after converting from Hypercore */
MemoryContext oldmcxt = MemoryContextSwitchTo(CurTransactionContext);
cleanup_relids = lappend_oid(cleanup_relids, relid);
MemoryContextSwitchTo(oldmcxt);
}
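/*
 * Called at the start of ALTER TABLE ... SET ACCESS METHOD to prepare the
 * conversion either to or away from Hypercore.
 */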
void
hypercore_alter_access_method_begin(Oid relid, bool to_other_am)
{
if (to_other_am)
convert_from_hypercore(relid);
else
convert_to_hypercore(relid);
}
/*
* Called at the end of converting a chunk to a table access method.
*/
void
hypercore_alter_access_method_finish(Oid relid, bool to_other_am)
{
if (to_other_am)
cleanup_compression_relations();
/* Finishing the conversion to Hypercore is handled in the
* finish_bulk_insert callback */
}
/*
* Convert any index-only scans on segmentby indexes to regular index scans
* since index-only scans are not supported on segmentby indexes.
*
* Indexes on segmentby columns are optimized to store only one index
* reference per segment instead of one per value in each segment. This relies
* on "unwrapping" the segment during scanning. However, with an
 * IndexOnlyScan, Hypercore's index_fetch_tuple() is not called to fetch
* the heap tuple (since the scan returns directly from the index), and there
* is no opportunity to unwrap the tuple. Therefore, turn IndexOnlyScans into
* regular IndexScans on segmentby indexes.
*/
static void
convert_index_only_scans(const HypercoreInfo *hsinfo, List *pathlist)
{
ListCell *lc;
foreach (lc, pathlist)
{
Path *path = lfirst(lc);
bool is_segmentby_index = true;
if (path->pathtype == T_IndexOnlyScan)
{
IndexPath *ipath = (IndexPath *) path;
Relation irel = relation_open(ipath->indexinfo->indexoid, AccessShareLock);
const int2vector *indkeys = &irel->rd_index->indkey;
for (int i = 0; i < indkeys->dim1; i++)
{
const AttrNumber attno = indkeys->values[i];
if (!hsinfo->columns[AttrNumberGetAttrOffset(attno)].is_segmentby)
{
is_segmentby_index = false;
break;
}
}
/* Convert this IndexOnlyScan to a regular IndexScan since
* segmentby indexes do not support IndexOnlyScans */
if (is_segmentby_index)
path->pathtype = T_IndexScan;
relation_close(irel, AccessShareLock);
}
}
}
void
hypercore_set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Hypertable *ht)
{
const RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
Relation relation = table_open(rte->relid, AccessShareLock);
const HypercoreInfo *hsinfo = RelationGetHypercoreInfo(relation);
convert_index_only_scans(hsinfo, rel->pathlist);
convert_index_only_scans(hsinfo, rel->partial_pathlist);
table_close(relation, AccessShareLock);
}
/* ------------------------------------------------------------------------
* Definition of the Hypercore table access method.
* ------------------------------------------------------------------------
*/
static const TableAmRoutine hypercore_methods = {
.type = T_TableAmRoutine,
.slot_callbacks = hypercore_slot_callbacks,
.scan_begin = hypercore_beginscan,
.scan_end = hypercore_endscan,
.scan_rescan = hypercore_rescan,
.scan_getnextslot = hypercore_getnextslot,
#if PG14_GE
/*-----------
* Optional functions to provide scanning for ranges of ItemPointers.
* Implementations must either provide both of these functions, or neither
* of them.
*/
.scan_set_tidrange = NULL,
.scan_getnextslot_tidrange = NULL,
#endif
/* ------------------------------------------------------------------------
* Parallel table scan related functions.
* ------------------------------------------------------------------------
*/
.parallelscan_estimate = hypercore_parallelscan_estimate,
.parallelscan_initialize = hypercore_parallelscan_initialize,
.parallelscan_reinitialize = hypercore_parallelscan_reinitialize,
/* ------------------------------------------------------------------------
* Index Scan Callbacks
* ------------------------------------------------------------------------
*/
.index_fetch_begin = hypercore_index_fetch_begin,
.index_fetch_reset = hypercore_index_fetch_reset,
.index_fetch_end = hypercore_index_fetch_end,
.index_fetch_tuple = hypercore_index_fetch_tuple,
/* ------------------------------------------------------------------------
* Manipulations of physical tuples.
* ------------------------------------------------------------------------
*/
.tuple_insert = hypercore_tuple_insert,
.tuple_insert_speculative = hypercore_tuple_insert_speculative,
.tuple_complete_speculative = hypercore_tuple_complete_speculative,
.multi_insert = hypercore_multi_insert,
.tuple_delete = hypercore_tuple_delete,
.tuple_update = hypercore_tuple_update,
.tuple_lock = hypercore_tuple_lock,
.finish_bulk_insert = hypercore_finish_bulk_insert,
/* ------------------------------------------------------------------------
* Callbacks for non-modifying operations on individual tuples
* ------------------------------------------------------------------------
*/
.tuple_fetch_row_version = hypercore_fetch_row_version,
.tuple_get_latest_tid = hypercore_get_latest_tid,
.tuple_tid_valid = hypercore_tuple_tid_valid,
.tuple_satisfies_snapshot = hypercore_tuple_satisfies_snapshot,
#if PG14_GE
.index_delete_tuples = hypercore_index_delete_tuples,
#endif
/* ------------------------------------------------------------------------
* DDL related functionality.
* ------------------------------------------------------------------------
*/
#if PG16_GE
.relation_set_new_filelocator = hypercore_relation_set_new_filelocator,
#else
.relation_set_new_filenode = hypercore_relation_set_new_filelocator,
#endif
.relation_nontransactional_truncate = hypercore_relation_nontransactional_truncate,
.relation_copy_data = hypercore_relation_copy_data,
.relation_copy_for_cluster = hypercore_relation_copy_for_cluster,
.relation_vacuum = hypercore_vacuum_rel,
.scan_analyze_next_block = hypercore_scan_analyze_next_block,
.scan_analyze_next_tuple = hypercore_scan_analyze_next_tuple,
.index_build_range_scan = hypercore_index_build_range_scan,
.index_validate_scan = hypercore_index_validate_scan,
/* ------------------------------------------------------------------------
* Miscellaneous functions.
* ------------------------------------------------------------------------
*/
.relation_size = hypercore_relation_size,
.relation_needs_toast_table = hypercore_relation_needs_toast_table,
.relation_toast_am = hypercore_relation_toast_am,
.relation_fetch_toast_slice = hypercore_fetch_toast_slice,
/* ------------------------------------------------------------------------
* Planner related functions.
* ------------------------------------------------------------------------
*/
.relation_estimate_size = hypercore_relation_estimate_size,
/* ------------------------------------------------------------------------
* Executor related functions.
* ------------------------------------------------------------------------
*/
/* We do not support bitmap heap scan at this point. */
.scan_bitmap_next_block = NULL,
.scan_bitmap_next_tuple = NULL,
.scan_sample_next_block = hypercore_scan_sample_next_block,
.scan_sample_next_tuple = hypercore_scan_sample_next_tuple,
};
const TableAmRoutine *
hypercore_routine(void)
{
return &hypercore_methods;
}
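/*
 * SQL-callable handler function that returns the Hypercore table access
 * method routine.
 */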
Datum
hypercore_handler(PG_FUNCTION_ARGS)
{
PG_RETURN_POINTER(&hypercore_methods);
}