2017-01-18 21:15:14 +01:00

257 lines
13 KiB
SQL

-- This file contains table definitions for various abstractions and data
-- structures for representing hypertables and lower level concepts.
-- Data node information for the cluster. database_name is the postgres database
-- located on at hostname. server_name is used to identify the connection.
-- schema_name is the name of the schema used to represent the node on the meta
-- node (it stores remote wrappers to update meta tables on data nodes).
CREATE TABLE IF NOT EXISTS node (
database_name NAME NOT NULL PRIMARY KEY,
schema_name NAME NOT NULL UNIQUE, --public schema of remote
server_name NAME NOT NULL UNIQUE,
hostname TEXT NOT NULL,
active BOOLEAN NOT NULL DEFAULT TRUE,
id SERIAL NOT NULL UNIQUE -- id for node. used in naming
);
-- Singleton (i.e. should only contain one row) holding info about meta db.
CREATE TABLE IF NOT EXISTS meta (
database_name NAME NOT NULL PRIMARY KEY,
hostname TEXT NOT NULL,
server_name NAME NOT NULL
);
CREATE UNIQUE INDEX there_can_be_only_one_meta
ON meta ((1));
-- Users should exist an all nodes+meta in the cluster.
CREATE TABLE IF NOT EXISTS cluster_user (
username TEXT NOT NULL PRIMARY KEY,
password TEXT NULL --not any more of a security hole than usual since stored in pg_user_mapping anyway
);
-- The hypertable is an abstraction that represents a replicated table that is
-- partitioned on 2 dimensions: time and another (user-)chosen one.
-- This abstraction also tracks the distinct value set of any columns marked as `distinct`.
--
-- Each row, representing a hypertable, creates 3 tables:
-- 1) main table - an alias to the 0'th replica for now. Represents the
-- hypertable to the user for insertion and modification.
-- 2) root table - ancesstor of all the data tables (across replicas).
-- Should not be queryable for data (TODO).
-- 3) distinct root table - ancestor of all distinct tables (across replicas).
-- Should not be queryable for data (TODO).
--
-- Additionally, a schema for associated tables (partitioned, replicated data
-- tables) is created.
--
-- The name and type of the time field (used to partition on time) are defined
-- in `time_field_name` and `time_field_type`.
CREATE TABLE IF NOT EXISTS hypertable (
name NAME NOT NULL PRIMARY KEY CHECK (name NOT LIKE '\_%'),
main_schema_name NAME NOT NULL,
main_table_name NAME NOT NULL,
associated_schema_name NAME NOT NULL,
associated_table_prefix NAME NOT NULL,
root_schema_name NAME NOT NULL,
root_table_name NAME NOT NULL,
distinct_schema_name NAME NOT NULL,
distinct_table_name NAME NOT NULL,
replication_factor SMALLINT NOT NULL CHECK (replication_factor > 0),
placement chunk_placement_type NOT NULL,
time_field_name NAME NOT NULL,
time_field_type REGTYPE NOT NULL,
created_on NAME NOT NULL REFERENCES node(database_name),
chunk_size_bytes BIGINT NOT NULL CHECK (chunk_size_bytes > 0),
UNIQUE (main_schema_name, main_table_name),
UNIQUE (associated_schema_name, associated_table_prefix),
UNIQUE (root_schema_name, root_table_name)
);
-- deleted_hypertable is used to avoid deadlocks when doing multinode drops.
CREATE TABLE IF NOT EXISTS deleted_hypertable (
LIKE hypertable,
deleted_on NAME
);
-- hypertable_replica contains information on how a hypertable's data replicas
-- are stored. A replica of the data is across all partitions and time.
--
-- Each row identifies 2 tables for each hypertable + replica_id combination:
-- 1) data replica table (schema_name.table_name) -
-- All the data for a hypertable.
-- Parent: hypertable's `root table`
-- Children: hypertable's `partition_replica` tables
-- 2) distinct replica tables (distinct_schema_name.distinct_table_name) -
-- Distinct values in a hypertable.
-- Parent: hypertable's `distinct root table`
-- Children: created by `distinct_replica_node` table
CREATE TABLE IF NOT EXISTS hypertable_replica (
hypertable_name NAME NOT NULL REFERENCES hypertable (name) ON DELETE CASCADE,
replica_id SMALLINT NOT NULL CHECK (replica_id >= 0),
schema_name NAME NOT NULL,
table_name NAME NOT NULL,
distinct_schema_name NAME NOT NULL,
distinct_table_name NAME NOT NULL,
PRIMARY KEY (hypertable_name, replica_id),
UNIQUE (schema_name, table_name)
);
-- Mapping that shows which replica is pointed to by the main table on
-- each node. The translation from main table to replica should happen
-- in C tranformation right after the parsing step.
-- (Postgres RULES cannot be used, unfortunately)
CREATE TABLE IF NOT EXISTS default_replica_node (
database_name NAME NOT NULL REFERENCES node (database_name),
hypertable_name NAME NOT NULL REFERENCES hypertable (name) ON DELETE CASCADE,
replica_id SMALLINT NOT NULL CHECK (replica_id >= 0),
PRIMARY KEY (database_name, hypertable_name),
FOREIGN KEY (hypertable_name, replica_id) REFERENCES hypertable_replica (hypertable_name, replica_id)
);
--there should be one distinct_replica_node for each node with a chunk from that replica
--so there can be multiple rows for one hypertable-replica on different nodes.
--that way writes are local. Optimized reads are also local for many queries.
--But, some read queries are cross-node.
--Each row creates a table.
-- Parent table: hypertable_replica.distinct_table
-- No children, created table contains data.
CREATE TABLE IF NOT EXISTS distinct_replica_node (
hypertable_name NAME NOT NULL,
replica_id SMALLINT NOT NULL,
database_name NAME NOT NULL REFERENCES node (database_name),
schema_name NAME NOT NULL,
table_name NAME NOT NULL,
PRIMARY KEY (hypertable_name, replica_id, database_name),
UNIQUE (schema_name, table_name),
FOREIGN KEY (hypertable_name, replica_id) REFERENCES hypertable_replica (hypertable_name, replica_id) ON DELETE CASCADE
);
-- A partition_epoch represents a different partitioning of the data.
-- It has a start and end time (data time). Data needs to be placed in the correct epoch by time.
-- Partitionings are defined by a function, field, and modulo:
-- 1) partitioning_func - Takes the partitioning_field and returns a number
-- which is modulo'd to place the data correctly
-- 2) partitioning_mod - Number used in modulo operation
-- 3) partitioning_field - Field in data to partition by (input to partitioning_func)
--
-- Changing a data's partitioning, and thus creating a new epoch, should be done
-- INFREQUENTLY as it's expensive operation.
CREATE TABLE IF NOT EXISTS partition_epoch (
id SERIAL NOT NULL PRIMARY KEY,
hypertable_name NAME NOT NULL REFERENCES hypertable (name) ON DELETE CASCADE,
start_time BIGINT NULL CHECK (start_time > 0),
end_time BIGINT NULL CHECK (end_time > 0),
partitioning_func NAME NOT NULL, --function name of a function of the form func(data_value, partitioning_mod) -> [0, partitioning_mod)
partitioning_mod INT NOT NULL CHECK (partitioning_mod < 65536),
partitioning_field NAME NOT NULL,
UNIQUE (hypertable_name, start_time),
UNIQUE (hypertable_name, end_time),
CHECK (start_time < end_time)
);
-- A partition defines a partition witin a partition_epoch.
-- For any partition the keyspace is defined as [keyspace_start, keyspace_end].
-- For any epoch, there must be a partition that covers every element in the
-- keyspace, i.e. from [0, partition_epoch.partitioning_mod].
CREATE TABLE IF NOT EXISTS partition (
id SERIAL NOT NULL PRIMARY KEY,
epoch_id INT NOT NULL REFERENCES partition_epoch (id) ON DELETE CASCADE,
keyspace_start SMALLINT NOT NULL CHECK (keyspace_start >= 0), --start inclusive
keyspace_end SMALLINT NOT NULL CHECK (keyspace_end > 0), --end inclusive; compatible with between operator
UNIQUE (epoch_id, keyspace_start),
CHECK (keyspace_end > keyspace_start)
);
--Represents a replica for a partition.
--Each row creates a table:
-- Parent: "hypertable_replica.schema_name"."hypertable_replica.table_name"
-- Children: "chunk_replica_node.schema_name"."chunk_replica_node.table_name"
--TODO: trigger to verify partition_epoch hypertable name matches this hypertable_name
CREATE TABLE IF NOT EXISTS partition_replica (
id SERIAL NOT NULL PRIMARY KEY,
partition_id INT NOT NULL REFERENCES partition (id) ON DELETE CASCADE,
hypertable_name NAME NOT NULL,
replica_id SMALLINT NOT NULL,
schema_name NAME NOT NULL,
table_name NAME NOT NULL,
UNIQUE (schema_name, table_name),
UNIQUE (partition_id, replica_id),
FOREIGN KEY (hypertable_name, replica_id) REFERENCES hypertable_replica (hypertable_name, replica_id) ON DELETE CASCADE
);
-- Represent a (replicated) chunk of data, which is data in a hypertable that is
-- both partitioned by both the partition_field and time.
--
-- For each partition, there can be 0 or more chunks, which are replicated.
-- At most two chunks per partition are "open-ended", i.e. having a NULL
-- start_time or a NULL end_time. A NULL start_time means the chunk has
-- data from the beginning of time until end_time. A NULL end_time means the
-- chunk has data from start_time until the end of time. Only when there is
-- one chunk for a partition can it be open-ended on BOTH start_time and end_time.
--
-- TODO(erik) - Describe conditions of closure.
CREATE TABLE IF NOT EXISTS chunk (
id SERIAL NOT NULL PRIMARY KEY,
partition_id INT NOT NULL REFERENCES partition (id) ON DELETE CASCADE,
start_time BIGINT NULL CHECK (start_time >= 0),
end_time BIGINT NULL CHECK (end_time >= 0),
UNIQUE (partition_id, start_time),
UNIQUE (partition_id, end_time),
CHECK (start_time <= end_time)
);
-- A mapping between chunks, partition_replica, and nodes representing where
-- actual data is stored. That is, a chunk_replica_node is a particular
-- replication instance of a chunk.
--
-- Each row represents a table:
-- Parent table: "partition_replica.schema_name"."partition_replica.table_name"
CREATE TABLE IF NOT EXISTS chunk_replica_node (
chunk_id INT NOT NULL REFERENCES chunk (id) ON DELETE CASCADE,
partition_replica_id INT NOT NULL REFERENCES partition_replica (id) ON DELETE CASCADE,
database_name NAME NOT NULL REFERENCES node (database_name),
schema_name NAME NOT NULL,
table_name NAME NOT NULL,
PRIMARY KEY (chunk_id, partition_replica_id), --a single chunk, replica tuple
UNIQUE (chunk_id, database_name), --no two chunk replicas on same node
UNIQUE (schema_name, table_name)
);
-- Represents a hypertable field.
CREATE TABLE IF NOT EXISTS field (
hypertable_name NAME NOT NULL REFERENCES hypertable (name) ON DELETE CASCADE,
name NAME NOT NULL,
attnum INT2 NOT NULL, --MUST match pg_attribute.attnum on main table. SHOULD match on root/hierarchy table as well.
data_type REGTYPE NOT NULL,
default_value TEXT NULL,
is_distinct BOOLEAN NOT NULL DEFAULT FALSE,
not_null BOOLEAN NOT NULL,
created_on NAME NOT NULL REFERENCES node(database_name),
modified_on NAME NOT NULL REFERENCES node(database_name),
PRIMARY KEY (hypertable_name, name),
UNIQUE(hypertable_name, attnum)
);
-- TODO(mat) - Description?
CREATE TABLE IF NOT EXISTS deleted_field (
LIKE field,
deleted_on NAME
);
CREATE TABLE IF NOT EXISTS hypertable_index (
hypertable_name NAME NOT NULL REFERENCES hypertable (name) ON DELETE CASCADE,
main_schema_name NAME NOT NULL, --schema name of main table (needed for a uniqueness constraint)
main_index_name NAME NOT NULL, --index name on main table
definition TEXT NOT NULL, --def with /*INDEX_NAME*/ and /*TABLE_NAME*/ placeholders
created_on NAME NOT NULL REFERENCES node(database_name),
PRIMARY KEY (hypertable_name, main_index_name),
UNIQUE(main_schema_name, main_index_name) --globally unique since index names globally unique
);
-- TODO(mat) - Description?
CREATE TABLE IF NOT EXISTS deleted_hypertable_index (
LIKE hypertable_index,
deleted_on NAME
);