211 lines
10 KiB
SQL

--data nodes
CREATE TABLE IF NOT EXISTS node (
    -- Database name of the data node; referenced as a foreign key by other
    -- catalog tables in this file.
    database_name NAME    NOT NULL PRIMARY KEY,
    -- Public schema of the remote.
    schema_name   NAME    NOT NULL UNIQUE,
    server_name   NAME    NOT NULL UNIQUE,
    hostname      TEXT    NOT NULL,
    active        BOOLEAN NOT NULL DEFAULT TRUE,
    -- Id for the node; used in naming.
    id            SERIAL  NOT NULL UNIQUE
);
--singleton holding info about meta db.
CREATE TABLE IF NOT EXISTS meta (
    -- Database name of the meta database.
    database_name NAME NOT NULL PRIMARY KEY,
    hostname      TEXT NOT NULL,
    server_name   NAME NOT NULL
);
-- Singleton guard for meta: the index key is the constant expression (1), so
-- every row has the same key and a second row violates uniqueness.
-- IF NOT EXISTS (PostgreSQL 9.5+) makes this statement safe to re-run, in line
-- with the CREATE TABLE IF NOT EXISTS statements in this file.
CREATE UNIQUE INDEX IF NOT EXISTS there_can_be_only_one_meta
    ON meta ((1));
--these users should exist on all nodes+meta in the cluster.
CREATE TABLE IF NOT EXISTS cluster_user (
    username TEXT NOT NULL PRIMARY KEY,
    -- Optional. Not any more of a security hole than usual since it is stored
    -- in pg_user_mapping anyway.
    password TEXT NULL
);
--The hypertable is an abstraction that represents a replicated table that is partitioned on 2 dimensions.
--One of the dimensions is time, the other is arbitrary.
--This abstraction also tracks the distinct value set of any columns marked as `distinct`.
--Each row creates 3 tables:
-- i) main table is just an alias to the 0'th replica for now. Represents the hypertable to the user.
-- ii) root table is the ancestor of all the data tables (across replicas). Should not be queryable for data (TODO).
-- iii) distinct root table is the ancestor of all distinct tables (across replicas). Should not be queryable for data (TODO).
CREATE TABLE IF NOT EXISTS hypertable (
    -- Hypertable identifier. The LIKE pattern escapes the underscore, so the
    -- check rejects names that begin with '_' (assuming the default
    -- standard_conforming_strings = on).
    name                    NAME                 NOT NULL PRIMARY KEY CHECK (name NOT LIKE '\_%'),
    -- Location of the main table.
    main_schema_name        NAME                 NOT NULL,
    main_table_name         NAME                 NOT NULL,
    associated_schema_name  NAME                 NOT NULL,
    associated_table_prefix NAME                 NOT NULL,
    -- Location of the root table.
    root_schema_name        NAME                 NOT NULL,
    root_table_name         NAME                 NOT NULL,
    -- Location of the distinct root table.
    distinct_schema_name    NAME                 NOT NULL,
    distinct_table_name     NAME                 NOT NULL,
    insert_temp_table_name  NAME                 NOT NULL,
    -- At least one replica is required.
    replication_factor      SMALLINT             NOT NULL CHECK (replication_factor > 0),
    -- chunk_placement_type is declared outside this section of the file.
    placement               chunk_placement_type NOT NULL,
    time_field_name         NAME                 NOT NULL,
    time_field_type         REGTYPE              NOT NULL,
    -- Must name an existing node.
    created_on              NAME                 NOT NULL REFERENCES node(database_name),
    UNIQUE (main_schema_name, main_table_name),
    UNIQUE (associated_schema_name, associated_table_prefix),
    UNIQUE (root_schema_name, root_table_name)
);
--This represents one replica of the data across all partitions and time.
--Each row creates 2 tables:
-- i) data replica table (schema_name.table_name)
-- parent is the hypertable root table.
-- children are the partition_replica tables.
-- ii) distinct replica tables (distinct_schema_name.distinct_table_name)
-- parent is the hypertable distinct root table.
-- children are created by distinct_replica_node table.
CREATE TABLE IF NOT EXISTS hypertable_replica (
    hypertable_name      NAME     NOT NULL REFERENCES hypertable (name),
    -- Replica ids are non-negative.
    replica_id           SMALLINT NOT NULL CHECK (replica_id >= 0),
    -- Data replica table.
    schema_name          NAME     NOT NULL,
    table_name           NAME     NOT NULL,
    -- Distinct replica table.
    distinct_schema_name NAME     NOT NULL,
    distinct_table_name  NAME     NOT NULL,
    -- One row per (hypertable, replica).
    PRIMARY KEY (hypertable_name, replica_id),
    UNIQUE (schema_name, table_name)
);
--mapping that shows which replica is pointed to by the main table, for each node.
--the translation from main table to replica should happen in a C transformation
--right after the parsing step. (RULES cannot be used, unfortunately)
CREATE TABLE IF NOT EXISTS default_replica_node (
    database_name   NAME     NOT NULL REFERENCES node (database_name),
    hypertable_name NAME     NOT NULL REFERENCES hypertable (name),
    replica_id      SMALLINT NOT NULL CHECK (replica_id >= 0),
    -- At most one default replica per (node, hypertable) pair.
    PRIMARY KEY (database_name, hypertable_name),
    -- The (hypertable, replica) pair must exist in hypertable_replica.
    FOREIGN KEY (hypertable_name, replica_id)
        REFERENCES hypertable_replica (hypertable_name, replica_id)
);
--there should be one distinct_replica_node for each node with a chunk from that replica
--so there can be multiple rows for one hypertable-replica on different nodes.
--that way writes are local. Optimized reads are also local for many queries.
--But, some read queries are cross-node.
--Each row creates a table.
-- Parent table: hypertable_replica.distinct_table
-- No children, created table contains data.
CREATE TABLE IF NOT EXISTS distinct_replica_node (
    hypertable_name NAME     NOT NULL,
    replica_id      SMALLINT NOT NULL,
    database_name   NAME     NOT NULL REFERENCES node (database_name),
    -- Location of the table created for this row.
    schema_name     NAME     NOT NULL,
    table_name      NAME     NOT NULL,
    -- One row per (hypertable, replica, node).
    PRIMARY KEY (hypertable_name, replica_id, database_name),
    UNIQUE (schema_name, table_name),
    -- The (hypertable, replica) pair must exist in hypertable_replica.
    FOREIGN KEY (hypertable_name, replica_id)
        REFERENCES hypertable_replica (hypertable_name, replica_id)
);
--A partition epoch represents a different partitioning of the data.
--It has a start and end time (data time). Data needs to be placed in the correct epoch by time.
--This should change very infrequently. Expensive to start new epoch.
CREATE TABLE IF NOT EXISTS partition_epoch (
    id                 SERIAL NOT NULL PRIMARY KEY,
    hypertable_name    NAME   NOT NULL REFERENCES hypertable (name),
    -- Epoch bounds in data time. Both are nullable; presumably NULL means the
    -- epoch is open-ended on that side -- TODO confirm.
    start_time         BIGINT NULL CHECK (start_time > 0),
    end_time           BIGINT NULL CHECK (end_time > 0),
    partitioning_func  NAME   NOT NULL,
    -- Must be strictly positive: a zero or negative modulus cannot define a
    -- keyspace. The upper bound is unchanged.
    -- NOTE(review): partition.keyspace_start/keyspace_end are SMALLINT
    -- (max 32767) while values up to 65535 are accepted here -- confirm the
    -- intended upper bound.
    partitioning_mod   INT    NOT NULL CHECK (partitioning_mod > 0 AND partitioning_mod < 65536),
    partitioning_field NAME   NOT NULL,
    UNIQUE (hypertable_name, start_time),
    UNIQUE (hypertable_name, end_time),
    -- Passes when either bound is NULL; otherwise start must precede end.
    CHECK (start_time < end_time)
);
-- A partition defines a partition in a partition_epoch.
-- For any partition the keyspace is defined as [0, partition_epoch.partitioning_mod]
-- For any epoch, there must be a partition that covers every element in the keyspace.
CREATE TABLE IF NOT EXISTS partition (
    id             SERIAL   NOT NULL PRIMARY KEY,
    epoch_id       INTEGER  NOT NULL REFERENCES partition_epoch (id),
    -- Start of the keyspace range, inclusive.
    keyspace_start SMALLINT NOT NULL CHECK (keyspace_start >= 0),
    -- End of the keyspace range, inclusive; compatible with the BETWEEN operator.
    keyspace_end   SMALLINT NOT NULL CHECK (keyspace_end > 0),
    -- No two partitions of one epoch may start at the same key.
    UNIQUE (epoch_id, keyspace_start),
    CHECK (keyspace_end > keyspace_start)
);
--Represents a replica for a partition.
--Each row creates a table:
-- Parent: "hypertable_replica.schema_name"."hypertable_replica.table_name"
-- Children: "chunk_replica_node.schema_name"."chunk_replica_node.table_name"
--TODO: trigger to verify partition_epoch hypertable name matches this hypertable_name
CREATE TABLE IF NOT EXISTS partition_replica (
    id              SERIAL   NOT NULL PRIMARY KEY,
    partition_id    INTEGER  NOT NULL REFERENCES partition (id),
    hypertable_name NAME     NOT NULL,
    replica_id      SMALLINT NOT NULL,
    -- Location of the table created for this row.
    schema_name     NAME     NOT NULL,
    table_name      NAME     NOT NULL,
    UNIQUE (schema_name, table_name),
    -- A partition has at most one row per replica id.
    UNIQUE (partition_id, replica_id),
    -- The (hypertable, replica) pair must exist in hypertable_replica.
    FOREIGN KEY (hypertable_name, replica_id)
        REFERENCES hypertable_replica (hypertable_name, replica_id)
);
-- Represent a chunk of data
-- i.e. data for a particular hypertable-partition for a particular time.
CREATE TABLE IF NOT EXISTS chunk (
    id           SERIAL  NOT NULL PRIMARY KEY,
    partition_id INTEGER NOT NULL REFERENCES partition (id),
    -- Time bounds of the chunk. Both are nullable; presumably NULL means the
    -- chunk is open-ended on that side -- TODO confirm.
    start_time   BIGINT  NULL CHECK (start_time > 0),
    end_time     BIGINT  NULL CHECK (end_time > 0),
    UNIQUE (partition_id, start_time),
    UNIQUE (partition_id, end_time),
    -- Passes when either bound is NULL; otherwise start must precede end.
    CHECK (start_time < end_time)
);
--A mapping between chunks, partition_replica, and node.
--This represents the table where actual data is stored.
--Each row represents a table:
-- Parent table: "partition_replica.schema_name"."partition_replica.table_name"
CREATE TABLE IF NOT EXISTS chunk_replica_node (
    chunk_id             INTEGER NOT NULL REFERENCES chunk (id),
    partition_replica_id INTEGER NOT NULL REFERENCES partition_replica (id),
    database_name        NAME    NOT NULL REFERENCES node (database_name),
    -- Location of the table that stores the data.
    schema_name          NAME    NOT NULL,
    table_name           NAME    NOT NULL,
    -- A single (chunk, replica) tuple.
    PRIMARY KEY (chunk_id, partition_replica_id),
    -- No two chunk replicas on the same node.
    UNIQUE (chunk_id, database_name),
    UNIQUE (schema_name, table_name)
);
--Represents a hypertable field.
--TODO: remove is_partitioning. defined in partition_epoch table.
CREATE TABLE IF NOT EXISTS field (
    hypertable_name NAME     NOT NULL REFERENCES hypertable (name),
    name            NAME     NOT NULL,
    -- MUST match pg_attribute.attnum on the main table. SHOULD match on the
    -- root/hierarchy table as well.
    attnum          SMALLINT NOT NULL,
    data_type       REGTYPE  NOT NULL,
    default_value   TEXT     NULL,
    is_distinct     BOOLEAN  NOT NULL DEFAULT FALSE,
    not_null        BOOLEAN  NOT NULL,
    -- Both must name existing nodes.
    created_on      NAME     NOT NULL REFERENCES node(database_name),
    modified_on     NAME     NOT NULL REFERENCES node(database_name),
    -- Field names and attribute numbers are each unique within a hypertable.
    PRIMARY KEY (hypertable_name, name),
    UNIQUE (hypertable_name, attnum)
);
CREATE TABLE IF NOT EXISTS deleted_field (
    -- Same columns as field. A bare LIKE (no INCLUDING options) copies column
    -- names, types and NOT NULL constraints only; keys, defaults and foreign
    -- keys are not carried over.
    LIKE field,
    -- Nullable and, unlike created_on/modified_on in field, has no foreign key.
    deleted_on NAME
);
CREATE TABLE IF NOT EXISTS hypertable_index (
    hypertable_name  NAME NOT NULL REFERENCES hypertable (name),
    -- Schema name of the main table (needed for a uniqueness constraint).
    main_schema_name NAME NOT NULL,
    -- Index name on the main table.
    main_index_name  NAME NOT NULL,
    -- Index definition with /*INDEX_NAME*/ and /*TABLE_NAME*/ placeholders.
    definition       TEXT NOT NULL,
    created_on       NAME NOT NULL REFERENCES node(database_name),
    PRIMARY KEY (hypertable_name, main_index_name),
    -- Globally unique since index names are globally unique.
    UNIQUE (main_schema_name, main_index_name)
);
CREATE TABLE IF NOT EXISTS deleted_hypertable_index (
    -- Same columns as hypertable_index. A bare LIKE (no INCLUDING options)
    -- copies column names, types and NOT NULL constraints only; keys and
    -- foreign keys are not carried over.
    LIKE hypertable_index,
    -- Nullable and has no foreign key.
    deleted_on NAME
);