1
0
mirror of https://github.com/timescale/timescaledb.git synced 2025-05-15 18:13:18 +08:00
Matvey Arye 53ff6567ef Add state machine and polling to launcher
This PR changes the launcher to use a state machine to keep track
of the state of each database scheduler. Further, it adds polling
to go through the list of databases and check their states. This
solves several issues:

1) A CREATE DATABASE call using a template that already has TimescaleDB
installed previously did not start a scheduler until the next database
restart. A test for this case has been added.
2) A lack of available slots or background workers when a new database was added
meant that the scheduler would not be started until the next database
restart. Now this will be retried on every polling event.

This PR also simplifies logic since database entries are never removed
from the hash table and thus never added more than once. State
transitions are now easier to read and reason about.

Documentation for the state transitions has been added.
2018-10-15 11:10:05 -07:00

416 lines
13 KiB
Plaintext

\c single_2 :ROLE_SUPERUSER
\ir include/bgw_launcher_utils.sql
/*
* Note on testing: need a couple wrappers that pg_sleep in a loop to wait for changes
* to appear in pg_stat_activity.
* Further Note: on PG 9.6 (server_version_num < 100000) the launcher doesn't show up in pg_stat_activity.
* We can still test its interactions with its children, but can't test some of the things specific to the launcher,
* so the launcher-specific checks are skipped based on the server version number.
*/
/*
 * Single-row summary of the TimescaleDB background workers currently listed
 * in pg_stat_activity: the number of launcher entries, and the number of
 * scheduler entries attached to each of the databases used by this test
 * (single, single_2 and template1).
 */
CREATE VIEW worker_counts AS
SELECT
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Launcher') AS launcher,
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
                       AND act.datname = 'single') AS single_scheduler,
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
                       AND act.datname = 'single_2') AS single_2_scheduler,
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
                       AND act.datname = 'template1') AS template1_scheduler
FROM pg_stat_activity AS act;
/*
 * Poll worker_counts until it reports exactly the requested number of
 * launcher and scheduler entries.
 *
 *   launcher_ct   - expected launcher count (not checked when
 *                   server_version_num < 100000)
 *   scheduler1_ct - expected scheduler count for database 'single'
 *   scheduler2_ct - expected scheduler count for database 'single_2'
 *   template1_ct  - expected scheduler count for database 'template1'
 *
 * Returns TRUE as soon as one check matches, FALSE after 10 checks that
 * were each followed by a 0.1 second sleep.
 */
CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS
$BODY$
DECLARE
    counts_match BOOLEAN;
BEGIN
    FOR attempt IN 1..10
    LOOP
        SELECT EXISTS (
            SELECT 1
            FROM worker_counts AS wc
            WHERE (wc.launcher = launcher_ct OR current_setting('server_version_num')::int < 100000)
              AND wc.single_scheduler = scheduler1_ct
              AND wc.single_2_scheduler = scheduler2_ct
              AND wc.template1_scheduler = template1_ct
        ) INTO counts_match;

        IF counts_match THEN
            --We have the correct counts!
            RETURN TRUE;
        END IF;

        -- Not there yet: pause, then drop the cached statistics snapshot so
        -- the next read of pg_stat_activity is fresh.
        PERFORM pg_sleep(0.1);
        PERFORM pg_stat_clear_snapshot();
    END LOOP;

    RETURN FALSE;
END
$BODY$;
/*
* When we've connected to single_2, we should be able to see the cluster launcher
* and the scheduler for single in pg_stat_activity
* but single_2 shouldn't have a scheduler because ext not created yet
*/
SELECT wait_worker_counts(1,1,0,0);
wait_worker_counts
--------------------
t
(1 row)
/*Now create the extension in single_2*/
SET client_min_messages = ERROR;
CREATE EXTENSION timescaledb CASCADE;
RESET client_min_messages;
SELECT wait_worker_counts(1,1,1,0);
wait_worker_counts
--------------------
t
(1 row)
DROP DATABASE single;
/* Now the db_scheduler for single should have disappeared*/
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
/*Now let's restart the scheduler and make sure our backend_start changed */
SELECT backend_start as orig_backend_start
FROM pg_stat_activity
WHERE application_name = 'TimescaleDB Background Worker Scheduler'
AND datname = 'single_2' \gset
/* We'll do this in a txn so that we can see that the worker locks on our txn before continuing*/
BEGIN;
SELECT _timescaledb_internal.restart_background_workers();
restart_background_workers
----------------------------
t
(1 row)
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed,
(wait_event = 'virtualxid') wait_event_changed
FROM pg_stat_activity
WHERE application_name = 'TimescaleDB Background Worker Scheduler'
AND datname = 'single_2';
backend_start_changed | wait_event_changed
-----------------------+--------------------
t | t
(1 row)
COMMIT;
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed
FROM pg_stat_activity
WHERE application_name = 'TimescaleDB Background Worker Scheduler'
AND datname = 'single_2';
wait_event_changed
--------------------
t
(1 row)
/*Test stop*/
SELECT _timescaledb_internal.stop_background_workers();
stop_background_workers
-------------------------
t
(1 row)
SELECT wait_worker_counts(1,0,0,0);
wait_worker_counts
--------------------
t
(1 row)
/*Make sure it doesn't break if we stop twice in a row*/
SELECT _timescaledb_internal.stop_background_workers();
stop_background_workers
-------------------------
t
(1 row)
SELECT wait_worker_counts(1,0,0,0);
wait_worker_counts
--------------------
t
(1 row)
/*test start*/
SELECT _timescaledb_internal.start_background_workers();
start_background_workers
--------------------------
t
(1 row)
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
/*make sure start is idempotent*/
SELECT backend_start as orig_backend_start
FROM pg_stat_activity
WHERE application_name = 'TimescaleDB Background Worker Scheduler'
AND datname = 'single_2' \gset
/* Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times*/
SELECT _timescaledb_internal.start_background_workers() as start_background_workers, * FROM generate_series(1,20);
start_background_workers | generate_series
--------------------------+-----------------
t | 1
t | 2
t | 3
t | 4
t | 5
t | 6
t | 7
t | 8
t | 9
t | 10
t | 11
t | 12
t | 13
t | 14
t | 15
t | 16
t | 17
t | 18
t | 19
t | 20
(20 rows)
/*Here we're checking that the scheduler's backend_start in pg_stat_activity
* does NOT change, so we have to condition our loop in the opposite way:
* keep polling while the value is unchanged and stop as soon as it differs.
* We'll only wait half a second in total so that tests don't take too long. */
/*
 * Check that the single_2 scheduler keeps the given backend_start.
 *
 *   $1 - the backend_start value the scheduler is expected to keep
 *
 * Performs 5 checks, each followed by a 0.1 second sleep. Returns FALSE as
 * soon as a check does not find backend_start equal to $1 (including when no
 * scheduler row is found), and TRUE if all 5 checks found it equal.
 */
CREATE FUNCTION wait_equals(TIMESTAMPTZ) RETURNS BOOLEAN LANGUAGE PLPGSQL AS
$BODY$
DECLARE
    still_equal BOOLEAN;
BEGIN
    FOR attempt IN 1..5
    LOOP
        SELECT (act.backend_start = $1::timestamptz)
        INTO still_equal
        FROM pg_stat_activity AS act
        WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
          AND act.datname = 'single_2';

        -- FALSE (start time differs) and NULL (no scheduler row) both end the wait.
        IF still_equal IS NOT TRUE THEN
            RETURN FALSE;
        END IF;

        PERFORM pg_sleep(0.1);
        PERFORM pg_stat_clear_snapshot();
    END LOOP;

    RETURN TRUE;
END
$BODY$;
select wait_equals(:'orig_backend_start');
wait_equals
-------------
t
(1 row)
SELECT _timescaledb_internal.restart_background_workers();
restart_background_workers
----------------------------
t
(1 row)
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
/*Make sure drop extension statement restarts the worker and on rollback it keeps running*/
/*Record the scheduler's current backend_start so we can check below that it was restarted */
SELECT backend_start as orig_backend_start
FROM pg_stat_activity
WHERE application_name = 'TimescaleDB Background Worker Scheduler'
AND datname = 'single_2' \gset
BEGIN;
DROP EXTENSION timescaledb;
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
ROLLBACK;
/*
 * Wait for the single_2 scheduler to report a backend_start later than the
 * given timestamp, i.e. for a restarted scheduler to become visible.
 *
 *   $1 - the backend_start recorded before the expected restart
 *
 * Performs up to 10 checks, each followed by a 0.1 second sleep. Returns TRUE
 * as soon as a scheduler row with backend_start > $1 is seen, FALSE otherwise.
 */
CREATE FUNCTION wait_greater(TIMESTAMPTZ) RETURNS BOOLEAN LANGUAGE PLPGSQL AS
$BODY$
DECLARE
    start_changed BOOLEAN;
BEGIN
    FOR attempt IN 1..10
    LOOP
        SELECT (act.backend_start > $1::timestamptz)
        INTO start_changed
        FROM pg_stat_activity AS act
        WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
          AND act.datname = 'single_2';

        -- Only a definite TRUE counts as success. When no scheduler row is
        -- visible (for instance while it is between stop and start) the
        -- SELECT INTO leaves start_changed NULL; that must be treated as
        -- "not changed yet" and retried rather than reported as a restart.
        IF start_changed IS TRUE THEN
            RETURN TRUE;
        END IF;

        PERFORM pg_sleep(0.1);
        PERFORM pg_stat_clear_snapshot();
    END LOOP;

    RETURN FALSE;
END
$BODY$;
SELECT wait_greater(:'orig_backend_start');
wait_greater
--------------
t
(1 row)
/* Make sure canceling the launcher backend causes a restart of schedulers */
SELECT backend_start as orig_backend_start
FROM pg_stat_activity
WHERE application_name = 'TimescaleDB Background Worker Scheduler'
AND datname = 'single_2' \gset
SELECT coalesce(
(SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'),
(SELECT current_setting('server_version_num')::int < 100000));
coalesce
----------
t
(1 row)
SELECT wait_worker_counts(1,0,1,0);
wait_worker_counts
--------------------
t
(1 row)
SELECT ((current_setting('server_version_num')::int < 100000) OR wait_greater(:'orig_backend_start')) as wait_greater;
wait_greater
--------------
t
(1 row)
/* Make sure dropping the extension means that the scheduler is stopped*/
BEGIN;
DROP EXTENSION timescaledb;
COMMIT;
SELECT wait_worker_counts(1,0,0,0);
wait_worker_counts
--------------------
t
(1 row)
/* Test that background workers are stopped with DROP OWNED */
ALTER ROLE :ROLE_DEFAULT_PERM_USER WITH SUPERUSER;
\c single_2 :ROLE_DEFAULT_PERM_USER
SET client_min_messages = ERROR;
CREATE EXTENSION timescaledb CASCADE;
RESET client_min_messages;
/* Make sure there is 1 launcher and 1 bgw in single_2 */
SELECT wait_worker_counts(launcher_ct=>1, scheduler1_ct=> 0, scheduler2_ct=>1, template1_ct=>0);
wait_worker_counts
--------------------
t
(1 row)
/* DROP OWNED BY a role that does not own the extension results in no change to worker counts */
DROP OWNED BY :ROLE_DEFAULT_PERM_USER_2;
SELECT wait_worker_counts(launcher_ct=>1, scheduler1_ct=> 0, scheduler2_ct=>1, template1_ct=>0);
wait_worker_counts
--------------------
t
(1 row)
/* drop of owner of extension results in extension drop and a stop to the bgw */
DROP OWNED BY :ROLE_DEFAULT_PERM_USER;
/* The worker in single_2 is dead. Note that 0s are respected */
SELECT wait_worker_counts(launcher_ct=>1, scheduler1_ct=>0, scheduler2_ct=>0, template1_ct=>0);
wait_worker_counts
--------------------
t
(1 row)
\c single_2 :ROLE_SUPERUSER
ALTER ROLE :ROLE_DEFAULT_PERM_USER WITH NOSUPERUSER;
/* Connect to the template1 database */
\c template1
\ir include/bgw_launcher_utils.sql
/*
* Note on testing: need a couple wrappers that pg_sleep in a loop to wait for changes
* to appear in pg_stat_activity.
* Further Note: on PG 9.6 (server_version_num < 100000) the launcher doesn't show up in pg_stat_activity.
* We can still test its interactions with its children, but can't test some of the things specific to the launcher,
* so the launcher-specific checks are skipped based on the server version number.
*/
/*
 * Single-row summary of the TimescaleDB background workers currently listed
 * in pg_stat_activity: the number of launcher entries, and the number of
 * scheduler entries attached to each of the databases used by this test
 * (single, single_2 and template1).
 */
CREATE VIEW worker_counts AS
SELECT
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Launcher') AS launcher,
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
                       AND act.datname = 'single') AS single_scheduler,
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
                       AND act.datname = 'single_2') AS single_2_scheduler,
    COUNT(*) FILTER (WHERE act.application_name = 'TimescaleDB Background Worker Scheduler'
                       AND act.datname = 'template1') AS template1_scheduler
FROM pg_stat_activity AS act;
/*
 * Poll worker_counts until it reports exactly the requested number of
 * launcher and scheduler entries.
 *
 *   launcher_ct   - expected launcher count (not checked when
 *                   server_version_num < 100000)
 *   scheduler1_ct - expected scheduler count for database 'single'
 *   scheduler2_ct - expected scheduler count for database 'single_2'
 *   template1_ct  - expected scheduler count for database 'template1'
 *
 * Returns TRUE as soon as one check matches, FALSE after 10 checks that
 * were each followed by a 0.1 second sleep.
 */
CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS
$BODY$
DECLARE
    counts_match BOOLEAN;
BEGIN
    FOR attempt IN 1..10
    LOOP
        SELECT EXISTS (
            SELECT 1
            FROM worker_counts AS wc
            WHERE (wc.launcher = launcher_ct OR current_setting('server_version_num')::int < 100000)
              AND wc.single_scheduler = scheduler1_ct
              AND wc.single_2_scheduler = scheduler2_ct
              AND wc.template1_scheduler = template1_ct
        ) INTO counts_match;

        IF counts_match THEN
            --We have the correct counts!
            RETURN TRUE;
        END IF;

        -- Not there yet: pause, then drop the cached statistics snapshot so
        -- the next read of pg_stat_activity is fresh.
        PERFORM pg_sleep(0.1);
        PERFORM pg_stat_clear_snapshot();
    END LOOP;

    RETURN FALSE;
END
$BODY$;
BEGIN;
/* Then create extension there in a txn and make sure we see a scheduler start */
SET client_min_messages = ERROR;
CREATE EXTENSION timescaledb CASCADE;
RESET client_min_messages;
SELECT wait_worker_counts(1,0,0,1);
wait_worker_counts
--------------------
t
(1 row)
COMMIT;
/* Now that our transaction has ended, the scheduler should exit immediately because template1 is a template database.*/
SELECT wait_worker_counts(1,0,0,0);
wait_worker_counts
--------------------
t
(1 row)
/* Clean up the template database, removing our test utilities etc */
\ir include/bgw_launcher_utils_cleanup.sql
DROP FUNCTION wait_worker_counts(integer, integer, integer, integer);
DROP VIEW worker_counts;
\c single_2
/* Now try creating a DB from a template with the extension already installed.
* Make sure we see a scheduler start. */
CREATE DATABASE single;
SELECT wait_worker_counts(1,1,0,0);
wait_worker_counts
--------------------
t
(1 row)
DROP DATABASE single;
/* Now make sure that there's no race between create database and create extension.
* Although to be honest, this race probably wouldn't manifest in this test. */
\c template1
DROP EXTENSION timescaledb;
\c single_2
CREATE DATABASE single;
\c single
SET client_min_messages = ERROR;
CREATE EXTENSION timescaledb;
RESET client_min_messages;
\c single_2
SELECT wait_worker_counts(1,1,0,0);
wait_worker_counts
--------------------
t
(1 row)