mirror of
https://github.com/timescale/timescaledb.git
synced 2025-05-18 03:23:37 +08:00
Do not spam log with telemetry problems
The telemetry process runs on a regular basis and usually does not make a lot of noise, but in a few particular cases, it writes entries to the log unnecessarily. If the telemetry server cannot be contacted, it will print a warning in the log that the server cannot be contacted. Since it is nothing wrong with the system and the telemetry process will try to re-connect at a later time, it is unnecessary to print as a warning. If the telemetry response is malformed, a warning is printed. This is also unnecessary since there is nothing wrong with the system, there is nothing the user can do about it, and this warning can be largely ignored. If the hard-coded telemetry scheme is incorrect, a warning will be printed. This should not normally happen, and if it happens on a running server, there is nothing that can be done to eliminate the error message and the message is unnecessary. When the telemetry job exits, a standard termination message is printed in the log. Although harmless, it is mostly confusing and provide no value to the user. If the telemetry process is attempting to connect, or is connected, to the telemetry server, the telemetry server will wait until the connection gets a timeout before shutting down. This is unnecessary since there is no critical problem in aborting the connection and doing a direct shutdown. This commit turns those warnings into notices and installs a signal handler so that the telemetry job exits silently and abort any outstanding connections. Fixes #4028
This commit is contained in:
parent
558da2c5c6
commit
09c0ba7136
@ -26,6 +26,7 @@
|
||||
#include <utils/elog.h>
|
||||
#include <utils/jsonb.h>
|
||||
#include <utils/snapmgr.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "job.h"
|
||||
#include "config.h"
|
||||
@ -67,7 +68,11 @@ ts_bgw_job_start(BgwJob *job, Oid user_oid)
|
||||
BgwParams bgw_params = {
|
||||
.job_id = Int32GetDatum(job->fd.id),
|
||||
.user_oid = user_oid,
|
||||
#ifdef USE_TELEMETRY
|
||||
.croak_silently = is_telemetry_job(job),
|
||||
#endif
|
||||
};
|
||||
|
||||
strlcpy(bgw_params.bgw_main, job_entrypoint_function_name, sizeof(bgw_params.bgw_main));
|
||||
|
||||
return ts_bgw_start_worker(NameStr(job->fd.application_name), &bgw_params);
|
||||
@ -1115,6 +1120,26 @@ ts_job_errors_insert_tuple(const FormData_job_error *job_err)
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Die silently without printing a message.
|
||||
*
|
||||
* This will also abort any connection attempts and reads that are blocking,
|
||||
* e.g., waiting for responses from the telemetry server. Also indicate to the
|
||||
* postmaster that it does not have to print a message.
|
||||
*
|
||||
* We do this deliberately to avoid having a telemetry job that blocks a
|
||||
* shutdown, but that means that the job has to be very careful about what it
|
||||
* writes and not use any atexit routines.
|
||||
*/
|
||||
static void
|
||||
croak(SIGNAL_ARGS)
|
||||
{
|
||||
/* We exit using exit status 0 so that it will only generate a debug log
|
||||
* message. If we exit with 1, it will log the exiting process with a log
|
||||
* message, which we want to avoid. */
|
||||
_exit(0);
|
||||
}
|
||||
|
||||
extern Datum
|
||||
ts_bgw_job_entrypoint(PG_FUNCTION_ARGS)
|
||||
{
|
||||
@ -1135,9 +1160,10 @@ ts_bgw_job_entrypoint(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* do not use the default `bgworker_die` sigterm handler because it does
|
||||
* not respect critical sections
|
||||
* not respect critical sections. We install an silent signal handler for
|
||||
* invisible jobs, which will then die without printing an error message.
|
||||
*/
|
||||
pqsignal(SIGTERM, die);
|
||||
pqsignal(SIGTERM, params.croak_silently ? croak : die);
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
BackgroundWorkerInitializeConnectionByOid(db_oid, params.user_oid, 0);
|
||||
|
@ -33,6 +33,18 @@ typedef struct BgwParams
|
||||
* connection. */
|
||||
Oid user_oid;
|
||||
|
||||
/**
|
||||
* Job will croak silently and not print normal errors. This is used for
|
||||
* the telemetry job to avoid spamming the log with unrelated issues or
|
||||
* block a shutdown because it is using a blocking system call.
|
||||
*
|
||||
* @note It does not affect printing inside the actual job so you need to
|
||||
* make sure that you use a suitable level for those. You should also keep
|
||||
* in mind that these jobs exits quickly, so no atexit handlers will be
|
||||
* executed.
|
||||
*/
|
||||
bool croak_silently;
|
||||
|
||||
/** Job id to use for the worker when executing the job */
|
||||
int32 job_id;
|
||||
|
||||
|
@ -37,7 +37,10 @@ ts_connection_create(ConnectionType type)
|
||||
Connection *conn;
|
||||
|
||||
if (type == _CONNECTION_MAX)
|
||||
elog(ERROR, "invalid connection type");
|
||||
{
|
||||
elog(NOTICE, "invalid connection type");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (NULL == conn_ops[type])
|
||||
ereport(ERROR,
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <fmgr.h>
|
||||
#include <miscadmin.h>
|
||||
#include <commands/extension.h>
|
||||
#include <storage/ipc.h>
|
||||
#include <catalog/pg_collation.h>
|
||||
#include <utils/builtins.h>
|
||||
#include <utils/json.h>
|
||||
@ -234,7 +235,7 @@ ts_check_version_response(const char *json)
|
||||
{
|
||||
if (!ts_validate_server_version(json, &result))
|
||||
{
|
||||
elog(WARNING, "server did not return a valid TimescaleDB version: %s", result.errhint);
|
||||
elog(NOTICE, "server did not return a valid TimescaleDB version: %s", result.errhint);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -997,25 +998,28 @@ ts_build_version_request(const char *host, const char *path)
|
||||
return req;
|
||||
}
|
||||
|
||||
static ConnectionType
|
||||
connection_type(const char *service)
|
||||
{
|
||||
if (strcmp("http", service) == 0)
|
||||
return CONNECTION_PLAIN;
|
||||
else if (strcmp("https", service) == 0)
|
||||
return CONNECTION_SSL;
|
||||
|
||||
ereport(NOTICE,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("scheme \"%s\" not supported for telemetry", service)));
|
||||
return _CONNECTION_MAX;
|
||||
}
|
||||
|
||||
Connection *
|
||||
ts_telemetry_connect(const char *host, const char *service)
|
||||
{
|
||||
Connection *conn = NULL;
|
||||
int ret;
|
||||
Connection *conn = ts_connection_create(connection_type(service));
|
||||
|
||||
if (strcmp("http", service) == 0)
|
||||
conn = ts_connection_create(CONNECTION_PLAIN);
|
||||
else if (strcmp("https", service) == 0)
|
||||
conn = ts_connection_create(CONNECTION_SSL);
|
||||
else
|
||||
ereport(WARNING,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("scheme \"%s\" not supported for telemetry", service)));
|
||||
|
||||
if (conn == NULL)
|
||||
return NULL;
|
||||
|
||||
ret = ts_connection_connect(conn, host, service, 0);
|
||||
if (conn)
|
||||
{
|
||||
int ret = ts_connection_connect(conn, host, service, 0);
|
||||
|
||||
if (ret < 0)
|
||||
{
|
||||
@ -1024,11 +1028,12 @@ ts_telemetry_connect(const char *host, const char *service)
|
||||
ts_connection_destroy(conn);
|
||||
conn = NULL;
|
||||
|
||||
ereport(WARNING,
|
||||
ereport(NOTICE,
|
||||
(errcode(ERRCODE_INTERNAL_ERROR),
|
||||
errmsg("telemetry could not connect to \"%s\"", host),
|
||||
errdetail("%s", errstr)));
|
||||
}
|
||||
}
|
||||
|
||||
return conn;
|
||||
}
|
||||
@ -1086,13 +1091,13 @@ ts_telemetry_main(const char *host, const char *path, const char *service)
|
||||
|
||||
if (err != HTTP_ERROR_NONE)
|
||||
{
|
||||
elog(WARNING, "telemetry error: %s", ts_http_strerror(err));
|
||||
elog(NOTICE, "telemetry error: %s", ts_http_strerror(err));
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
if (!ts_http_response_state_valid_status(rsp))
|
||||
{
|
||||
elog(WARNING,
|
||||
elog(NOTICE,
|
||||
"telemetry got unexpected HTTP response status: %d",
|
||||
ts_http_response_state_status_code(rsp));
|
||||
goto cleanup;
|
||||
@ -1113,7 +1118,7 @@ ts_telemetry_main(const char *host, const char *path, const char *service)
|
||||
{
|
||||
/* If the response is malformed, ts_check_version_response() will
|
||||
* throw an error, so we capture the error here and print debugging
|
||||
* information before re-throwing the error. */
|
||||
* information. */
|
||||
ereport(NOTICE,
|
||||
(errmsg("malformed telemetry response body"),
|
||||
errdetail("host=%s, service=%s, path=%s: %s",
|
||||
@ -1121,7 +1126,12 @@ ts_telemetry_main(const char *host, const char *path, const char *service)
|
||||
service,
|
||||
path,
|
||||
json ? json : "<EMPTY>")));
|
||||
PG_RE_THROW();
|
||||
/* Do not throw an error in this case, there is really nothing wrong
|
||||
with the system. It's only telemetry that is having problems, so we
|
||||
just wrap this up and exit. */
|
||||
if (started)
|
||||
AbortCurrentTransaction();
|
||||
return false;
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
|
@ -311,28 +311,28 @@ SELECT test_check_version_response('{"current_timescaledb_version": "10.1.1-rc1"
|
||||
----------------------------------------------------------------
|
||||
-- Test well-formed response but invalid versions
|
||||
SELECT test_check_version_response('{"current_timescaledb_version": "1.0.0-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", "is_up_to_date": false}');
|
||||
WARNING: server did not return a valid TimescaleDB version: version string is too long
|
||||
NOTICE: server did not return a valid TimescaleDB version: version string is too long
|
||||
test_check_version_response
|
||||
-----------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT test_check_version_response('{"current_timescaledb_version": "10.1.1+rc1", "is_up_to_date": false}');
|
||||
WARNING: server did not return a valid TimescaleDB version: version string has invalid characters
|
||||
NOTICE: server did not return a valid TimescaleDB version: version string has invalid characters
|
||||
test_check_version_response
|
||||
-----------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT test_check_version_response('{"current_timescaledb_version": "@10.1.1", "is_up_to_date": false}');
|
||||
WARNING: server did not return a valid TimescaleDB version: version string has invalid characters
|
||||
NOTICE: server did not return a valid TimescaleDB version: version string has invalid characters
|
||||
test_check_version_response
|
||||
-----------------------------
|
||||
|
||||
(1 row)
|
||||
|
||||
SELECT test_check_version_response('{"current_timescaledb_version": "10.1.1@", "is_up_to_date": false}');
|
||||
WARNING: server did not return a valid TimescaleDB version: version string has invalid characters
|
||||
NOTICE: server did not return a valid TimescaleDB version: version string has invalid characters
|
||||
test_check_version_response
|
||||
-----------------------------
|
||||
|
||||
@ -354,7 +354,7 @@ ERROR: malformed telemetry response body
|
||||
SET timescaledb.telemetry_level=basic;
|
||||
-- Connect to a bogus host and path to test error handling in telemetry_main()
|
||||
SELECT _timescaledb_internal.test_telemetry_main_conn('noservice.timescale.com', 'path');
|
||||
WARNING: telemetry could not connect to "noservice.timescale.com"
|
||||
NOTICE: telemetry could not connect to "noservice.timescale.com"
|
||||
test_telemetry_main_conn
|
||||
--------------------------
|
||||
f
|
||||
|
Loading…
x
Reference in New Issue
Block a user