Fix lag in the resource error reported by the health check endpoints.

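The health check handlers now call `has_enough_resources()` directly instead of reading the error recorded by the last resource check made elsewhere, so the reported `resource_error` no longer lags behind the actual resource state.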
2025-05-28 09:46:05 +08:00 · 2024-03-03 19:29:19 +05:30 · 2024-03-03 19:29:19 +05:30 · 9c90290ea7
commit 9c90290ea7
parent 51525108c2
3 changed files with 33 additions and 29 deletions
--- a/include/cached_resource_stat.h
+++ b/include/cached_resource_stat.h
@ -1,5 +1,7 @@
 #pragma once
 #include <cstdint>
+#include <atomic>
+#include <mutex>
 #include <chrono>
 #include <string>
 #include <sys/statvfs.h>
@ -14,18 +16,8 @@ public:

 private:
    const static size_t REFRESH_INTERVAL_SECS = 5;
-    uint64_t disk_total_bytes = 0;
-    uint64_t disk_used_bytes = 0;
-
-    uint64_t memory_total_bytes = 0;
-    uint64_t memory_available_bytes = 0;
-
-    uint64_t swap_total_bytes = 0;
-    uint64_t swap_free_bytes = 0;
-
-    uint64_t last_checked_ts = 0;
-
-    resource_check_t resource_error;
+    std::atomic<uint64_t> last_checked_ts = 0;
+    std::mutex m;

    cached_resource_stat_t() = default;

@ -41,6 +33,4 @@ public:
    resource_check_t has_enough_resources(const std::string& data_dir_path,
                                          const int disk_used_max_percentage,
                                          const int memory_used_max_percentage);
-
-    const resource_check_t get_out_of_resource_error() const;
 };
--- a/src/cached_resource_stat.cpp
+++ b/src/cached_resource_stat.cpp
@ -11,6 +11,17 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
        return cached_resource_stat_t::OK;
    }

+    std::unique_lock lk(m);
+
+    uint64_t disk_total_bytes = 0;
+    uint64_t disk_used_bytes = 0;
+
+    uint64_t memory_total_bytes = 0;
+    uint64_t memory_available_bytes = 0;
+
+    uint64_t swap_total_bytes = 0;
+    uint64_t swap_free_bytes = 0;
+
    uint64_t now = std::chrono::duration_cast<std::chrono::seconds>(
            std::chrono::system_clock::now().time_since_epoch()).count();

@ -61,12 +72,12 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
        last_checked_ts = now;
    }

+    lk.unlock();
+
    double disk_used_percentage = (double(disk_used_bytes)/double(disk_total_bytes)) * 100;
    if(disk_used_percentage > disk_used_max_percentage) {
        LOG(INFO) << "disk_total_bytes: " << disk_total_bytes << ", disk_used_bytes: " << disk_used_bytes
                  << ", disk_used_percentage: " << disk_used_percentage;
-
-        resource_error = cached_resource_stat_t::OUT_OF_DISK;
        return cached_resource_stat_t::OUT_OF_DISK;
    }

@ -79,7 +90,6 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
    uint64_t all_memory_used = (memory_total_bytes - memory_available_bytes) + (swap_total_bytes - swap_free_bytes);

    if(all_memory_used >= memory_total_bytes) {
-        resource_error = cached_resource_stat_t::OUT_OF_MEMORY;
        return cached_resource_stat_t::OUT_OF_MEMORY;
    }

@ -92,14 +102,8 @@ cached_resource_stat_t::has_enough_resources(const std::string& data_dir_path,
        LOG(INFO) << "memory_total: " << memory_total_bytes << ", memory_available: " << memory_available_bytes
                  << ", all_memory_used: " << all_memory_used << ", free_mem: " << free_mem
                  << ", memory_free_min: " << memory_free_min_bytes;
-        resource_error = cached_resource_stat_t::OUT_OF_MEMORY;
        return cached_resource_stat_t::OUT_OF_MEMORY;
    }

-    resource_error = cached_resource_stat_t::OK;
    return cached_resource_stat_t::OK;
 }
-
-const cached_resource_stat_t::resource_check_t cached_resource_stat_t::get_out_of_resource_error() const {
-    return resource_error;
-}
--- a/src/core_api.cpp
+++ b/src/core_api.cpp
@ -315,9 +315,14 @@ bool get_health_with_resource_usage(const std::shared_ptr<http_req>& req, const
    nlohmann::json result;
    bool alive = server->is_alive();

-    auto resource_error = cached_resource_stat_t::get_instance().get_out_of_resource_error();
-    if (resource_error != cached_resource_stat_t::resource_check_t::OK) {
-        result["resource_error"] = std::string(magic_enum::enum_name(resource_error));
+    auto resource_check = cached_resource_stat_t::get_instance().has_enough_resources(
+        Config::get_instance().get_data_dir(),
+        Config::get_instance().get_disk_used_max_percentage(),
+        Config::get_instance().get_memory_used_max_percentage()
+    );
+
+    if (resource_check != cached_resource_stat_t::resource_check_t::OK) {
+        result["resource_error"] = std::string(magic_enum::enum_name(resource_check));
    }

    if(req->params.count("cpu_threshold") != 0 && StringUtils::is_float(req->params["cpu_threshold"])) {
@ -345,9 +350,14 @@ bool get_health(const std::shared_ptr<http_req>& req, const std::shared_ptr<http
    bool alive = server->is_alive();
    result["ok"] = alive;

-    auto resource_error = cached_resource_stat_t::get_instance().get_out_of_resource_error();
-    if (resource_error != cached_resource_stat_t::resource_check_t::OK) {
-        result["resource_error"] = std::string(magic_enum::enum_name(resource_error));
+    auto resource_check = cached_resource_stat_t::get_instance().has_enough_resources(
+        Config::get_instance().get_data_dir(),
+        Config::get_instance().get_disk_used_max_percentage(),
+        Config::get_instance().get_memory_used_max_percentage()
+    );
+
+    if (resource_check != cached_resource_stat_t::resource_check_t::OK) {
+        result["resource_error"] = std::string(magic_enum::enum_name(resource_check));
    }

    if(alive) {