From c43223a16de9f1c65e0aa14fd6a71837655385f8 Mon Sep 17 00:00:00 2001
From: Louis Lam <louislam@users.noreply.github.com>
Date: Wed, 1 Nov 2023 09:36:12 +0800
Subject: [PATCH] Restart running monitors if no heartbeat (#3952)

---
 server/model/monitor.js      | 18 +++++++-
 server/uptime-kuma-server.js | 89 ++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/server/model/monitor.js b/server/model/monitor.js
index 1e08ccd4..c2d5a9a4 100644
--- a/server/model/monitor.js
+++ b/server/model/monitor.js
@@ -3,7 +3,7 @@ const dayjs = require("dayjs");
 const axios = require("axios");
 const { Prometheus } = require("../prometheus");
 const { log, UP, DOWN, PENDING, MAINTENANCE, flipStatus, TimeLogger, MAX_INTERVAL_SECOND, MIN_INTERVAL_SECOND,
-    SQL_DATETIME_FORMAT
+    SQL_DATETIME_FORMAT, isDev, sleep, getRandomInt
 } = require("../../src/util");
 const { tcping, ping, dnsResolve, checkCertificate, checkStatusCode, getTotalClientInRoom, setting, mssqlQuery, postgresQuery, mysqlQuery, mqttAsync, setSetting, httpNtlm, radius, grpcQuery,
     redisPingAsync, mongodbPing, kafkaProducerAsync, getOidcTokenClientCredentials, rootCertificatesFingerprints
@@ -328,6 +328,16 @@ class Monitor extends BeanModel {
                 }
             }
 
+            // Evil
+            if (isDev) {
+                if (process.env.EVIL_RANDOM_MONITOR_SLEEP === "SURE") {
+                    if (getRandomInt(0, 100) === 0) {
+                        log.debug("evil", `[${this.name}] Evil mode: Random sleep: ` + beatInterval * 10000);
+                        await sleep(beatInterval * 10000);
+                    }
+                }
+            }
+
             // Expose here for prometheus update
             // undefined if not https
             let tlsInfo = undefined;
@@ -995,6 +1005,7 @@ class Monitor extends BeanModel {
             if (! this.isStop) {
                 log.debug("monitor", `[${this.name}] SetTimeout for next check.`);
                 this.heartbeatInterval = setTimeout(safeBeat, beatInterval * 1000);
+                this.lastScheduleBeatTime = dayjs();
             } else {
                 log.info("monitor", `[${this.name}] isStop = true, no next check.`);
             }
@@ -1004,7 +1015,9 @@ class Monitor extends BeanModel {
         /** Get a heartbeat and handle errors */
         const safeBeat = async () => {
             try {
+                this.lastStartBeatTime = dayjs();
                 await beat();
+                this.lastEndBeatTime = dayjs();
             } catch (e) {
                 console.trace(e);
                 UptimeKumaServer.errorLog(e, false);
@@ -1013,6 +1026,9 @@ class Monitor extends BeanModel {
                 if (! this.isStop) {
                     log.info("monitor", "Try to restart the monitor");
                     this.heartbeatInterval = setTimeout(safeBeat, this.interval * 1000);
+                    this.lastScheduleBeatTime = dayjs();
+                } else {
+                    log.info("monitor", "isStop = true, no next check.");
                 }
             }
         };
diff --git a/server/uptime-kuma-server.js b/server/uptime-kuma-server.js
index 6acc8d4d..6b1d3d01 100644
--- a/server/uptime-kuma-server.js
+++ b/server/uptime-kuma-server.js
@@ -12,6 +12,7 @@ const { Settings } = require("./settings");
 const dayjs = require("dayjs");
 const childProcess = require("child_process");
 const path = require("path");
+const axios = require("axios");
 // DO NOT IMPORT HERE IF THE MODULES USED `UptimeKumaServer.getInstance()`, put at the bottom of this file instead.
 
 /**
@@ -62,6 +63,8 @@ class UptimeKumaServer {
      */
     jwtSecret = null;
 
+    checkMonitorsInterval = null;
+
     static getInstance(args) {
         if (UptimeKumaServer.instance == null) {
             UptimeKumaServer.instance = new UptimeKumaServer(args);
@@ -75,6 +78,9 @@ class UptimeKumaServer {
         const sslCert = args["ssl-cert"] || process.env.UPTIME_KUMA_SSL_CERT || process.env.SSL_CERT || undefined;
         const sslKeyPassphrase = args["ssl-key-passphrase"] || process.env.UPTIME_KUMA_SSL_KEY_PASSPHRASE || process.env.SSL_KEY_PASSPHRASE || undefined;
 
+        // Set default axios timeout to 5 minutes instead of infinity
+        axios.defaults.timeout = 300 * 1000;
+
         log.info("server", "Creating express and socket.io instance");
         this.app = express();
         if (sslKey && sslCert) {
@@ -346,6 +352,10 @@ class UptimeKumaServer {
         if (enable || enable === null) {
             this.startNSCDServices();
         }
+
+        this.checkMonitorsInterval = setInterval(() => {
+            this.checkMonitors();
+        }, 60 * 1000);
     }
 
     /**
@@ -358,6 +368,8 @@ class UptimeKumaServer {
         if (enable || enable === null) {
             this.stopNSCDServices();
         }
+
+        clearInterval(this.checkMonitorsInterval);
     }
 
     /**
@@ -388,6 +400,83 @@ class UptimeKumaServer {
             }
         }
     }
+
+    /**
+     * Mark the monitor as active in the database, reload it, and start it,
+     * stopping any instance already held in monitorList first
+     * @param {number} monitorID ID of monitor to start
+     * @returns {Promise<void>}
+     * @throws {Error} If the monitor cannot be loaded from the database
+     */
+    async startMonitor(monitorID) {
+        log.info("manage", `Resume Monitor: ${monitorID} by server`);
+
+        await R.exec("UPDATE monitor SET active = 1 WHERE id = ?", [ monitorID ]);
+        const monitor = await R.findOne("monitor", " id = ? ", [ monitorID ]);
+
+        // Fail with a clear message instead of a TypeError on `monitor.id` below
+        if (!monitor) {
+            throw new Error(`Monitor ${monitorID} not found`);
+        }
+
+        // Stop the instance currently in monitorList (if any) before replacing it
+        this.monitorList[monitor.id]?.stop();
+        this.monitorList[monitor.id] = monitor;
+        monitor.start(this.io);
+    }
+
+    /**
+     * Restart the given monitor; delegates to startMonitor()
+     * @param {number} monitorID ID of the monitor to restart
+     * @returns {Promise<void>}
+     */
+    async restartMonitor(monitorID) {
+        await this.startMonitor(monitorID);
+    }
+
+    /**
+     * Watchdog: restart active, non-push monitors whose last beat started
+     * more than 1.5x their interval ago
+     * @returns {Promise<void>}
+     */
+    async checkMonitors() {
+        log.debug("monitor_checker", "Checking monitors");
+
+        for (let monitorID in this.monitorList) {
+            let monitor = this.monitorList[monitorID];
+
+            // Not for push monitor
+            if (monitor.type === "push") {
+                continue;
+            }
+
+            if (!monitor.active) {
+                continue;
+            }
+
+            // Not scheduled/started yet, skip (both timestamps are used below)
+            if (!monitor.lastScheduleBeatTime || !monitor.lastStartBeatTime) {
+                continue;
+            }
+
+            // Check the lastStartBeatTime, if it is too long, then restart
+            let diff = dayjs().diff(monitor.lastStartBeatTime, "second");
+            if (diff > monitor.interval * 1.5) {
+                log.error("monitor_checker", `Monitor Interval: ${monitor.interval} Monitor ` + monitorID + " lastStartBeatTime diff: " + diff);
+                log.error("monitor_checker", "Unexpected error: Monitor " + monitorID + " is stuck for unknown reason");
+                log.error("monitor_checker", "Last start beat time: " + R.isoDateTime(monitor.lastStartBeatTime));
+                log.error("monitor_checker", "Last end beat time: " + R.isoDateTime(monitor.lastEndBeatTime));
+                log.error("monitor_checker", "Last ScheduleBeatTime: " + R.isoDateTime(monitor.lastScheduleBeatTime));
+                log.error("monitor_checker", `Restarting monitor ${monitorID} automatically now`);
+                // Not awaited so one restart cannot delay the others; a failure is logged, not left unhandled
+                this.restartMonitor(monitorID).catch((e) => {
+                    log.error("monitor_checker", `Failed to restart monitor ${monitorID}: ${e.message}`);
+                });
+            }
+        }
+
+        log.debug("monitor_checker", "Checking monitors end");
+    }
 }
 
 module.exports = {