From 386e1d5891dc7a801ba7b81e3d2ea4a90d4b4e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20Fra=C3=9F?= Date: Wed, 22 Mar 2023 11:06:32 +0100 Subject: [PATCH] [mod] mutex --- examples/main.hmdl.json | 2 +- source/localization/de.json | 4 +- source/localization/en.json | 4 +- source/logic/lib.py | 54 +++++---- source/logic/main.py | 228 ++++++++++++++++++++---------------- todo.md | 2 + 6 files changed, 166 insertions(+), 128 deletions(-) diff --git a/examples/main.hmdl.json b/examples/main.hmdl.json index 0ea0fb2..7b6abb1 100644 --- a/examples/main.hmdl.json +++ b/examples/main.hmdl.json @@ -16,6 +16,6 @@ "includes": [ "script.hmdl.json", "file_state.hmdl.json", - "generic_remote.hmdl.json" + "generic_remote.hmdl.json" ] } diff --git a/source/localization/de.json b/source/localization/de.json index 1f510ea..af7b4f2 100644 --- a/source/localization/de.json +++ b/source/localization/de.json @@ -6,6 +6,7 @@ "help.title": "Heimdall — Werkzeug zur System-Überwachung", "help.args.conf_path": "Pfad zur Konfigurations-Datei", "help.args.state_path": "Pfad zur Zustands-Datei, welche Daten über vorherige Prüfungen enthält; Standard-Wert: Pfad im temporären Verzeichnis des Systems mit eindeutigem Namen in Bezug auf den Pfad zur Konfigurations-Datei", + "help.args.mutex_path": "Pfad zur Datei zur Verhinderung paralleler Ausführung", "help.args.send_ok_notifications": "ob '{{condition_name}}'-Zustände gemeldet werden sollen", "help.args.language": "welche Sprache verwendet werden soll (statt der in den Umgebungs-Variablen gesetzten)", "help.args.erase_state": "ob der Zustand bei Start gelöscht werden soll; das hat zur Folge, dass alle Prüfungen unmittelbar durchgeführt werden", @@ -22,5 +23,6 @@ "checks.http_request.header_value_mismatch": "Header-Wert für Schlüssel '{{key}}' '{{value_actual}}' stimmt nicht mit erwartetem Wert {{value_expected}} überein", "checks.http_request.body_misses_part": "Rumpf enthält nicht den erwarteten Teil '{{part}}'", "misc.state_file_path": "Pfad zur Zustands-Datei", - "misc.check_procedure_failed": "Prüfungs-Prozedur fehlgeschlagen" + "misc.check_procedure_failed": "Prüfungs-Prozedur fehlgeschlagen", + "misc.still_running": "läuft bereits/noch" } diff --git a/source/localization/en.json b/source/localization/en.json index 77ef5b7..9ace217 100644 --- a/source/localization/en.json +++ b/source/localization/en.json @@ -6,6 +6,7 @@ "help.title": "Heimdall — Monitoring Tool", "help.args.conf_path": "path to the configuration file", "help.args.state_path": "path to the state file, which contains information about the recent checks; default: file in temporary directory, unique for the conf-path input", + "help.args.mutex_path": "path to file for preventing mutual execution", "help.args.send_ok_notifications": "whether an '{{condition_name}}' condition shall be reported", "help.args.language": "language to use (instead of the language, set in the environment variables)", "help.args.erase_state": "whether the state shall be deleted on start; this will cause that all checks are executed immediatly", @@ -22,5 +23,6 @@ "checks.http_request.header_value_mismatch": "actual header value for key '{{key}}' '{{value_actual}}' and does not match the expected value {{value_expected}}", "checks.http_request.body_misses_part": "body does not contain the expected part '{{part}}'", "misc.state_file_path": "state file path", - "misc.check_procedure_failed": "check procedure failed" + "misc.check_procedure_failed": "check procedure failed", + "misc.still_running": "already/still running" } diff --git a/source/logic/lib.py b/source/logic/lib.py index de3d928..f57aa5e 100644 --- a/source/logic/lib.py +++ b/source/logic/lib.py @@ -1,27 +1,3 @@ -def file_read(path): - handle = open(path, "r") - content = handle.read() - handle.close() - return content - - -def file_write(path, content): - handle = open(path, "w") - handle.write(content) - handle.close() - - -def string_coin(template, arguments): - result = template - for (key, value, ) in arguments.items(): - result = result.replace("{{%s}}" % key, value) - return result - - -def get_current_timestamp(): - return int(round(_time.time(), 0)) - - def dict_merge(core_dict, mantle_dict, recursive = False): result_dict = {} for current_dict in [core_dict, mantle_dict]: @@ -38,6 +14,36 @@ def dict_merge(core_dict, mantle_dict, recursive = False): return result_dict +def file_read(path): + handle = open(path, "r") + content = handle.read() + handle.close() + return content + + +def file_write(path, content, options = None): + options = dict_merge( + { + "append": False, + }, + ({} if (options is None) else options) + ) + handle = open(path, "a" if options["append"] else "w") + handle.write(content) + handle.close() + + +def string_coin(template, arguments): + result = template + for (key, value, ) in arguments.items(): + result = result.replace("{{%s}}" % key, value) + return result + + +def get_current_timestamp(): + return int(round(_time.time(), 0)) + + def env_get_language(): try: env_lang = _os.environ.get("LANG") diff --git a/source/logic/main.py b/source/logic/main.py index 864c1bb..20654e6 100644 --- a/source/logic/main.py +++ b/source/logic/main.py @@ -41,6 +41,15 @@ def main(): metavar = "", help = translation_get("help.args.state_path"), ) + argumentparser.add_argument( + "-m", + "--mutex-path", + type = str, + default = "/tmp/heimdall.mutex", + dest = "mutex_path", + metavar = "", + help = translation_get("help.args.mutex_path"), + ) argumentparser.add_argument( "-y", "--send-ok-notifications", @@ -151,119 +160,136 @@ def main(): ) ) - ### get state data - if ( - (not _os.path.exists(state_path)) - or - args.erase_state - ): - state_data = {} - file_write(state_path, _json.dumps(state_data, indent = "\t")) + ### mutex check + if (_os.path.exists(args.mutex_path)): + _sys.stderr.write( + string_coin( + "[error] {{message}} ({{path}})\n", + { + "message": translation_get("misc.still_running"), + "path": args.mutex_path, + } + ) + ) + _sys.exit(2) else: - state_data = _json.loads(file_read(state_path)) - - ### iterate through checks - for check_data in conf["checks"]: - if (not check_data["active"]): - pass + file_write(args.mutex_path, "", {"append": True}) + + ### get state data + if ( + (not _os.path.exists(state_path)) + or + args.erase_state + ): + state_data = {} + file_write(state_path, _json.dumps(state_data, indent = "\t")) else: - ### get old state and examine whether the check shall be executed - old_item_state = ( - None - if (check_data["name"] not in state_data) else - state_decode(state_data[check_data["name"]]) - ) - timestamp = get_current_timestamp() - due = ( - (old_item_state is None) - or - (old_item_state["condition"] != enum_condition.ok) - or - ((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["regular_interval"]) - or - ( - (old_item_state["count"] is not None) - and - ((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["attentive_interval"]) - ) - ) - if (not due): + state_data = _json.loads(file_read(state_path)) + + ### iterate through checks + for check_data in conf["checks"]: + if (not check_data["active"]): pass else: - _sys.stderr.write( - string_coin( - "-- {{check_name}}\n", - { - "check_name": check_data["name"], - } + ### get old state and examine whether the check shall be executed + old_item_state = ( + None + if (check_data["name"] not in state_data) else + state_decode(state_data[check_data["name"]]) + ) + timestamp = get_current_timestamp() + due = ( + (old_item_state is None) + or + (old_item_state["condition"] != enum_condition.ok) + or + ((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["regular_interval"]) + or + ( + (old_item_state["count"] is not None) + and + ((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["attentive_interval"]) ) ) - - ### execute check and set new state - try: - result = check_kind_implementations[check_data["kind"]].run(check_data["parameters"]) - except Exception as error: - result = { - "condition": enum_condition.unknown, - "info": { - # "cause": translation_get("misc.check_procedure_failed"), - "error": str(error), - }, - } - new_item_state = { - "timestamp": timestamp, - "condition": result["condition"], - "count": ( - 1 - if ( - (old_item_state is None) - or - (old_item_state["condition"] != result["condition"]) - ) else - ( - (old_item_state["count"] + 1) + if (not due): + pass + else: + _sys.stderr.write( + string_coin( + "-- {{check_name}}\n", + { + "check_name": check_data["name"], + } + ) + ) + + ### execute check and set new state + try: + result = check_kind_implementations[check_data["kind"]].run(check_data["parameters"]) + except Exception as error: + result = { + "condition": enum_condition.unknown, + "info": { + # "cause": translation_get("misc.check_procedure_failed"), + "error": str(error), + }, + } + new_item_state = { + "timestamp": timestamp, + "condition": result["condition"], + "count": ( + 1 if ( - (old_item_state["count"] is not None) - and - ((old_item_state["count"] + 1) <= check_data["threshold"]) + (old_item_state is None) + or + (old_item_state["condition"] != result["condition"]) ) else - None - ) - ), - } - state_data[check_data["name"]] = state_encode(new_item_state) - file_write(state_path, _json.dumps(state_data, indent = "\t")) - - ### send notifications - if ( - ( + ( + (old_item_state["count"] + 1) + if ( + (old_item_state["count"] is not None) + and + ((old_item_state["count"] + 1) <= check_data["threshold"]) + ) else + None + ) + ), + } + state_data[check_data["name"]] = state_encode(new_item_state) + file_write(state_path, _json.dumps(state_data, indent = "\t")) + + ### send notifications + if ( ( - (new_item_state["count"] is not None) - and - (new_item_state["count"] == check_data["threshold"]) + ( + (new_item_state["count"] is not None) + and + (new_item_state["count"] == check_data["threshold"]) + ) + or + ( + (new_item_state["count"] is None) + and + check_data["annoy"] + ) ) - or + and ( - (new_item_state["count"] is None) - and - check_data["annoy"] - ) - ) - and - ( - (new_item_state["condition"] != enum_condition.ok) - or - args.send_ok_notifications - ) - ): - for notification in check_data["notifications"]: - notification_channel_implementations[notification["kind"]].notify( - notification["parameters"], - check_data["name"], - check_data, - new_item_state, - result["info"] + (new_item_state["condition"] != enum_condition.ok) + or + args.send_ok_notifications ) + ): + for notification in check_data["notifications"]: + notification_channel_implementations[notification["kind"]].notify( + notification["parameters"], + check_data["name"], + check_data, + new_item_state, + result["info"] + ) + + _os.remove(args.mutex_path) main() diff --git a/todo.md b/todo.md index 0766f5e..db8e275 100644 --- a/todo.md +++ b/todo.md @@ -1,4 +1,6 @@ - parallele Zugriffe auf die Zustands-Datei verhindern + - mutex? + - auf DB umstellen? - Benachrichtigungen versenden, wenn ein Zustand sich wieder normalisiert hat (aber vorher über dem Schwellwert oft nicht OK war) - erneute Benachrichtigung über nicht-OK-Zustand nach einer Weile (siehe https://gitlab.greenscale.de/tools/heimdall/-/issues/3) - längere Statistiken über Metriken führen um auch Anstiege/Abfälle auszuwerten (z.B. "Speicherplatzverbrauch innerhalb einer Woche um 5GB gestiegen")