[mod] mutex

This commit is contained in:
Christian Fraß 2023-03-22 11:06:32 +01:00
parent 760ae50e55
commit 386e1d5891
6 changed files with 166 additions and 128 deletions

View file

@ -16,6 +16,6 @@
"includes": [
"script.hmdl.json",
"file_state.hmdl.json",
"generic_remote.hmdl.json"
"generic_remote.hmdl.json"
]
}

View file

@ -6,6 +6,7 @@
"help.title": "Heimdall — Werkzeug zur System-Überwachung",
"help.args.conf_path": "Pfad zur Konfigurations-Datei",
"help.args.state_path": "Pfad zur Zustands-Datei, welche Daten über vorherige Prüfungen enthält; Standard-Wert: Pfad im temporären Verzeichnis des Systems mit eindeutigem Namen in Bezug auf den Pfad zur Konfigurations-Datei",
"help.args.mutex_path": "Pfad zur Datei zur Verhinderung paralleler Ausführung",
"help.args.send_ok_notifications": "ob '{{condition_name}}'-Zustände gemeldet werden sollen",
"help.args.language": "welche Sprache verwendet werden soll (statt der in den Umgebungs-Variablen gesetzten)",
"help.args.erase_state": "ob der Zustand bei Start gelöscht werden soll; das hat zur Folge, dass alle Prüfungen unmittelbar durchgeführt werden",
@ -22,5 +23,6 @@
"checks.http_request.header_value_mismatch": "Header-Wert für Schlüssel '{{key}}' '{{value_actual}}' stimmt nicht mit erwartetem Wert {{value_expected}} überein",
"checks.http_request.body_misses_part": "Rumpf enthält nicht den erwarteten Teil '{{part}}'",
"misc.state_file_path": "Pfad zur Zustands-Datei",
"misc.check_procedure_failed": "Prüfungs-Prozedur fehlgeschlagen"
"misc.check_procedure_failed": "Prüfungs-Prozedur fehlgeschlagen",
"misc.still_running": "läuft bereits/noch"
}

View file

@ -6,6 +6,7 @@
"help.title": "Heimdall — Monitoring Tool",
"help.args.conf_path": "path to the configuration file",
"help.args.state_path": "path to the state file, which contains information about the recent checks; default: file in temporary directory, unique for the conf-path input",
"help.args.mutex_path": "path to the file used for preventing parallel execution",
"help.args.send_ok_notifications": "whether an '{{condition_name}}' condition shall be reported",
"help.args.language": "language to use (instead of the one set in the environment variables)",
"help.args.erase_state": "whether the state shall be deleted on start; this causes all checks to be executed immediately",
@ -22,5 +23,6 @@
"checks.http_request.header_value_mismatch": "actual header value '{{value_actual}}' for key '{{key}}' does not match the expected value {{value_expected}}",
"checks.http_request.body_misses_part": "body does not contain the expected part '{{part}}'",
"misc.state_file_path": "state file path",
"misc.check_procedure_failed": "check procedure failed"
"misc.check_procedure_failed": "check procedure failed",
"misc.still_running": "already/still running"
}

View file

@ -1,27 +1,3 @@
def file_read(path):
    """
    Read a text file completely and return its content as a string.

    The file is opened in text mode ("r") with the platform's default
    encoding. The handle is managed by a ``with`` block, so it is closed
    even when reading raises.

    :param path: path of the file to read
    :return: the whole file content
    """
    with open(path, "r") as handle:
        return handle.read()
def file_write(path, content):
    """
    Write a string to a text file, replacing any previous content.

    The file is opened in mode "w" (created if missing, truncated
    otherwise). The handle is managed by a ``with`` block, so it is
    closed even when writing raises.

    :param path: path of the file to write
    :param content: string to store in the file
    """
    with open(path, "w") as handle:
        handle.write(content)
def string_coin(template, arguments):
    """
    Fill the ``{{name}}`` placeholders of a template string.

    For every key of ``arguments`` all occurrences of ``{{key}}`` are
    replaced by the associated value. The replacements are applied one
    after another in the iteration order of ``arguments``; placeholders
    without a matching key stay untouched.

    :param template: string containing ``{{name}}`` placeholders
    :param arguments: mapping from placeholder name to replacement string
    :return: the template with the placeholders substituted
    """
    coined = template
    for name in arguments:
        placeholder = "{{%s}}" % name
        coined = coined.replace(placeholder, arguments[name])
    return coined
def get_current_timestamp():
    """
    Return the current time as reported by ``_time.time()``, rounded to
    the nearest whole second and converted to ``int``.
    """
    seconds = round(_time.time(), 0)
    return int(seconds)
def dict_merge(core_dict, mantle_dict, recursive = False):
result_dict = {}
for current_dict in [core_dict, mantle_dict]:
@ -38,6 +14,36 @@ def dict_merge(core_dict, mantle_dict, recursive = False):
return result_dict
def file_read(path):
    """
    Read a text file completely and return its content as a string.

    The file is opened in text mode ("r") with the platform's default
    encoding. The handle is managed by a ``with`` block, so it is closed
    even when reading raises.

    :param path: path of the file to read
    :return: the whole file content
    """
    with open(path, "r") as handle:
        return handle.read()
def file_write(path, content, options = None):
    """
    Write a string to a text file.

    The handle is managed by a ``with`` block, so it is closed even when
    writing raises.

    :param path: path of the file to write
    :param content: string to store in the file
    :param options: optional dict of settings, merged over the defaults
        via ``dict_merge``; ``None`` means "use the defaults". Known keys:
        ``"append"`` (default ``False``) — when truthy the file is opened
        in mode "a" and the content is added at the end, otherwise it is
        opened in mode "w" and previous content is replaced
    """
    options = dict_merge(
        {
            "append": False,
        },
        ({} if (options is None) else options)
    )
    with open(path, "a" if options["append"] else "w") as handle:
        handle.write(content)
def string_coin(template, arguments):
    """
    Fill the ``{{name}}`` placeholders of a template string.

    For every key of ``arguments`` all occurrences of ``{{key}}`` are
    replaced by the associated value. The replacements are applied one
    after another in the iteration order of ``arguments``; placeholders
    without a matching key stay untouched.

    :param template: string containing ``{{name}}`` placeholders
    :param arguments: mapping from placeholder name to replacement string
    :return: the template with the placeholders substituted
    """
    coined = template
    for name in arguments:
        placeholder = "{{%s}}" % name
        coined = coined.replace(placeholder, arguments[name])
    return coined
def get_current_timestamp():
    """
    Return the current time as reported by ``_time.time()``, rounded to
    the nearest whole second and converted to ``int``.
    """
    seconds = round(_time.time(), 0)
    return int(seconds)
def env_get_language():
try:
env_lang = _os.environ.get("LANG")

View file

@ -41,6 +41,15 @@ def main():
metavar = "<state-path>",
help = translation_get("help.args.state_path"),
)
argumentparser.add_argument(
"-m",
"--mutex-path",
type = str,
default = "/tmp/heimdall.mutex",
dest = "mutex_path",
metavar = "<mutex-path>",
help = translation_get("help.args.mutex_path"),
)
argumentparser.add_argument(
"-y",
"--send-ok-notifications",
@ -151,119 +160,136 @@ def main():
)
)
### get state data
if (
(not _os.path.exists(state_path))
or
args.erase_state
):
state_data = {}
file_write(state_path, _json.dumps(state_data, indent = "\t"))
### mutex check
if (_os.path.exists(args.mutex_path)):
_sys.stderr.write(
string_coin(
"[error] {{message}} ({{path}})\n",
{
"message": translation_get("misc.still_running"),
"path": args.mutex_path,
}
)
)
_sys.exit(2)
else:
state_data = _json.loads(file_read(state_path))
### iterate through checks
for check_data in conf["checks"]:
if (not check_data["active"]):
pass
file_write(args.mutex_path, "", {"append": True})
### get state data
if (
(not _os.path.exists(state_path))
or
args.erase_state
):
state_data = {}
file_write(state_path, _json.dumps(state_data, indent = "\t"))
else:
### get old state and examine whether the check shall be executed
old_item_state = (
None
if (check_data["name"] not in state_data) else
state_decode(state_data[check_data["name"]])
)
timestamp = get_current_timestamp()
due = (
(old_item_state is None)
or
(old_item_state["condition"] != enum_condition.ok)
or
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["regular_interval"])
or
(
(old_item_state["count"] is not None)
and
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["attentive_interval"])
)
)
if (not due):
state_data = _json.loads(file_read(state_path))
### iterate through checks
for check_data in conf["checks"]:
if (not check_data["active"]):
pass
else:
_sys.stderr.write(
string_coin(
"-- {{check_name}}\n",
{
"check_name": check_data["name"],
}
### get old state and examine whether the check shall be executed
old_item_state = (
None
if (check_data["name"] not in state_data) else
state_decode(state_data[check_data["name"]])
)
timestamp = get_current_timestamp()
due = (
(old_item_state is None)
or
(old_item_state["condition"] != enum_condition.ok)
or
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["regular_interval"])
or
(
(old_item_state["count"] is not None)
and
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["attentive_interval"])
)
)
### execute check and set new state
try:
result = check_kind_implementations[check_data["kind"]].run(check_data["parameters"])
except Exception as error:
result = {
"condition": enum_condition.unknown,
"info": {
# "cause": translation_get("misc.check_procedure_failed"),
"error": str(error),
},
}
new_item_state = {
"timestamp": timestamp,
"condition": result["condition"],
"count": (
1
if (
(old_item_state is None)
or
(old_item_state["condition"] != result["condition"])
) else
(
(old_item_state["count"] + 1)
if (not due):
pass
else:
_sys.stderr.write(
string_coin(
"-- {{check_name}}\n",
{
"check_name": check_data["name"],
}
)
)
### execute check and set new state
try:
result = check_kind_implementations[check_data["kind"]].run(check_data["parameters"])
except Exception as error:
result = {
"condition": enum_condition.unknown,
"info": {
# "cause": translation_get("misc.check_procedure_failed"),
"error": str(error),
},
}
new_item_state = {
"timestamp": timestamp,
"condition": result["condition"],
"count": (
1
if (
(old_item_state["count"] is not None)
and
((old_item_state["count"] + 1) <= check_data["threshold"])
(old_item_state is None)
or
(old_item_state["condition"] != result["condition"])
) else
None
)
),
}
state_data[check_data["name"]] = state_encode(new_item_state)
file_write(state_path, _json.dumps(state_data, indent = "\t"))
### send notifications
if (
(
(
(old_item_state["count"] + 1)
if (
(old_item_state["count"] is not None)
and
((old_item_state["count"] + 1) <= check_data["threshold"])
) else
None
)
),
}
state_data[check_data["name"]] = state_encode(new_item_state)
file_write(state_path, _json.dumps(state_data, indent = "\t"))
### send notifications
if (
(
(new_item_state["count"] is not None)
and
(new_item_state["count"] == check_data["threshold"])
(
(new_item_state["count"] is not None)
and
(new_item_state["count"] == check_data["threshold"])
)
or
(
(new_item_state["count"] is None)
and
check_data["annoy"]
)
)
or
and
(
(new_item_state["count"] is None)
and
check_data["annoy"]
)
)
and
(
(new_item_state["condition"] != enum_condition.ok)
or
args.send_ok_notifications
)
):
for notification in check_data["notifications"]:
notification_channel_implementations[notification["kind"]].notify(
notification["parameters"],
check_data["name"],
check_data,
new_item_state,
result["info"]
(new_item_state["condition"] != enum_condition.ok)
or
args.send_ok_notifications
)
):
for notification in check_data["notifications"]:
notification_channel_implementations[notification["kind"]].notify(
notification["parameters"],
check_data["name"],
check_data,
new_item_state,
result["info"]
)
_os.remove(args.mutex_path)
main()

View file

@ -1,4 +1,6 @@
- parallele Zugriffe auf die Zustands-Datei verhindern
- mutex?
- auf DB umstellen?
- Benachrichtigungen versenden, wenn ein Zustand sich wieder normalisiert hat (aber vorher über dem Schwellwert oft nicht OK war)
- erneute Benachrichtigung über nicht-OK-Zustand nach einer Weile (siehe https://gitlab.greenscale.de/tools/heimdall/-/issues/3)
- längere Statistiken über Metriken führen um auch Anstiege/Abfälle auszuwerten (z.B. "Speicherplatzverbrauch innerhalb einer Woche um 5GB gestiegen")