[mod] mutex
This commit is contained in:
parent
760ae50e55
commit
386e1d5891
|
|
@ -16,6 +16,6 @@
|
|||
"includes": [
|
||||
"script.hmdl.json",
|
||||
"file_state.hmdl.json",
|
||||
"generic_remote.hmdl.json"
|
||||
"generic_remote.hmdl.json"
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
"help.title": "Heimdall — Werkzeug zur System-Überwachung",
|
||||
"help.args.conf_path": "Pfad zur Konfigurations-Datei",
|
||||
"help.args.state_path": "Pfad zur Zustands-Datei, welche Daten über vorherige Prüfungen enthält; Standard-Wert: Pfad im temporären Verzeichnis des Systems mit eindeutigem Namen in Bezug auf den Pfad zur Konfigurations-Datei",
|
||||
"help.args.mutex_path": "Pfad zur Datei zur Verhinderung paralleler Ausführung",
|
||||
"help.args.send_ok_notifications": "ob '{{condition_name}}'-Zustände gemeldet werden sollen",
|
||||
"help.args.language": "welche Sprache verwendet werden soll (statt der in den Umgebungs-Variablen gesetzten)",
|
||||
"help.args.erase_state": "ob der Zustand bei Start gelöscht werden soll; das hat zur Folge, dass alle Prüfungen unmittelbar durchgeführt werden",
|
||||
|
|
@ -22,5 +23,6 @@
|
|||
"checks.http_request.header_value_mismatch": "Header-Wert für Schlüssel '{{key}}' '{{value_actual}}' stimmt nicht mit erwartetem Wert {{value_expected}} überein",
|
||||
"checks.http_request.body_misses_part": "Rumpf enthält nicht den erwarteten Teil '{{part}}'",
|
||||
"misc.state_file_path": "Pfad zur Zustands-Datei",
|
||||
"misc.check_procedure_failed": "Prüfungs-Prozedur fehlgeschlagen"
|
||||
"misc.check_procedure_failed": "Prüfungs-Prozedur fehlgeschlagen",
|
||||
"misc.still_running": "läuft bereits/noch"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
"help.title": "Heimdall — Monitoring Tool",
|
||||
"help.args.conf_path": "path to the configuration file",
|
||||
"help.args.state_path": "path to the state file, which contains information about the recent checks; default: file in temporary directory, unique for the conf-path input",
|
||||
"help.args.mutex_path": "path to file for preventing mutual execution",
|
||||
"help.args.send_ok_notifications": "whether an '{{condition_name}}' condition shall be reported",
|
||||
"help.args.language": "language to use (instead of the language, set in the environment variables)",
|
||||
"help.args.erase_state": "whether the state shall be deleted on start; this will cause that all checks are executed immediatly",
|
||||
|
|
@ -22,5 +23,6 @@
|
|||
"checks.http_request.header_value_mismatch": "actual header value for key '{{key}}' '{{value_actual}}' and does not match the expected value {{value_expected}}",
|
||||
"checks.http_request.body_misses_part": "body does not contain the expected part '{{part}}'",
|
||||
"misc.state_file_path": "state file path",
|
||||
"misc.check_procedure_failed": "check procedure failed"
|
||||
"misc.check_procedure_failed": "check procedure failed",
|
||||
"misc.still_running": "already/still running"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,27 +1,3 @@
|
|||
def file_read(path):
    """
    Read the whole content of a text file.

    :param path: path of the file to read
    :return: the content of the file as a string
    :raises OSError: if the file can not be opened or read
    """
    # the context manager closes the handle even when reading raises
    with open(path, "r") as handle:
        return handle.read()
|
||||
|
||||
|
||||
def file_write(path, content):
    """
    Write a string to a file, replacing any previous content.

    :param path: path of the file to write
    :param content: the string to store in the file
    :raises OSError: if the file can not be opened or written
    """
    # the context manager flushes and closes the handle even when writing raises
    with open(path, "w") as handle:
        handle.write(content)
|
||||
|
||||
|
||||
def string_coin(template, arguments):
    """
    Fill the placeholders of a template string.

    Every occurrence of "{{<key>}}" in the template is replaced by the
    value stored under <key> in the arguments; the keys are processed one
    after another in the iteration order of the mapping.

    :param template: string containing "{{name}}" placeholders
    :param arguments: mapping from placeholder name to replacement string
    :return: the template with all known placeholders substituted
    """
    text = template
    for name in arguments:
        placeholder = "{{%s}}" % name
        text = text.replace(placeholder, arguments[name])
    return text
|
||||
|
||||
|
||||
def get_current_timestamp():
    """
    Return the current time as Unix timestamp in whole seconds.

    :return: the value of the system clock, rounded to the nearest second, as int
    """
    seconds = _time.time()
    rounded = round(seconds, 0)
    return int(rounded)
|
||||
|
||||
|
||||
def dict_merge(core_dict, mantle_dict, recursive = False):
|
||||
result_dict = {}
|
||||
for current_dict in [core_dict, mantle_dict]:
|
||||
|
|
@ -38,6 +14,36 @@ def dict_merge(core_dict, mantle_dict, recursive = False):
|
|||
return result_dict
|
||||
|
||||
|
||||
def file_read(path):
    """
    Read the whole content of a text file.

    :param path: path of the file to read
    :return: the content of the file as a string
    :raises OSError: if the file can not be opened or read
    """
    # the context manager closes the handle even when reading raises
    with open(path, "r") as handle:
        return handle.read()
|
||||
|
||||
|
||||
def file_write(path, content, options = None):
    """
    Write a string to a file.

    :param path: path of the file to write
    :param content: the string to store in the file
    :param options: optional dict of settings; the only key read here is
        "append" (default: False) -- if true, the content is added to the
        end of the file instead of replacing the previous content
    :raises OSError: if the file can not be opened or written
    """
    # fill in the defaults for all settings the caller did not specify
    # NOTE(review): relies on dict_merge (defined elsewhere in this file)
    # letting the second dict override the first -- confirm
    options = dict_merge(
        {
            "append": False,
        },
        ({} if (options is None) else options)
    )
    # the context manager flushes and closes the handle even when writing raises
    with open(path, ("a" if options["append"] else "w")) as handle:
        handle.write(content)
|
||||
|
||||
|
||||
def string_coin(template, arguments):
    """
    Fill the placeholders of a template string.

    Every occurrence of "{{<key>}}" in the template is replaced by the
    value stored under <key> in the arguments; the keys are processed one
    after another in the iteration order of the mapping.

    :param template: string containing "{{name}}" placeholders
    :param arguments: mapping from placeholder name to replacement string
    :return: the template with all known placeholders substituted
    """
    text = template
    for name in arguments:
        placeholder = "{{%s}}" % name
        text = text.replace(placeholder, arguments[name])
    return text
|
||||
|
||||
|
||||
def get_current_timestamp():
    """
    Return the current time as Unix timestamp in whole seconds.

    :return: the value of the system clock, rounded to the nearest second, as int
    """
    seconds = _time.time()
    rounded = round(seconds, 0)
    return int(rounded)
|
||||
|
||||
|
||||
def env_get_language():
|
||||
try:
|
||||
env_lang = _os.environ.get("LANG")
|
||||
|
|
|
|||
|
|
@ -41,6 +41,15 @@ def main():
|
|||
metavar = "<state-path>",
|
||||
help = translation_get("help.args.state_path"),
|
||||
)
|
||||
argumentparser.add_argument(
|
||||
"-m",
|
||||
"--mutex-path",
|
||||
type = str,
|
||||
default = "/tmp/heimdall.mutex",
|
||||
dest = "mutex_path",
|
||||
metavar = "<mutex-path>",
|
||||
help = translation_get("help.args.mutex_path"),
|
||||
)
|
||||
argumentparser.add_argument(
|
||||
"-y",
|
||||
"--send-ok-notifications",
|
||||
|
|
@ -151,119 +160,136 @@ def main():
|
|||
)
|
||||
)
|
||||
|
||||
### get state data
|
||||
if (
|
||||
(not _os.path.exists(state_path))
|
||||
or
|
||||
args.erase_state
|
||||
):
|
||||
state_data = {}
|
||||
file_write(state_path, _json.dumps(state_data, indent = "\t"))
|
||||
### mutex check
|
||||
if (_os.path.exists(args.mutex_path)):
|
||||
_sys.stderr.write(
|
||||
string_coin(
|
||||
"[error] {{message}} ({{path}})\n",
|
||||
{
|
||||
"message": translation_get("misc.still_running"),
|
||||
"path": args.mutex_path,
|
||||
}
|
||||
)
|
||||
)
|
||||
_sys.exit(2)
|
||||
else:
|
||||
state_data = _json.loads(file_read(state_path))
|
||||
|
||||
### iterate through checks
|
||||
for check_data in conf["checks"]:
|
||||
if (not check_data["active"]):
|
||||
pass
|
||||
file_write(args.mutex_path, "", {"append": True})
|
||||
|
||||
### get state data
|
||||
if (
|
||||
(not _os.path.exists(state_path))
|
||||
or
|
||||
args.erase_state
|
||||
):
|
||||
state_data = {}
|
||||
file_write(state_path, _json.dumps(state_data, indent = "\t"))
|
||||
else:
|
||||
### get old state and examine whether the check shall be executed
|
||||
old_item_state = (
|
||||
None
|
||||
if (check_data["name"] not in state_data) else
|
||||
state_decode(state_data[check_data["name"]])
|
||||
)
|
||||
timestamp = get_current_timestamp()
|
||||
due = (
|
||||
(old_item_state is None)
|
||||
or
|
||||
(old_item_state["condition"] != enum_condition.ok)
|
||||
or
|
||||
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["regular_interval"])
|
||||
or
|
||||
(
|
||||
(old_item_state["count"] is not None)
|
||||
and
|
||||
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["attentive_interval"])
|
||||
)
|
||||
)
|
||||
if (not due):
|
||||
state_data = _json.loads(file_read(state_path))
|
||||
|
||||
### iterate through checks
|
||||
for check_data in conf["checks"]:
|
||||
if (not check_data["active"]):
|
||||
pass
|
||||
else:
|
||||
_sys.stderr.write(
|
||||
string_coin(
|
||||
"-- {{check_name}}\n",
|
||||
{
|
||||
"check_name": check_data["name"],
|
||||
}
|
||||
### get old state and examine whether the check shall be executed
|
||||
old_item_state = (
|
||||
None
|
||||
if (check_data["name"] not in state_data) else
|
||||
state_decode(state_data[check_data["name"]])
|
||||
)
|
||||
timestamp = get_current_timestamp()
|
||||
due = (
|
||||
(old_item_state is None)
|
||||
or
|
||||
(old_item_state["condition"] != enum_condition.ok)
|
||||
or
|
||||
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["regular_interval"])
|
||||
or
|
||||
(
|
||||
(old_item_state["count"] is not None)
|
||||
and
|
||||
((timestamp - old_item_state["timestamp"]) >= check_data["schedule"]["attentive_interval"])
|
||||
)
|
||||
)
|
||||
|
||||
### execute check and set new state
|
||||
try:
|
||||
result = check_kind_implementations[check_data["kind"]].run(check_data["parameters"])
|
||||
except Exception as error:
|
||||
result = {
|
||||
"condition": enum_condition.unknown,
|
||||
"info": {
|
||||
# "cause": translation_get("misc.check_procedure_failed"),
|
||||
"error": str(error),
|
||||
},
|
||||
}
|
||||
new_item_state = {
|
||||
"timestamp": timestamp,
|
||||
"condition": result["condition"],
|
||||
"count": (
|
||||
1
|
||||
if (
|
||||
(old_item_state is None)
|
||||
or
|
||||
(old_item_state["condition"] != result["condition"])
|
||||
) else
|
||||
(
|
||||
(old_item_state["count"] + 1)
|
||||
if (not due):
|
||||
pass
|
||||
else:
|
||||
_sys.stderr.write(
|
||||
string_coin(
|
||||
"-- {{check_name}}\n",
|
||||
{
|
||||
"check_name": check_data["name"],
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
### execute check and set new state
|
||||
try:
|
||||
result = check_kind_implementations[check_data["kind"]].run(check_data["parameters"])
|
||||
except Exception as error:
|
||||
result = {
|
||||
"condition": enum_condition.unknown,
|
||||
"info": {
|
||||
# "cause": translation_get("misc.check_procedure_failed"),
|
||||
"error": str(error),
|
||||
},
|
||||
}
|
||||
new_item_state = {
|
||||
"timestamp": timestamp,
|
||||
"condition": result["condition"],
|
||||
"count": (
|
||||
1
|
||||
if (
|
||||
(old_item_state["count"] is not None)
|
||||
and
|
||||
((old_item_state["count"] + 1) <= check_data["threshold"])
|
||||
(old_item_state is None)
|
||||
or
|
||||
(old_item_state["condition"] != result["condition"])
|
||||
) else
|
||||
None
|
||||
)
|
||||
),
|
||||
}
|
||||
state_data[check_data["name"]] = state_encode(new_item_state)
|
||||
file_write(state_path, _json.dumps(state_data, indent = "\t"))
|
||||
|
||||
### send notifications
|
||||
if (
|
||||
(
|
||||
(
|
||||
(old_item_state["count"] + 1)
|
||||
if (
|
||||
(old_item_state["count"] is not None)
|
||||
and
|
||||
((old_item_state["count"] + 1) <= check_data["threshold"])
|
||||
) else
|
||||
None
|
||||
)
|
||||
),
|
||||
}
|
||||
state_data[check_data["name"]] = state_encode(new_item_state)
|
||||
file_write(state_path, _json.dumps(state_data, indent = "\t"))
|
||||
|
||||
### send notifications
|
||||
if (
|
||||
(
|
||||
(new_item_state["count"] is not None)
|
||||
and
|
||||
(new_item_state["count"] == check_data["threshold"])
|
||||
(
|
||||
(new_item_state["count"] is not None)
|
||||
and
|
||||
(new_item_state["count"] == check_data["threshold"])
|
||||
)
|
||||
or
|
||||
(
|
||||
(new_item_state["count"] is None)
|
||||
and
|
||||
check_data["annoy"]
|
||||
)
|
||||
)
|
||||
or
|
||||
and
|
||||
(
|
||||
(new_item_state["count"] is None)
|
||||
and
|
||||
check_data["annoy"]
|
||||
)
|
||||
)
|
||||
and
|
||||
(
|
||||
(new_item_state["condition"] != enum_condition.ok)
|
||||
or
|
||||
args.send_ok_notifications
|
||||
)
|
||||
):
|
||||
for notification in check_data["notifications"]:
|
||||
notification_channel_implementations[notification["kind"]].notify(
|
||||
notification["parameters"],
|
||||
check_data["name"],
|
||||
check_data,
|
||||
new_item_state,
|
||||
result["info"]
|
||||
(new_item_state["condition"] != enum_condition.ok)
|
||||
or
|
||||
args.send_ok_notifications
|
||||
)
|
||||
):
|
||||
for notification in check_data["notifications"]:
|
||||
notification_channel_implementations[notification["kind"]].notify(
|
||||
notification["parameters"],
|
||||
check_data["name"],
|
||||
check_data,
|
||||
new_item_state,
|
||||
result["info"]
|
||||
)
|
||||
|
||||
_os.remove(args.mutex_path)
|
||||
|
||||
|
||||
main()
|
||||
|
|
|
|||
2
todo.md
2
todo.md
|
|
@ -1,4 +1,6 @@
|
|||
- parallele Zugriffe auf die Zustands-Datei verhindern
|
||||
- mutex?
|
||||
- auf DB umstellen?
|
||||
- Benachrichtigungen versenden, wenn ein Zustand sich wieder normalisiert hat (aber vorher über dem Schwellwert oft nicht OK war)
|
||||
- erneute Benachrichtigung über nicht-OK-Zustand nach einer Weile (siehe https://gitlab.greenscale.de/tools/heimdall/-/issues/3)
|
||||
- längere Statistiken über Metriken führen um auch Anstiege/Abfälle auszuwerten (z.B. "Speicherplatzverbrauch innerhalb einer Woche um 5GB gestiegen")
|
||||
|
|
|
|||
Loading…
Reference in a new issue