From aa1c2d07814d22b77083e8fdaaebc1de72ac663e Mon Sep 17 00:00:00 2001
From: Fredrik Eriksson
Date: Sat, 13 Jul 2024 19:32:04 +0200
Subject: [PATCH] Initial commit

---
 pyproject.toml       |  13 +++++
 sysalert/__init__.py |   0
 sysalert/db.py       | 130 +++++++++++++++++++++++++++++++++++++++++++
 sysalert/email.py    |  98 ++++++++++++++++++++++++++++++++
 sysalert/util.py     |  92 ++++++++++++++++++++++++++++++
 5 files changed, 333 insertions(+)
 create mode 100644 pyproject.toml
 create mode 100644 sysalert/__init__.py
 create mode 100644 sysalert/db.py
 create mode 100644 sysalert/email.py
 create mode 100644 sysalert/util.py

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..cc34ed9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,13 @@
+[build-system]
+requires = ["setuptools", "systemd-python"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "sysalert"
+version = "0.1.0"
+dependencies = [
+    "systemd-python"
+]
+
+[project.scripts]
+sysalert = "sysalert.util:cli"
diff --git a/sysalert/__init__.py b/sysalert/__init__.py
new file mode 100644
index 0000000..e69de29
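Not part of this patch: the $MONITOR_* variables the code below reads are only populated by systemd (version 251 and later) in units activated through OnFailure= or OnSuccess=. One plausible wiring, using hypothetical unit names, is a template unit that hands the monitored unit's name to the script as argv[1]:

    # /etc/systemd/system/sysalert@.service (hypothetical)
    [Service]
    Type=oneshot
    ExecStart=/usr/bin/sysalert %i

    # drop-in for each monitored unit, e.g. foo.service.d/sysalert.conf
    [Unit]
    OnFailure=sysalert@%n.service
    OnSuccess=sysalert@%n.service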
diff --git a/sysalert/db.py b/sysalert/db.py
new file mode 100644
index 0000000..63ca3f8
--- /dev/null
+++ b/sysalert/db.py
@@ -0,0 +1,130 @@
+import datetime
+import os
+import sqlite3
+
+def register_success(name, db):
+    cur = db.cursor()
+    cur.execute('''
+        DELETE FROM alert
+        WHERE failure IN (
+            SELECT c.id
+            FROM failure c INNER JOIN service s
+            ON c.service=s.id
+            WHERE s.name=?
+        );
+    ''', (name,))
+    cur.execute('''
+        DELETE FROM failure
+        WHERE service IN (
+            SELECT id FROM service WHERE name=?
+        );
+    ''', (name,))
+    cur.execute('''
+        DELETE FROM service WHERE name=?;
+    ''', (name,))
+    db.commit()
+
+def register_failure(name, alert_method, db):
+    cur = db.cursor()
+    cur.execute('INSERT OR IGNORE INTO service(name) VALUES (?);', (name,))
+    cur.execute('SELECT id FROM service WHERE name=?;', (name,))
+    (service_id,) = cur.fetchone()
+    cur.execute('''INSERT INTO failure
+        (service, service_result, exit_code, exit_status, invocation_id, timestamp)
+        VALUES
+        (?, ?, ?, ?, ?, ?)
+        RETURNING id;
+    ''', (
+        service_id,
+        os.environ['MONITOR_SERVICE_RESULT'],
+        os.environ['MONITOR_EXIT_CODE'],
+        os.environ['MONITOR_EXIT_STATUS'],
+        os.environ['MONITOR_INVOCATION_ID'],
+        datetime.datetime.now()
+    ))
+    (failure_id,) = cur.fetchone()
+    if alert_method:
+        cur.execute('''INSERT INTO alert
+            (failure, timestamp, method)
+            VALUES
+            (?, ?, ?);
+        ''', (
+            failure_id,
+            datetime.datetime.now(),
+            alert_method
+        ))
+    db.commit()
+
+
+
+def get_failures(name, db):
+    cur = db.cursor()
+    cur.execute('''SELECT
+            failure.id, service_result, exit_code, exit_status, invocation_id, timestamp
+        FROM failure
+        INNER JOIN service ON
+            failure.service=service.id
+        WHERE service.name=?;
+    ''', (name,))
+
+    failures = []
+    for f in cur.fetchall():
+        cur.execute('SELECT method FROM alert WHERE failure=?;', (f[0],))
+        alert_method = cur.fetchone()
+        if alert_method:
+            alert_method = alert_method[0]
+        failures.append({
+            'service_result': f[1],
+            'exit_code': f[2],
+            'exit_status': f[3],
+            'invocation_id': f[4],
+            'timestamp': f[5],
+            'alert_method': alert_method
+        })
+    db.commit()
+
+    failures.sort(key=lambda x: x['timestamp'])
+    return failures
+
+
+def init(path):
+    con = sqlite3.connect(path, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
+    cur = con.cursor()
+    cur.execute('''
+        CREATE TABLE IF NOT EXISTS
+        service (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT UNIQUE
+        );
+    ''')
+    cur.execute('''
+        CREATE TABLE IF NOT EXISTS
+        failure (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            service INTEGER,
+            service_result TEXT,
+            exit_code TEXT,
+            exit_status INTEGER,
+            invocation_id TEXT,
+            timestamp timestamp,
+
+            FOREIGN KEY(service) REFERENCES service(id)
+        );
+    ''')
+    cur.execute('''
+        CREATE TABLE IF NOT EXISTS
+        alert (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            failure INTEGER,
+            timestamp timestamp,
+            method TEXT,
+
+            FOREIGN KEY(failure) REFERENCES failure(id)
+        );
+    ''')
+    con.commit()
+    return con
+
+def close(con):
+    con.close()
+
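Not part of the patch: a quick sanity check of the layer above, assuming a throwaway database path; get_failures() returns an empty list for a service with no recorded failures:

    >>> import sysalert.db
    >>> db = sysalert.db.init('/tmp/sysalert-test.db')  # hypothetical path
    >>> sysalert.db.get_failures('foo.service', db)
    []
    >>> sysalert.db.close(db)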
diff --git a/sysalert/email.py b/sysalert/email.py
new file mode 100644
index 0000000..5d6fd80
--- /dev/null
+++ b/sysalert/email.py
@@ -0,0 +1,98 @@
+import email
+import smtplib
+import socket
+import subprocess
+
+import systemd.journal
+
+def _get_status(name):
+    cmd = ['/usr/bin/systemctl', 'status', name]
+
+    proc = subprocess.Popen(
+        cmd,
+        stdout=subprocess.PIPE)
+    out, err = proc.communicate()
+    out = out.decode('utf-8')
+    return out
+
+def _send_email(subject, message, config):
+    mail = email.message.EmailMessage()
+    hostname = socket.getfqdn()
+    smtp_host = 'localhost'
+    mail_from = f'sysalert <sysalert@{hostname}>'
+    mail_to = f'root <root@{hostname}>'
+    if 'smtp_host' in config:
+        smtp_host = config['smtp_host']
+    if 'mail_from' in config:
+        mail_from = config['mail_from']
+    if 'mail_to' in config:
+        mail_to = config['mail_to']
+
+
+    mail.set_content(message)
+    mail['To'] = mail_to
+    mail['From'] = mail_from
+    mail['Subject'] = subject
+
+    with smtplib.SMTP(smtp_host) as smtp:
+        smtp.send_message(mail)
+
+def success(name, failures, config):
+    cur_status = _get_status(name)
+    nr_failures = len(failures)
+    hostname = socket.getfqdn()
+    subject = f'{hostname} - {name}: recovery'
+    message = f"""
+{name} has recovered after {nr_failures} failures.
+Current status:
+
+{cur_status}
+"""
+    _send_email(subject, message, config)
+
+
+def failure(name, failures, config):
+    cur_status = _get_status(name)
+    hostname = socket.getfqdn()
+    latest_inv_id = failures[-1]['invocation_id']
+    nr_failures = len(failures)
+    prev_failures = 0
+    for i, f in enumerate(reversed(failures[:-1])):
+        if f['alert_method']:
+            prev_failures = i + 1
+            break
+
+    reader = systemd.journal.Reader()
+    reader.add_match(MONITOR_INVOCATION_ID=latest_inv_id)
+    reader.add_disjunction()
+    reader.add_match(_SYSTEMD_INVOCATION_ID=latest_inv_id)
+    journal_txt = "\n".join([entry['MESSAGE'] for entry in reader])
+
+    if nr_failures <= 1:
+        subject = f"{hostname} - {name}: failure"
+        message = f"""
+New failure for {name}.
+Current status:
+
+{cur_status}
+
+
+Latest journal log:
+
+{journal_txt}
+"""
+    else:
+        subject = f"{hostname} - {name}: {nr_failures} failures"
+        message = f"""
+{nr_failures} failures for {name} ({prev_failures} since last notification).
+Current status:
+
+{cur_status}
+
+
+Latest journal log:
+
+{journal_txt}
+"""
+
+    _send_email(subject, message, config)
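Not part of the patch: _send_email() takes its settings from the ini section named after the alert module (wired up in sysalert/util.py below), so a matching config section might look like this, with placeholder addresses:

    [sysalert.email]
    smtp_host = localhost
    mail_from = sysalert <sysalert@example.org>
    mail_to = root <root@example.org>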
diff --git a/sysalert/util.py b/sysalert/util.py
new file mode 100644
index 0000000..e071f3a
--- /dev/null
+++ b/sysalert/util.py
@@ -0,0 +1,92 @@
+
+import configparser
+import datetime
+import importlib
+import os
+import sqlite3
+import sys
+
+import systemd.journal
+
+import sysalert.db
+
+config_file = '/etc/sysalert.ini'
+db_file = '/var/lib/sysalert/sysalert.db'
+
+def _test_env():
+    # return True if all MONITOR environment variables are set, indicating
+    # the utility was started as a handler for OnSuccess= or OnFailure=
+    return all([x in os.environ for x in (
+        'MONITOR_SERVICE_RESULT',
+        'MONITOR_EXIT_CODE',
+        'MONITOR_EXIT_STATUS',
+        'MONITOR_INVOCATION_ID',
+        'MONITOR_UNIT',
+    )])
+
+def register_exit(config, db):
+    service_name = sys.argv[1]
+    if service_name in config.sections():
+        section_name = service_name
+    else:
+        section_name = config.default_section
+    alert_method = config.get(section_name, 'alert_method')
+    alert_config = {x[0]: x[1] for x in config.items(alert_method)}
+    alert = importlib.import_module(alert_method)
+
+    if os.environ['MONITOR_SERVICE_RESULT'] == 'success':
+        # exit with success status
+        failures = sysalert.db.get_failures(service_name, db)
+        sysalert.db.register_success(service_name, db)
+        try:
+            do_alert = config.getboolean(section_name, 'recovery_alert')
+        except ValueError:
+            if config.get(section_name, 'recovery_alert') == 'if-alerted' and failures:
+                do_alert = True
+            else:
+                do_alert = False
+
+        if do_alert:
+            alert.success(service_name, failures, alert_config)
+        return 0
+
+    # exit with failed status
+    failures = sysalert.db.get_failures(service_name, db)
+    if len(failures) < config.getint(section_name, 'max_failures'):
+        sysalert.db.register_failure(service_name, None, db)
+        return 0
+
+    now = datetime.datetime.now()
+    last_alert = now
+    for f in reversed(failures):
+        if f['alert_method']:
+            last_alert = f['timestamp']
+            break
+
+    diff = datetime.timedelta(seconds=config.getint(section_name, 'resend_alert_time', fallback=0))
+    if now != last_alert and last_alert + diff > now:
+        sysalert.db.register_failure(service_name, None, db)
+        return 0
+
+    # refresh the failure list before alerting so this failure is included
+    # in the alert call
+    sysalert.db.register_failure(service_name, alert_method, db)
+    failures = sysalert.db.get_failures(service_name, db)
+    alert.failure(service_name, failures, alert_config)
+    return 0
+
+
+def cli():
+    config = configparser.ConfigParser()
+    config.read(config_file)
+    db = sysalert.db.init(db_file)
+    ret = 0
+    if _test_env():
+        # invoked by systemd
+        ret = register_exit(config, db)
+
+    sysalert.db.close(db)
+    return ret
+
+if __name__ == '__main__':
+    sys.exit(cli())
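Not part of the patch: for reference, a minimal /etc/sysalert.ini covering every option register_exit() reads, with assumed values. alert_method doubles as the importlib module path and as the name of the alert config section (see the [sysalert.email] example above); recovery_alert accepts a boolean or 'if-alerted'; resend_alert_time is in seconds:

    [DEFAULT]
    alert_method = sysalert.email
    max_failures = 3
    recovery_alert = if-alerted
    resend_alert_time = 3600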