Initial commit

This commit is contained in:
Fredrik Eriksson 2024-07-13 19:32:04 +02:00
parent 27ca848b79
commit ba928af1a2
Signed by: feffe
GPG Key ID: E6B5580B853D322B
5 changed files with 329 additions and 0 deletions

13
pyproject.toml Normal file
View File

@ -0,0 +1,13 @@
# Build configuration: the package is built with the setuptools backend.
# NOTE(review): systemd-python is listed as a build requirement as well as a
# runtime dependency - confirm the build itself really needs it.
[build-system]
requires = ["setuptools", "systemd-python"]
build-backend = "setuptools.build_meta"
# Package metadata and runtime dependencies.
[project]
name = "sysalert"
version = "0.1.0"
dependencies = [
"systemd-python"
]
# Console script: the `sysalert` command calls cli() in sysalert/util.py.
[project.scripts]
sysalert = "sysalert.util:cli"

0
sysalert/__init__.py Normal file
View File

127
sysalert/db.py Normal file
View File

@ -0,0 +1,127 @@
import datetime
import os
import sqlite3
def register_success(name, db):
    """Forget every recorded failure of a service that has recovered.

    Rows are removed in dependency order: first the alerts attached to the
    service's failures, then the failures, and finally the service row
    itself. The deletions are committed before returning.

    Args:
        name: Service name as stored in ``service.name``.
        db: Open ``sqlite3`` connection holding the sysalert tables.
    """
    # sqlite3 wants a *sequence* of parameters. A bare string is treated as
    # one parameter per character, so the name has to be wrapped in a tuple.
    params = (name,)
    cur = db.cursor()
    cur.execute('''
        DELETE FROM alert
        WHERE failure IN (
            SELECT c.id
            FROM failure c INNER JOIN service s
            ON c.service=s.id
            WHERE s.name=?
        );
    ''', params)
    cur.execute('''
        DELETE FROM failure
        WHERE service IN (
            SELECT id FROM service WHERE name=?
        );
    ''', params)
    cur.execute('''
        DELETE FROM service WHERE name=?;
    ''', params)
    db.commit()
def register_failure(name, alert_method, db):
    """Record a failed run of a service, optionally noting that it was alerted.

    The service row is created the first time the service fails. Details of
    the failed run are taken from the ``MONITOR_*`` environment variables.
    Everything is committed before returning.

    Args:
        name: Service name to store in ``service.name``.
        alert_method: Name of the alert method used for this failure. A falsy
            value records the failure without an alert row.
        db: Open ``sqlite3`` connection holding the sysalert tables.

    Raises:
        KeyError: If ``MONITOR_SERVICE_RESULT``, ``MONITOR_EXIT_CODE``,
            ``MONITOR_EXIT_STATUS`` or ``MONITOR_INVOCATION_ID`` is not set.
    """
    # Read the environment first so a missing variable fails before any row
    # has been written.
    service_result = os.environ['MONITOR_SERVICE_RESULT']
    exit_code = os.environ['MONITOR_EXIT_CODE']
    exit_status = os.environ['MONITOR_EXIT_STATUS']
    invocation_id = os.environ['MONITOR_INVOCATION_ID']
    now = datetime.datetime.now()

    cur = db.cursor()
    # Parameters must be passed as a single sequence; a bare string would be
    # bound one character at a time.
    cur.execute('INSERT OR IGNORE INTO service(name) VALUES (?);', (name,))
    cur.execute('SELECT id FROM service WHERE name=?;', (name,))
    (service_id,) = cur.fetchone()
    cur.execute('''
        INSERT INTO failure
            (service, service_result, exit_code, exit_status, invocation_id, timestamp)
        VALUES
            (?, ?, ?, ?, ?, ?);
    ''', (service_id, service_result, exit_code, exit_status, invocation_id, now))
    # lastrowid is the new failure.id (INTEGER PRIMARY KEY); unlike
    # "RETURNING id" it needs no recent SQLite and leaves no unfetched row.
    failure_id = cur.lastrowid
    if alert_method:
        cur.execute('''
            INSERT INTO alert
                (failure, timestamp, method)
            VALUES
                (?, ?, ?);
        ''', (failure_id, now, alert_method))
    db.commit()
def get_failures(name, db):
    """Return the recorded failures of a service, oldest first.

    Args:
        name: Service name as stored in ``service.name``.
        db: Open ``sqlite3`` connection holding the sysalert tables.

    Returns:
        A list of dicts sorted by ``timestamp``, each with the keys
        ``service_result``, ``exit_code``, ``exit_status``,
        ``invocation_id``, ``timestamp`` and ``alert_method``.
        ``alert_method`` is ``None`` when no alert row exists for the
        failure. The list is empty for an unknown service.
    """
    cur = db.cursor()
    # Columns are qualified with the table name: both failure and service
    # have an "id" column, so a bare "id" is ambiguous in this join.
    cur.execute('''
        SELECT
            failure.id, failure.service_result, failure.exit_code,
            failure.exit_status, failure.invocation_id, failure.timestamp
        FROM failure
        INNER JOIN service ON
            failure.service=service.id
        WHERE service.name=?;
    ''', (name,))
    failures = []
    for f in cur.fetchall():
        # Parameters must be a sequence, hence the one-element tuple.
        cur.execute('SELECT method FROM alert where failure=?', (f[0],))
        alert_row = cur.fetchone()
        failures.append({
            'service_result': f[1],
            'exit_code': f[2],
            'exit_status': f[3],
            'invocation_id': f[4],
            'timestamp': f[5],
            'alert_method': alert_row[0] if alert_row else None,
        })
    failures.sort(key=lambda x: x['timestamp'])
    return failures
def init(path):
    """Open the database at *path* and make sure the sysalert tables exist.

    The connection is opened with declared-type and column-name type
    detection enabled. The ``service``, ``failure`` and ``alert`` tables are
    created if they are missing, and the result is committed.

    Args:
        path: Database location handed to ``sqlite3.connect``.

    Returns:
        The open ``sqlite3`` connection.
    """
    schema = (
        '''
CREATE TABLE IF NOT EXISTS
service (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name TEXT UNIQUE
);
''',
        '''
CREATE TABLE IF NOT EXISTS
failure (
id INTEGER PRIMARY KEY AUTOINCREMENT,
service INTEGER,
service_result TEXT,
exit_code TEXT,
exit_status INTEGER,
invocation_id TEXT,
timestamp timestamp,
FOREIGN KEY(service) REFERENCES service(id)
);
''',
        '''
CREATE TABLE IF NOT EXISTS
alert (
id INTEGER PRIMARY KEY AUTOINCREMENT,
failure INTEGER,
timestamp timestamp,
method TEXT,
FOREIGN KEY(failure) REFERENCES failure(id)
);
''',
    )
    type_detection = sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
    con = sqlite3.connect(path, detect_types=type_detection)
    cur = con.cursor()
    # Tables are created in reference order: service, failure, alert.
    for statement in schema:
        cur.execute(statement)
    con.commit()
    return con
def close(con):
    """Close the database connection *con*."""
    con.close()

98
sysalert/email.py Normal file
View File

@ -0,0 +1,98 @@
import email
import smtplib
import socket
import subprocess
import systemd.journal
def _get_status(name):
    """Return the standard output of ``systemctl status <name>`` as text.

    The exit status of systemctl is not checked and its standard error is
    not captured. The output is decoded as UTF-8.
    """
    completed = subprocess.run(
        ['/usr/bin/systemctl', 'status', name],
        stdout=subprocess.PIPE)
    return completed.stdout.decode('utf-8')
def _send_email(subject, message, config):
    """Send a plain-text e-mail through SMTP.

    Args:
        subject: Value of the Subject header.
        message: Body of the mail.
        config: Mapping with optional ``smtp_host``, ``mail_from`` and
            ``mail_to`` keys. Missing keys fall back to ``localhost`` and to
            root at this machine's fully qualified host name.
    """
    hostname = socket.getfqdn()
    server = config.get('smtp_host', 'localhost')
    sender = config.get('mail_from', f'sysalert <root@{hostname}>')
    recipient = config.get('mail_to', f'root <root@{hostname}>')

    mail = email.message.EmailMessage()
    mail.set_content(message)
    mail['To'] = recipient
    mail['From'] = sender
    mail['Subject'] = subject

    with smtplib.SMTP(server) as smtp:
        smtp.send_message(mail)
def success(name, failures, config):
    """Send a recovery notification e-mail for service *name*.

    Args:
        name: Name of the unit that recovered.
        failures: The failures recorded before the recovery; only their
            count is used.
        config: Mail settings, passed on to ``_send_email``.
    """
    status_txt = _get_status(name)
    host = socket.getfqdn()
    body = f"""
{name} has recovered after {len(failures)} failures.
Current status:
{status_txt}
"""
    _send_email(f'{host} - {name}: recovery', body, config)
def failure(name, failures, config):
    """Send a failure notification e-mail for service *name*.

    The mail contains the current ``systemctl status`` output and the
    journal messages that belong to the latest failed invocation.

    Args:
        name: Name of the failed unit.
        failures: Recorded failures, oldest first, with the current failure
            last. Must not be empty. Each entry needs the keys
            ``invocation_id`` and ``alert_method``.
        config: Mail settings, passed on to ``_send_email``.

    Raises:
        IndexError: If *failures* is empty.
    """
    cur_status = _get_status(name)
    hostname = socket.getfqdn()
    latest_inv_id = failures[-1]['invocation_id']
    nr_failures = len(failures)

    # Count the failures recorded after the most recent alerted one, the
    # current failure included. Stays 0 when no earlier failure was alerted.
    prev_failures = 0
    for i, f in enumerate(reversed(failures[:-1])):
        if f['alert_method']:
            prev_failures = i + 1
            break

    # Collect journal entries tagged with the failed invocation id, either
    # as MONITOR_INVOCATION_ID or as _SYSTEMD_INVOCATION_ID.
    reader = systemd.journal.Reader()
    reader.add_match(MONITOR_INVOCATION_ID=latest_inv_id)
    reader.add_disjunction()
    reader.add_match(_SYSTEMD_INVOCATION_ID=latest_inv_id)
    journal_txt = "\n".join(entry['MESSAGE'] for entry in reader)

    if nr_failures <= 1:
        subject = f"{hostname} - {name}: failure"
        message = f"""
New failure for {name}.
Current status:
{cur_status}
Latest journal log:
{journal_txt}
"""
    else:
        subject = f"{hostname} - {name}: {nr_failures} failures"
        message = f"""
{nr_failures} failures for {name} ({prev_failures} since last notification).
Current status:
{cur_status}
Latest journal log:
{journal_txt}
"""
    _send_email(subject, message, config)

91
sysalert/util.py Normal file
View File

@ -0,0 +1,91 @@
import configparser
import datetime
import importlib
import os
import sqlite3
import sys
import systemd.journal
import sysalert.db
# Path of the INI configuration file read by cli().
config_file='/etc/sysalert.ini'
# Path of the SQLite database opened by cli().
db_file='/var/lib/sysalert/sysalert.db'
def _test_env():
    """Tell whether the process was started as a systemd exit handler.

    Returns:
        True only when every ``MONITOR_*`` variable listed below is present
        in the environment, which indicates the utility runs as a handler
        for ``OnSuccess=`` or ``OnFailure=``.
    """
    required = (
        'MONITOR_SERVICE_RESULT',
        'MONITOR_EXIT_CODE',
        'MONITOR_EXIT_STATUS',
        'MONITOR_INVOCATION_ID',
        'MONITOR_UNIT',
    )
    return all(variable in os.environ for variable in required)
def register_exit(config, db):
    """Handle the exit of the monitored service named in ``sys.argv[1]``.

    Settings come from the configuration section named after the service,
    or from the default section when there is none. The alert module named
    by ``alert_method`` is imported and receives the options of the section
    carrying the same name.

    On success the stored failures are cleared and, depending on
    ``recovery_alert``, a recovery alert is sent. On failure the failure is
    stored, and an alert is sent once ``max_failures`` earlier failures are
    on record and the previous alert is at least ``resend_alert_time``
    seconds old.

    Args:
        config: Loaded ``configparser.ConfigParser``.
        db: Open database connection as returned by ``sysalert.db.init``.

    Returns:
        Always 0.

    Raises:
        IndexError: If no service name was given on the command line.
        KeyError: If ``MONITOR_SERVICE_RESULT`` is not set.
    """
    service_name = sys.argv[1]
    if service_name in config.sections():
        section_name = service_name
    else:
        section_name = config.default_section

    alert_method = config.get(section_name, 'alert_method')
    alert_config = dict(config.items(alert_method))
    alert = importlib.import_module(alert_method)

    # Failures recorded before this exit; on success they must be read
    # before register_success() deletes them.
    failures = sysalert.db.get_failures(service_name, db)

    if os.environ['MONITOR_SERVICE_RESULT'] == 'success':
        # exit with success status
        sysalert.db.register_success(service_name, db)
        try:
            do_alert = config.getboolean(section_name, 'recovery_alert')
        except ValueError:
            # Not a boolean: 'if-alerted' sends a recovery alert only when
            # at least one of the failures was actually alerted.
            do_alert = (
                config.get(section_name, 'recovery_alert') == 'if-alerted'
                and any(f['alert_method'] for f in failures))
        if do_alert:
            alert.success(service_name, failures, alert_config)
        return 0

    # exit with failed status
    if len(failures) < config.getint(section_name, 'max_failures'):
        sysalert.db.register_failure(service_name, None, db)
        return 0

    # Timestamp of the most recent alerted failure, None if there is none.
    last_alert = next(
        (f['timestamp'] for f in reversed(failures) if f['alert_method']),
        None)
    # fallback is keyword-only in configparser; 0 means "always resend".
    resend_after = datetime.timedelta(
        seconds=config.getint(section_name, 'resend_alert_time', fallback=0))
    now = datetime.datetime.now()
    if last_alert is not None and last_alert + resend_after > now:
        # The previous alert is still recent: record the failure silently.
        sysalert.db.register_failure(service_name, None, db)
        return 0

    # refresh failure list before alerting so we include this failure in
    # the alert call.
    sysalert.db.register_failure(service_name, alert_method, db)
    failures = sysalert.db.get_failures(service_name, db)
    alert.failure(service_name, failures, alert_config)
    return 0
def cli():
    """Entry point of the ``sysalert`` command.

    Reads the configuration file, opens the database and, when the
    ``MONITOR_*`` environment shows the process was started by systemd,
    registers the exit of the monitored service. The database connection is
    closed again in every case.

    Returns:
        The process exit status: the result of ``register_exit`` when
        invoked by systemd, otherwise 0.
    """
    config = configparser.ConfigParser()
    config.read(config_file)
    db = sysalert.db.init(db_file)
    try:
        if _test_env():
            # invoked by systemd
            return register_exit(config, db)
        return 0
    finally:
        # Release the connection even if handling the exit raised.
        sysalert.db.close(db)


if __name__ == '__main__':
    sys.exit(cli())