Initial commit
This commit is contained in:
parent
27ca848b79
commit
aa1c2d0781
13
pyproject.toml
Normal file
13
pyproject.toml
Normal file
@ -0,0 +1,13 @@
|
||||
# Build configuration. Only setuptools is needed to build the package;
# systemd-python is used by the code at run time and is therefore listed
# under [project].dependencies instead of the build requirements.
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

[project]
name = "sysalert"
version = "0.1.0"
# Runtime dependency: the code imports systemd.journal.
dependencies = [
    "systemd-python",
]

# Console command `sysalert`, implemented by sysalert.util.cli().
[project.scripts]
sysalert = "sysalert.util:cli"
|
0
sysalert/__init__.py
Normal file
0
sysalert/__init__.py
Normal file
130
sysalert/db.py
Normal file
130
sysalert/db.py
Normal file
@ -0,0 +1,130 @@
|
||||
import datetime
|
||||
import os
|
||||
import sqlite3
|
||||
|
||||
def register_success(name, db):
    """Forget every recorded failure of service *name* after a good run.

    Deletes, in dependency order, the alerts attached to the service's
    failures, the failures themselves and finally the service row.

    Args:
        name: Service name as stored in ``service.name``.
        db: Open ``sqlite3`` connection.

    The three deletes form one transaction: they are committed together,
    and rolled back together if any of them raises.
    """
    # ``with db`` commits on success and rolls back on an exception, so a
    # failure half-way cannot leave partial deletes pending on the
    # connection for a later commit to persist.
    with db:
        db.execute('''
            DELETE FROM alert
            WHERE failure IN (
                SELECT c.id
                FROM failure c INNER JOIN service s
                ON c.service=s.id
                WHERE s.name=?
            );
        ''', (name,))
        db.execute('''
            DELETE FROM failure
            WHERE service IN (
                SELECT id FROM service WHERE name=?
            );
        ''', (name,))
        db.execute('''
            DELETE FROM service WHERE name=?;
        ''', (name,))
|
||||
|
||||
def register_failure(name, alert_method, db):
    """Record a failed run of service *name*, optionally noting an alert.

    The failure details come from the ``MONITOR_SERVICE_RESULT``,
    ``MONITOR_EXIT_CODE``, ``MONITOR_EXIT_STATUS`` and
    ``MONITOR_INVOCATION_ID`` environment variables.

    Args:
        name: Service name; a ``service`` row is created when missing.
        alert_method: Name of the alert method used for this failure, or a
            falsy value when no alert is sent (no ``alert`` row is written).
        db: Open ``sqlite3`` connection.

    Raises:
        KeyError: If one of the ``MONITOR_*`` variables is not set.
    """
    # Read the environment before touching the database so that a missing
    # variable cannot leave a half-written record behind.
    env = os.environ
    details = (
        env['MONITOR_SERVICE_RESULT'],
        env['MONITOR_EXIT_CODE'],
        env['MONITOR_EXIT_STATUS'],
        env['MONITOR_INVOCATION_ID'],
    )
    now = datetime.datetime.now()

    # One transaction for all rows: committed together, rolled back together.
    with db:
        cur = db.cursor()
        cur.execute('INSERT OR IGNORE INTO service(name) VALUES (?);', (name,))
        cur.execute('SELECT id FROM service WHERE name=?;', (name,))
        (service_id,) = cur.fetchone()

        cur.execute('''
            INSERT INTO failure
            (service, service_result, exit_code, exit_status, invocation_id, timestamp)
            VALUES
            (?, ?, ?, ?, ?, ?);
        ''', (service_id, *details, now))
        # lastrowid gives the new failure id without needing SQL RETURNING,
        # which older SQLite libraries do not understand.
        failure_id = cur.lastrowid

        if alert_method:
            cur.execute('''
                INSERT INTO alert
                (failure, timestamp, method)
                VALUES
                (?, ?, ?);
            ''', (failure_id, now, alert_method))
|
||||
|
||||
|
||||
|
||||
def get_failures(name, db):
    """Return the recorded failures of service *name*, oldest first.

    Args:
        name: Service name as stored in ``service.name``.
        db: Open ``sqlite3`` connection.

    Returns:
        A list of dicts, sorted by ``timestamp``, each with the keys
        ``service_result``, ``exit_code``, ``exit_status``,
        ``invocation_id``, ``timestamp`` and ``alert_method``.
        ``alert_method`` is ``None`` when no alert row exists for the
        failure. The list is empty for an unknown service.
    """
    # A single query fetches each failure together with the method of one
    # of its alerts (if any), instead of one extra SELECT per failure.
    rows = db.execute('''
        SELECT
            f.service_result, f.exit_code, f.exit_status, f.invocation_id,
            f.timestamp,
            (SELECT a.method FROM alert a WHERE a.failure = f.id LIMIT 1)
                AS alert_method
        FROM failure f
        INNER JOIN service s ON f.service = s.id
        WHERE s.name = ?;
    ''', (name,)).fetchall()

    keys = (
        'service_result',
        'exit_code',
        'exit_status',
        'invocation_id',
        'timestamp',
        'alert_method',
    )
    failures = [dict(zip(keys, row)) for row in rows]
    # Sort in Python so ordering uses the converted timestamp values.
    failures.sort(key=lambda x: x['timestamp'])
    return failures
|
||||
|
||||
|
||||
def init(path):
    """Open the SQLite database at *path* and make sure the schema exists.

    Creates the ``service``, ``failure`` and ``alert`` tables when they are
    missing; existing tables are left untouched.

    Args:
        path: Database file path (anything ``sqlite3.connect`` accepts).

    Returns:
        The open ``sqlite3`` connection. The caller is responsible for
        closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or the schema
            cannot be created; the connection is closed before re-raising.
    """
    # NOTE(review): the FOREIGN KEY clauses below are only enforced by SQLite
    # when "PRAGMA foreign_keys" is switched on for the connection, which is
    # not done here - confirm whether enforcement is wanted.
    schema = (
        '''
        CREATE TABLE IF NOT EXISTS
        service (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE
        );
        ''',
        '''
        CREATE TABLE IF NOT EXISTS
        failure (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            service INTEGER,
            service_result TEXT,
            exit_code TEXT,
            exit_status INTEGER,
            invocation_id TEXT,
            timestamp timestamp,

            FOREIGN KEY(service) REFERENCES service(id)
        );
        ''',
        '''
        CREATE TABLE IF NOT EXISTS
        alert (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            failure INTEGER,
            timestamp timestamp,
            method TEXT,

            FOREIGN KEY(failure) REFERENCES failure(id)
        );
        ''',
    )

    con = sqlite3.connect(
        path,
        detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES,
    )
    try:
        cur = con.cursor()
        for statement in schema:
            cur.execute(statement)
        con.commit()
    except Exception:
        # Do not leak the connection when the schema cannot be created.
        con.close()
        raise
    return con
|
||||
|
||||
def close(con):
    """Close the database connection *con*.

    Args:
        con: Connection object to close (its ``close()`` method is called).
    """
    con.close()
|
||||
|
98
sysalert/email.py
Normal file
98
sysalert/email.py
Normal file
@ -0,0 +1,98 @@
|
||||
import email
|
||||
import smtplib
|
||||
import socket
|
||||
import subprocess
|
||||
|
||||
import systemd.journal
|
||||
|
||||
def _get_status(name):
    """Return the text printed by ``systemctl status`` for unit *name*.

    Args:
        name: Unit name passed to ``systemctl status``.

    Returns:
        The command's standard output as ``str``. Bytes that are not valid
        UTF-8 are replaced rather than raising. If ``systemctl`` cannot be
        started at all, a short explanatory placeholder is returned so the
        caller can still send its alert.
    """
    cmd = ['/usr/bin/systemctl', 'status', name]
    try:
        # The exit status is deliberately ignored (check=False): only the
        # printed text is wanted, whatever state the unit is in.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=False)
    except OSError as exc:
        return f'(unable to run {cmd[0]}: {exc})'
    return proc.stdout.decode('utf-8', errors='replace')
|
||||
|
||||
def _send_email(subject, message, config):
    """Send a plain-text e-mail through an SMTP server.

    Args:
        subject: Subject line.
        message: Message body.
        config: Mapping with the optional keys ``smtp_host`` (default
            ``localhost``), ``mail_from`` (default
            ``sysalert <root@FQDN>``) and ``mail_to`` (default
            ``root <root@FQDN>``), where FQDN is ``socket.getfqdn()``.

    Raises:
        OSError, smtplib.SMTPException: If the server cannot be reached or
            refuses the message.
    """
    # Import the submodule explicitly: a bare ``import email`` only loads the
    # package, and ``email.message`` was previously reachable only because
    # smtplib happens to import it.
    import email.message

    hostname = socket.getfqdn()
    smtp_host = config.get('smtp_host', 'localhost')
    mail_from = config.get('mail_from', f'sysalert <root@{hostname}>')
    mail_to = config.get('mail_to', f'root <root@{hostname}>')

    mail = email.message.EmailMessage()
    mail.set_content(message)
    mail['To'] = mail_to
    mail['From'] = mail_from
    mail['Subject'] = subject

    with smtplib.SMTP(smtp_host) as smtp:
        smtp.send_message(mail)
|
||||
|
||||
def success(name, failures, config):
    """Send a recovery notification for service *name*.

    Args:
        name: Unit name; also used to query the current status.
        failures: Failures recorded before the recovery; only their number
            is reported.
        config: Mail settings, handed unchanged to ``_send_email``.
    """
    status_text = _get_status(name)
    count = len(failures)
    host = socket.getfqdn()

    subject = f'{host} - {name}: recovery'
    # Assemble the body line by line; leading and trailing empty items give
    # the message its opening and closing newline.
    body_lines = [
        '',
        f'{name} has recovered after {count} failures.',
        'Current status:',
        '',
        f'{status_text}',
        '',
    ]
    message = '\n'.join(body_lines)
    _send_email(subject, message, config)
|
||||
|
||||
|
||||
def _entry_message(entry):
    """Return the MESSAGE field of a journal entry as text ('' if absent)."""
    text = entry.get('MESSAGE', '')
    if isinstance(text, bytes):
        # Undecodable payloads may be handed over as bytes; keep them readable.
        return text.decode('utf-8', errors='replace')
    return str(text)


def failure(name, failures, config):
    """Send a failure notification for service *name*.

    The mail contains the current ``systemctl status`` output and the
    journal messages matching the invocation id of the latest failure.

    Args:
        name: Unit name.
        failures: Recorded failures, oldest first; the last item is the one
            being reported. Each item is a mapping with at least the keys
            ``invocation_id`` and ``alert_method``.
        config: Mail settings, handed unchanged to ``_send_email``.

    Raises:
        IndexError: If *failures* is empty.
    """
    cur_status = _get_status(name)
    hostname = socket.getfqdn()
    latest_inv_id = failures[-1]['invocation_id']
    nr_failures = len(failures)

    # Number of failures since the last notification, including the current
    # one. Without any earlier notification every recorded failure is new,
    # so start from the total instead of reporting "0 since last
    # notification".
    prev_failures = nr_failures
    for i, f in enumerate(reversed(failures[:-1])):
        if f['alert_method']:
            prev_failures = i + 1
            break

    # NOTE(review): MONITOR_INVOCATION_ID is matched as a journal field name
    # here - confirm that journal entries actually carry such a field.
    # The context manager closes the journal once the messages are collected.
    with systemd.journal.Reader() as reader:
        reader.add_match(MONITOR_INVOCATION_ID=latest_inv_id)
        reader.add_disjunction()
        reader.add_match(_SYSTEMD_INVOCATION_ID=latest_inv_id)
        journal_txt = "\n".join(_entry_message(entry) for entry in reader)

    if nr_failures <= 1:
        subject = f"{hostname} - {name}: failure"
        headline = f"New failure for {name}."
    else:
        subject = f"{hostname} - {name}: {nr_failures} failures"
        headline = (
            f"{nr_failures} failures for {name} "
            f"({prev_failures} since last notification)."
        )

    message = f"""
{headline}
Current status:

{cur_status}


Latest journal log:

{journal_txt}
"""

    _send_email(subject, message, config)
|
92
sysalert/util.py
Normal file
92
sysalert/util.py
Normal file
@ -0,0 +1,92 @@
|
||||
|
||||
import configparser
|
||||
import datetime
|
||||
import importlib
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
import systemd.journal
|
||||
|
||||
import sysalert.db
|
||||
|
||||
# Path of the INI configuration file read by cli().
config_file = '/etc/sysalert.ini'
# Path of the SQLite database opened by cli().
db_file = '/var/lib/sysalert/sysalert.db'
|
||||
|
||||
def _test_env():
    """Tell whether the process was started as a systemd exit handler.

    Returns:
        ``True`` only when every ``MONITOR_*`` environment variable listed
        below is set, which indicates the utility was started as a handler
        for OnSuccess= or OnError=.
    """
    required = (
        'MONITOR_SERVICE_RESULT',
        'MONITOR_EXIT_CODE',
        'MONITOR_EXIT_STATUS',
        'MONITOR_INVOCATION_ID',
        'MONITOR_UNIT',
    )
    for variable in required:
        if variable not in os.environ:
            return False
    return True
|
||||
|
||||
def register_exit(config, db):
    """Process a unit exit reported by systemd and send alerts when due.

    The service name is taken from ``sys.argv[1]`` and the exit result from
    the ``MONITOR_SERVICE_RESULT`` environment variable. Settings come from
    the config section named after the service, or from the default section
    when there is none.

    Options read: ``alert_method`` (importable module name, also the name of
    the section holding that module's settings), ``recovery_alert`` (a
    boolean or ``if-alerted``), ``max_failures`` and ``resend_alert_time``
    (seconds, default 0).

    Args:
        config: Loaded ``configparser.ConfigParser``.
        db: Open database connection used with ``sysalert.db``.

    Returns:
        Always ``0``.
    """
    service_name = sys.argv[1]
    if service_name in config.sections():
        section_name = service_name
    else:
        section_name = config.default_section
    alert_method = config.get(section_name, 'alert_method')
    alert_config = dict(config.items(alert_method))
    alert = importlib.import_module(alert_method)

    if os.environ['MONITOR_SERVICE_RESULT'] == 'success':
        # exit with success status
        failures = sysalert.db.get_failures(service_name, db)
        sysalert.db.register_success(service_name, db)
        try:
            do_alert = config.getboolean(section_name, 'recovery_alert')
        except ValueError:
            # Not a boolean: "if-alerted" asks for a recovery mail only when
            # an alert was actually sent for one of the failures - merely
            # having recorded failures is not enough.
            do_alert = (
                config.get(section_name, 'recovery_alert') == 'if-alerted'
                and any(f['alert_method'] for f in failures)
            )

        if do_alert:
            alert.success(service_name, failures, alert_config)
        return 0

    # exit with failed status
    failures = sysalert.db.get_failures(service_name, db)
    if len(failures) < config.getint(section_name, 'max_failures'):
        # Below the threshold: record silently.
        sysalert.db.register_failure(service_name, None, db)
        return 0

    now = datetime.datetime.now()
    # Timestamp of the most recent alerted failure, None if there is none.
    last_alert = next(
        (f['timestamp'] for f in reversed(failures) if f['alert_method']),
        None,
    )
    resend_after = datetime.timedelta(
        seconds=config.getint(section_name, 'resend_alert_time', fallback=0)
    )
    if last_alert is not None and last_alert + resend_after > now:
        # An alert went out recently enough: record without alerting again.
        sysalert.db.register_failure(service_name, None, db)
        return 0

    # refresh failure list before alerting so we include this failure in alert
    # call.
    sysalert.db.register_failure(service_name, alert_method, db)
    failures = sysalert.db.get_failures(service_name, db)
    alert.failure(service_name, failures, alert_config)
    return 0
|
||||
|
||||
|
||||
def cli():
    """Console entry point: load the config, open the database and dispatch.

    Returns:
        The result of ``register_exit`` when started by systemd as an exit
        handler (all ``MONITOR_*`` variables set); ``1`` otherwise, after
        printing a short message to standard error.
    """
    config = configparser.ConfigParser()
    config.read(config_file)
    db = sysalert.db.init(db_file)

    try:
        if _test_env():
            # invoked by systemd
            return register_exit(config, db)
        # Previously this path fell through to an unbound ``ret``.
        print(
            'sysalert: MONITOR_* environment variables not set; nothing to do',
            file=sys.stderr,
        )
        return 1
    finally:
        # Close the database even when handling the exit raises.
        sysalert.db.close(db)
|
||||
|
||||
# Script entry: run the CLI and use its return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(cli())
|
Loading…
Reference in New Issue
Block a user