#!/usr/bin/python3.6
"""Wrapper for cron jobs.

Runs a command with its stdout/stderr redirected into a per-run directory
below the cache directory and only prints a report when the job fails for
the first time, keeps failing for longer than the warn interval, or
succeeds again after earlier failures.
"""

import argparse
import datetime
import os
import re
import shutil
import subprocess
import sys


def parse_args(argv=None):
    """Parse the wrapper's command line.

    Args:
        argv: Argument list to parse (without the program name). ``None``
            makes argparse fall back to ``sys.argv[1:]``.

    Returns:
        The parsed namespace. ``name`` defaults to the basename of the
        command; ``command``, ``cachedir`` and ``lockdir`` are one-element
        lists because they are declared with ``nargs=1``.
    """
    parser = argparse.ArgumentParser(description='Wrapper for cronjobs')
    parser.add_argument(
        '-e', '--execution-time-limit',
        help='max execution time (in seconds) for cron job',
        type=int,
        metavar='timeout',
        # None (not a list) is the "no limit" value for an int option.
        default=None)
    parser.add_argument(
        '-n', '--name',
        help='unique identifier for this cron job, defaults to executable name',
        type=str,
        metavar='name',
        required=False)
    parser.add_argument(
        '-s', '--success-exit-codes',
        help='Return code of cron job that should be considered a successful run',
        type=int,
        metavar='code',
        default=[0],
        nargs='+')
    parser.add_argument(
        '-i', '--ignore-error',
        help='Regex to ignore if cronjob writes to stderr',
        type=str,
        metavar='err_re',
        default=[],
        nargs='+')
    parser.add_argument(
        '-w', '--warn-interval',
        help='Warn to stderr (as in "send mail") at most once within this time interval (seconds)',
        type=int,
        metavar='warn_interval',
        default=86400)
    parser.add_argument(
        '-L', '--no-lock',
        help='Allow multiple simultanious executions of this cron job',
        action='store_true')
    parser.add_argument(
        '-c', '--cachedir',
        help="Where to store output of failed runs",
        type=str,
        metavar='cachedir',
        default=['/var/lib/cronwrapper'],
        nargs=1)
    parser.add_argument(
        '-l', '--lockdir',
        help="Where to write lock-files",
        type=str,
        metavar='lockdir',
        default=['/var/lock'],
        nargs=1)
    parser.add_argument(
        'command',
        help='Full path to cron-script to execute',
        type=str,
        nargs=1)
    parser.add_argument(
        'arg',
        help='Optional arguments to cron script',
        type=str,
        nargs='*')

    args = parser.parse_args(argv)
    if not args.name:
        args.name = os.path.basename(args.command[0])
    return args


def exec_command(args, outfile, errfile, resfile):
    """Run the wrapped command and record its result.

    Args:
        args: Parsed arguments; ``command``, ``arg`` and
            ``execution_time_limit`` are used.
        outfile: Open file that receives the command's stdout.
        errfile: Open file that receives the command's stderr.
        resfile: Open text file; a UTC timestamp line and the return code
            are written to it once the command has finished.

    Returns:
        The command's return code, or ``None`` if the process could not be
        reaped after the time limit expired.
    """
    proc = subprocess.Popen(
        args.command + args.arg,
        stdout=outfile,
        stderr=errfile)

    # A missing or zero limit means "wait forever".
    timeout = args.execution_time_limit or None

    try:
        proc.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # NOTE(review): kill() is sent before terminate(), as the code always
        # did; terminate-then-kill would be the gentler escalation - confirm
        # before changing the order.
        proc.kill()
        try:
            proc.communicate(timeout=30)
        except subprocess.TimeoutExpired:
            proc.terminate()
            try:
                proc.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                # Still not reaped: give up waiting but record the run
                # (returncode stays None) instead of dying with a traceback.
                pass

    nowstr = datetime.datetime.utcnow().strftime('%Y-%m-%d_%H%M.%S')
    resfile.write('{}\n{}'.format(nowstr, proc.returncode))
    return proc.returncode


def _create_lockfile(lockfile, content):
    """Exclusively create ``lockfile`` with ``content``; False if it exists."""
    try:
        with open(lockfile, 'x') as lock:
            lock.write(content)
    except FileExistsError:
        return False
    return True


def aquire_lock(lockfile):
    """Try to take the PID lock file ``lockfile`` for this process.

    The lock is a file created exclusively and containing the owner's PID.
    A lock whose owner no longer exists is removed and re-taken.

    Returns:
        True if this process now owns the lock, False if another live
        process holds it, if the lock file content cannot be interpreted as
        a PID, or if somebody else won the race for a stale lock.
    """
    me = "{}\n".format(os.getpid())

    if _create_lockfile(lockfile, me):
        # we got the lock!
        return True

    # some other process has (or had) this lock
    try:
        with open(lockfile, 'r') as lock:
            owner_pid = int(lock.read())
    except FileNotFoundError:
        # The owner released it between our two attempts; try once more.
        return _create_lockfile(lockfile, me)
    except ValueError:
        # Empty or garbled content, e.g. the owner created the file but has
        # not written its PID yet. Be conservative and treat it as held.
        return False

    if owner_pid <= 0:
        # Not a single process id; signalling it would address a group.
        return False

    try:
        os.kill(owner_pid, 0)
    except PermissionError:
        # The process exists but belongs to another user: still running.
        return False
    except OSError:
        # owner is gone, we can make another attempt to lock
        pass
    else:
        # owner is still running
        return False

    try:
        os.remove(lockfile)
    except FileNotFoundError:
        # Somebody else already cleaned up the stale lock.
        pass

    # False here means someone stole our lock anyway :(
    return _create_lockfile(lockfile, me)


def release_lock(lockfile):
    """Remove ``lockfile`` if it is a regular file; a missing file is fine."""
    if os.path.isfile(lockfile):
        try:
            os.remove(lockfile)
        except FileNotFoundError:
            # Vanished between the check and the removal.
            pass


def _print_stripped_lines(path):
    """Print every line of ``path`` stripped; a missing file prints nothing."""
    try:
        with open(path, 'r') as f:
            for line in f:
                print(line.strip())
    except FileNotFoundError:
        pass


def print_runs(runs, clean=True):
    """Print the stored result, stdout and stderr of each run directory.

    Args:
        runs: Iterable of run directory paths; printed in sorted order.
        clean: When true, the run directories are deleted after printing.
    """
    run_dirs = sorted(runs)

    for run in run_dirs:
        try:
            with open(os.path.join(run, 'result'), 'r') as f:
                result_lines = f.read().splitlines()
        except FileNotFoundError:
            result_lines = []
        # The return code is the last line of the result file.
        retcode = result_lines[-1] if result_lines else 'unknown'

        print("{} returncode {}".format(run, retcode))
        print("STDOUT:")
        _print_stripped_lines(os.path.join(run, 'stdout'))
        print("\nSTDERR:")
        _print_stripped_lines(os.path.join(run, 'stderr'))
        print("\n\n")

    if clean:
        for run in run_dirs:
            try:
                shutil.rmtree(run)
            except FileNotFoundError:
                pass


def _stderr_is_ignorable(errfile, patterns):
    """Return True if every line of ``errfile`` matches one of ``patterns``.

    An empty file counts as ignorable. ``errfile`` must be open for reading;
    it is rewound first.
    """
    checks = [re.compile(pattern) for pattern in patterns]
    errfile.seek(0)
    return all(
        any(check.match(line) for check in checks)
        for line in errfile)


def _find_previous_runs(basedir, cutoff, time_format):
    """Map run directories directly below ``basedir`` to their timestamps.

    Only directories whose name parses with ``time_format`` and lies strictly
    before ``cutoff`` are returned; other directories are skipped.
    """
    _, dirs, _ = next(os.walk(basedir), (basedir, [], []))
    runs = {}
    for dirname in dirs:
        try:
            stamp = datetime.datetime.strptime(dirname, time_format)
        except ValueError:
            # Not one of our run directories.
            continue
        if stamp < cutoff:
            runs[os.path.join(basedir, dirname)] = stamp
    return runs


def main():
    """Run the wrapped cron job and decide whether to print a report.

    Returns:
        0 after a successful run, ``None`` after a failed one (which
        ``sys.exit`` also turns into exit status 0).
    """
    time_format = '%Y-%m-%d_%H%M'
    args = parse_args()
    now = datetime.datetime.utcnow()
    nowstr = now.strftime(time_format)

    jobdir = os.path.join(args.cachedir[0], args.name)
    libdir = os.path.join(jobdir, nowstr)
    lckdir = os.path.join(args.lockdir[0], args.name)
    os.makedirs(lckdir, exist_ok=True)
    os.makedirs(libdir)

    lckfile = os.path.join(lckdir, args.name)
    outfile = os.path.join(libdir, 'stdout')
    errfile = os.path.join(libdir, 'stderr')
    resfile = os.path.join(libdir, 'result')

    # Only a lock this instance took itself may be released at the end;
    # removing it otherwise would delete another instance's lock.
    locked = False
    try:
        with open(outfile, 'w') as out, \
                open(errfile, 'w+') as err, \
                open(resfile, 'w') as result:
            if not args.no_lock:
                locked = aquire_lock(lckfile)

            if args.no_lock or locked:
                res = exec_command(args, out, err, result)
            else:
                err.write("CRONWRAPPER: Unable to aquire lock, previous instance still running?\n")
                result.write("\nFalse\n")
                # NOTE: False compares equal to 0, so with 0 among the
                # success codes the stderr check below decides, and an
                # ignore regex can match the message written above.
                res = False

            if res in args.success_exit_codes:
                # Possible success, check error output
                success = _stderr_is_ignorable(err, args.ignore_error)
            else:
                success = False

        previous_runs = _find_previous_runs(
            jobdir,
            datetime.datetime.strptime(nowstr, time_format),
            time_format)

        if success:
            # Yes! Success! report any errors until now
            if previous_runs:
                print("Success after {} failed runs\n".format(len(previous_runs)))
                print_runs(previous_runs.keys())
            try:
                shutil.rmtree(libdir)
            except FileNotFoundError:
                pass
            return 0

        # Failure
        if previous_runs:
            # Not the first failure...
            oldest = min(previous_runs.values())
            delta = datetime.timedelta(seconds=args.warn_interval)
            if now - oldest > delta:
                # we have failed for a long time, send a report
                print("Cronjob is still failing after {} seconds ({} failures)\n".format(args.warn_interval, len(previous_runs)))
                print_runs(previous_runs.keys())
            # else: within limits, save the output for a later instance
        else:
            # First failure, send a report, but keep the output
            print("Cronjob failed\n")
            print_runs([libdir], clean=False)
        return None
    finally:
        if locked:
            release_lock(lckfile)


if __name__ == '__main__':
    sys.exit(main())