better support for service restarts on systemd

This commit is contained in:
Fredrik Eriksson 2024-07-19 17:04:18 +02:00
parent 22a2b4557b
commit b355a6ceb2
Signed by: feffe
GPG Key ID: E6B5580B853D322B

View File

@ -7,6 +7,7 @@ import psutil
import sau import sau
import sau.errors import sau.errors
import sau.helpers
import sau.platforms import sau.platforms
proc_fd_map_re = re.compile(r'^.*(/[^\(]*) \(deleted\)$') proc_fd_map_re = re.compile(r'^.*(/[^\(]*) \(deleted\)$')
@ -16,7 +17,7 @@ def _warn(policy, msg):
if not policy.startswith('silent'): if not policy.startswith('silent'):
log.warning(msg) log.warning(msg)
def _get_deleted_open_files(proc): def get_deleted_open_files(proc):
log = logging.getLogger(sau.LOGNAME) log = logging.getLogger(sau.LOGNAME)
files = set() files = set()
@ -62,25 +63,40 @@ def get_exe_file(name):
log.debug('Found binary for {} at {}'.format(name, root)) log.debug('Found binary for {} at {}'.format(name, root))
return os.path.join(root, name) return os.path.join(root, name)
def _get_processes():
    """Return the set of running processes that hold deleted files open.

    Every process yielded by ``psutil.process_iter()`` is handed to
    ``get_deleted_open_files``; a process is kept only when that call
    returns a truthy (non-empty) result. Each match is logged at debug
    level.
    """
    logger = logging.getLogger(sau.LOGNAME)
    candidates = set()
    for process in psutil.process_iter():
        # Skip processes for which no deleted-but-open files were reported.
        if not get_deleted_open_files(process):
            continue
        logger.debug('{} has open deleted files'.format(process))
        candidates.add(process)
    return candidates
def restart_services(): def restart_services():
log = logging.getLogger(sau.LOGNAME) log = logging.getLogger(sau.LOGNAME)
platform = sau.platforms.get_platform() platform = sau.platforms.get_platform()
conf = sau.config conf = sau.config
check_procs = set()
for proc in psutil.process_iter():
files = _get_deleted_open_files(proc)
if files:
log.info('{} has open deleted files'.format(proc))
check_procs.add(proc)
check_procs = _get_processes()
# wait before the second test # wait before the second test
time.sleep(1) time.sleep(5)
on_systemd = False
try:
init_proc = psutil.Process(pid=1)
if init_proc.name() == 'systemd':
on_systemd = True
except psutil.NoSuchProcess:
pass
# perform a second check to remove potential false positives # perform a second check to remove potential false positives
service_procs = set() service_procs = set()
retest_procs = set() retest_procs = set()
for proc in check_procs: for proc in check_procs:
files = _get_deleted_open_files(proc) files = get_deleted_open_files(proc)
if not files: if not files:
# no deleted open files for this process any longer # no deleted open files for this process any longer
continue continue
@ -89,32 +105,55 @@ def restart_services():
except (psutil.NoSuchProcess, psutil.ZombieProcess, psutil.AccessDenied): except (psutil.NoSuchProcess, psutil.ZombieProcess, psutil.AccessDenied):
# either of the above exceptions means the process has quit # either of the above exceptions means the process has quit
continue continue
parent = _get_top_parent(proc) if on_systemd:
service_procs.add(proc)
else:
parent = _get_top_parent(proc)
service_procs.add(parent)
service_procs.add(parent)
retest_procs.add(proc) retest_procs.add(proc)
recommend_restart = False
processes = {} processes = {}
services = {} services = {}
for proc in service_procs: for proc in service_procs:
if not proc: if not proc:
continue continue
service_name = None
try: try:
service_exe = proc.exe() service_exe = proc.exe()
proc_name = proc.name() proc_name = proc.name()
except (psutil.NoSuchProcess, psutil.ZombieProcess, psutil.AccessDenied): except (psutil.NoSuchProcess, psutil.ZombieProcess, psutil.AccessDenied):
log.debug('{} died before it could be restarted'.format(proc)) log.debug('{} died before it could be restarted'.format(proc))
continue continue
service_name = _get_service_from_proc(proc)
if on_systemd:
ret, unit, err = sau.helpers.exec_cmd([ '/usr/bin/systemctl', 'whoami', f'{proc.pid}' ])
name, unit_type = unit.strip().split('.')
if ret != 0:
log.debug(f'Non-success ({ret}) when checking unit for process: {err}')
continue
elif unit_type != 'service':
log.warning(f'not restarting non-service unit "{unit}"; owner of {proc}')
else:
_ret, enabled, _err = sau.helpers.exec_cmd([ '/usr/bin/systemctl', 'is-enabled', unit ])
enabled = enabled.strip()
if enabled != 'enabled':
log.warning(f'Unit {name}.service has enable status: {enabled} - will only restart "enabled" services')
else:
service_name = name
else:
service_name = _get_service_from_proc(proc)
if not service_name: if not service_name:
log.debug('no service for process {}'.format(proc)) log.warning('no service for process {}'.format(proc))
recommend_restart = True
continue continue
services[proc_name] = service_name services[proc_name] = service_name
processes[service_name] = [proc] processes[service_name] = [proc]
recommend_restart = False
for service in set([x for x in services.values() if x]): for service in set([x for x in services.values() if x]):
policy = _get_service_restart_policy(service) policy = _get_service_restart_policy(service)
if policy == 'ignore': if policy == 'ignore':
@ -131,34 +170,17 @@ def restart_services():
tested_parents = set() tested_parents = set()
for proc in retest_procs: for proc in retest_procs:
parent = _get_top_parent(proc) try:
if not parent: proc_name = proc.name()
continue if proc_name not in services:
parent_name = parent.name() continue
if parent in tested_parents: except (psutil.NoSuchProcess, psutil.ZombieProcess, psutil.AccessDenied):
log.debug('{} belongs to already tested parent {}'.format(proc, parent))
continue continue
if _get_deleted_open_files(proc): if get_deleted_open_files(proc):
tested_parents.add(parent) service = services[proc_name]
service = _get_service_from_proc(parent)
if not service:
log.warning('could not re-check process {} - failed to identify service'.format(proc))
recommend_restart = True
continue
policy = _get_service_restart_policy(service) policy = _get_service_restart_policy(service)
_warn(policy, '{} still has deleted files open'.format(proc, parent))
log.debug('{} is in service {}'.format(proc, service))
if parent_name in services and not services[parent_name]:
_warn(policy, '{} (parent {}) does not belong to a service and could not be restarted'.format(proc, parent))
recommend_restart = True
continue
elif parent_name in services:
policy = _get_service_restart_policy(service)
log.debug('service {} has policy {}'.format(service, policy))
if policy in ('ignore', 'warn'):
continue
_warn(policy, '{} (parent {}) still has deleted files open'.format(proc, parent))
recommend_restart = True recommend_restart = True
return recommend_restart return recommend_restart