D-Bus testing: kill children

Instead of killall (which may have the side effect of killing
unrelated process instances and the drawback of only covering known
processes), find and kill all child processes. The information about
child processes is taken from parsing /proc.

Some child process are special:
- The forked test-dbus.py must be kept running.
- dbus-monitor is meant to be running longer and still gets killed
  via the subprocess handle.
This commit is contained in:
Patrick Ohly 2012-05-08 17:15:41 +02:00
parent 843339ce0f
commit e2893aa1e5
1 changed files with 132 additions and 20 deletions

View File

@ -347,11 +347,13 @@ class TimeoutTest:
def TryKill(pid, signal):
try:
os.kill(pid, signal)
return True
except OSError, ex:
# might have quit in the meantime, deal with the race
# condition
if ex.errno != 3:
raise ex
return False
def ShutdownSubprocess(popen, timeout):
start = time.time()
@ -389,6 +391,7 @@ class DBusUtil(Timeout):
quit_events = []
reply = None
pserver = None
pmonitor = None
storedenv = {}
def getTestProperty(self, key, default):
@ -417,13 +420,12 @@ class DBusUtil(Timeout):
DBusUtil.events = []
DBusUtil.quit_events = []
DBusUtil.reply = None
self.pserverpid = None
# allow arbitrarily long diffs in Python unittest
self.maxDiff = None
kill = subprocess.Popen("sh -c 'killall -9 syncevo-dbus-server syncevo-local-sync syncevo-dbus-helper dbus-monitor >/dev/null 2>&1'", shell=True)
kill.communicate()
self.killChildren(0)
# collect zombies
try:
while os.waitpid(-1, os.WNOHANG)[0]:
@ -472,9 +474,9 @@ class DBusUtil(Timeout):
dbuslog = testname + ".dbus.log"
syncevolog = testname + ".syncevo.log"
pmonitor = subprocess.Popen(monitor,
stdout=open(dbuslog, "w"),
stderr=subprocess.STDOUT)
self.pmonitor = subprocess.Popen(monitor,
stdout=open(dbuslog, "w"),
stderr=subprocess.STDOUT)
if debugger:
print "\n%s: %s\n" % (self.id(), self.shortDescription())
@ -518,6 +520,10 @@ class DBusUtil(Timeout):
size = newsize
time.sleep(1)
# pserver.pid is not necessarily the pid of syncevo-dbus-server.
# It might be the child of the pserver process.
self.pserverpid = self.serverPid()
numerrors = len(result.errors)
numfailures = len(result.failures)
if debugger:
@ -553,26 +559,26 @@ class DBusUtil(Timeout):
if debugger:
# allow debugger to run as long as it is needed
DBusUtil.pserver.communicate()
else:
# Force shutdown after a certain delay: how much time we grant
# the process depends on how much work it still needs to do
# after being asked to quit. valgrind leak checking can take
# a while.
if usingValgrind():
delay = 60
else:
delay = 20
if DBusUtil.pserver and not ShutdownSubprocess(DBusUtil.pserver, delay):
print " syncevo-dbus-server had to be killed with SIGKILL"
result.errors.append((self,
"syncevo-dbus-server had to be killed with SIGKILL"))
# Find and kill all sub-processes except for dbus-monitor:
# first try SIGTERM, then if they don't respond, SIGKILL. The
# latter is an error. How much time we allow them between
# SIGTERM and SIGTERM depends on how much work still needs to
# be done after being asked to quit. valgrind leak checking
# can take a while.
unresponsive = self.killChildren(usingValgrind() and 60 or 20)
if unresponsive:
error = "/".join(unresponsive) + " had to be killed with SIGKILL"
print " ", error
result.errors.append((self, error))
serverout = open(syncevolog).read()
if DBusUtil.pserver is not None and DBusUtil.pserver.returncode != -15:
hasfailed = True
if hasfailed:
# give D-Bus time to settle down
time.sleep(1)
if not ShutdownSubprocess(pmonitor, 5):
if not ShutdownSubprocess(self.pmonitor, 5):
print " dbus-monitor had to be killed with SIGKILL"
result.errors.append((self,
"dbus-monitor had to be killed with SIGKILL"))
@ -607,6 +613,112 @@ class DBusUtil(Timeout):
"""True while the syncevo-dbus-server executable is still running"""
return DBusUtil.pserver and DBusUtil.pserver.poll() == None
def getChildren(self):
'''Find all children of the current process (ppid = out pid,
in our process group, or in process group of known
children). Return mapping from pid to name.'''
# Any of the known children might have its own process group.
# syncevo-dbus-server definitely does (we create it like that);
# that allows us to find its forked processes.
pgids = []
if self.pmonitor:
pgids.append(self.pmonitor.pid)
if self.pserverpid:
pgids.append(self.pserverpid)
# Maps from pid to name, process ID, process group ID.
procs = {}
statre = re.compile(r'^\d+ \((?P<name>.*?)\) \S (?P<ppid>\d+) (?P<pgid>\d+)')
for process in os.listdir('/proc'):
try:
pid = int(process)
stat = open('/proc/%d/stat' % pid).read()
m = statre.search(stat)
if m:
procs[pid] = m.groupdict()
for i in ('ppid', 'pgid'):
procs[pid][i] = int(procs[pid][i])
except:
# ignore all errors
pass
logging.printf("found processes: %s", procs)
# Now determine direct or indirect children.
children = {}
mypid = os.getpid()
def isChild(pid):
if pid == mypid:
return False
if not procs.get(pid, None):
return False
return procs[pid]['ppid'] == mypid or \
procs[pid]['pgid'] in pgids or \
isChild(procs[pid]['ppid'])
for pid, info in procs.iteritems():
if isChild(pid):
children[pid] = info['name']
# Exclude dbus-monitor and forked test-dbus.py, they are handled separately.
if self.pmonitor:
del children[self.pmonitor.pid]
del children[child]
logging.printf("found children: %s", children)
return children
def killChildren(self, delay):
'''Find all children of the current process and kill them. First send SIGTERM,
then after a grace period SIGKILL.'''
children = self.getChildren()
# First pass with SIGTERM?
if delay:
for pid, name in children.iteritems():
logging.printf("sending SIGTERM to %d %s", pid, name)
TryKill(pid, signal.SIGTERM)
start = time.time()
logging.printf("starting to wait for process termination at %s", time.asctime(time.localtime(start)))
while delay and start + delay >= time.time():
# Check if any process has quit, remove from list.
pids = copy.deepcopy(children)
def checkKnown(p):
if (p):
if p.pid in pids:
del pids[p.pid]
if p.poll() != None and p.pid in children:
logging.printf("known process %d %s has quit at %s",
p.pid, children[p.pid], time.asctime())
del children[p.pid]
checkKnown(self.pserver)
for pid, name in pids.iteritems():
try:
res = os.waitpid(pid, os.WNOHANG)
if res[0]:
# got result, remove process
logging.printf("got status for process %d %s at %s",
pid, name, time.asctime())
del children[pid]
except OSError, ex:
if ex.errno == errno.ECHILD:
# someone else must have been faster, also okay
logging.printf("process %d %s gone at %s",
pid, name, time.asctime())
del children[pid]
else:
raise ex
if not children:
# All children quit normally.
logging.printf("all process gone at %s",
time.asctime())
return []
# Force killing of remaining children. It's still possible
# that one of them quits before we get around to sending the
# signal.
logging.printf("starting to kill unresponsive processes at %s", time.asctime())
killed = []
for pid, name in children.iteritems():
if TryKill(pid, signal.SIGKILL):
logging.printf("killed %d %s", pid, name)
killed.append("%d %s" % (pid, name))
return killed
def serverExecutableHelper(self, pid):
self.assertTrue(self.isServerRunning())
maps = open("/proc/%d/maps" % pid, "r")