Merge lp:~stub/launchpad/kill-harder into lp:launchpad

Proposed by Stuart Bishop
Status: Superseded
Proposed branch: lp:~stub/launchpad/kill-harder
Merge into: lp:launchpad
Diff against target: None lines
To merge this branch: bzr merge lp:~stub/launchpad/kill-harder
Reviewer Review Type Date Requested Status
Canonical Launchpad Engineering Pending
Review via email: mp+11516@code.launchpad.net

This proposal has been superseded by a proposal from 2009-09-10.

To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote :

Addresses Bug #307447

If bin/killservice fails to kill a service with SIGTERM, it retries with SIGKILL.

To test, run 'make run' in a terminal. Then, in a different terminal, run 'bin/killservice librarian'. This will test the entire code path due to the way the librarian is spawned. Then, run 'bin/killservice launchpad' to demonstrate the normal code path where the process shuts down normally.

$ bin/killservice librarian
2009-09-10 11:13:58 INFO Killing librarian (31210)
2009-09-10 11:14:18 WARNING SIGTERM failed to kill librarian (31210). Trying SIGKILL
2009-09-10 11:14:38 ERROR SIGKILL didn't terminate librarian (31210)

$ bin/killservice launchpad
2009-09-10 11:14:47 INFO Killing launchpad (31200)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/scripts/utilities/killservice.py'
2--- lib/lp/scripts/utilities/killservice.py 2009-07-24 12:32:28 +0000
3+++ lib/lp/scripts/utilities/killservice.py 2009-09-10 10:48:39 +0000
4@@ -8,7 +8,8 @@
5 __metaclass__ = type
6
7 import os, logging
8-from signal import SIGTERM
9+from signal import SIGKILL, SIGTERM
10+import time
11 from optparse import OptionParser
12 from canonical.config import config
13 from canonical.lazr.pidfile import get_pid, pidfile_path, remove_pidfile
14@@ -16,18 +17,61 @@
15 from canonical.launchpad.mailman.runmailman import stop_mailman
16
17
18+log = None # Initialized in main()
19+
20+
21+def process_exists(pid):
22+ """True if the given process exists."""
23+ try:
24+ pgid = os.getpgid(pid)
25+ except OSError, x:
26+ if x.errno == 3:
27+ return False
28+ log.error("Unknown exception from getpgid - %s", str(x))
29+ return True
30+
31+
32+def wait_for_pids(pids, wait, log):
33+ """
34+ Wait until all signalled processes are dead, or until we hit the
35+ timeout.
36+
37+ Processes discovered to be dead are removed from the list.
38+
39+ :param pids: A list of (service, pid).
40+
41+ :param wait: How many seconds to wait.
42+ """
43+ wait_start = time.time()
44+ while pids and time.time() < wait_start + wait:
45+ for service, pid in pids[:]:
46+ if not process_exists(pid):
47+ pids.remove((service, pid))
48+ time.sleep(0.1)
49+
50+
51 def main():
52 parser = OptionParser('Usage: %prog [options] [SERVICE ...]')
53+ parser.add_option("-w", "--wait", metavar="SECS",
54+ default=20, type="int",
55+ help="Wait up to SECS seconds for processes "
56+ "to die before retrying with SIGKILL")
57 logger_options(parser, logging.INFO)
58 (options, args) = parser.parse_args()
59 log = logger(options)
60 if len(args) < 1:
61 parser.error('No service name provided')
62- for service in args:
63- # Mailman is special, but only stop it if it was launched.
64- if service == 'mailman' and config.mailman.launch:
65+
66+ pids = [] # List of pids we tried to kill.
67+ services = args[:]
68+
69+ # Mailman is special, but only stop it if it was launched.
70+ if 'mailman' in services:
71+ if config.mailman.launch:
72 stop_mailman()
73- continue
74+ services.remove('mailman')
75+
76+ for service in services:
77 log.debug("PID file is %s", pidfile_path(service))
78 try:
79 pid = get_pid(service)
80@@ -38,12 +82,43 @@
81 log.info("Killing %s (%d)", service, pid)
82 try:
83 os.kill(pid, SIGTERM)
84+ pids.append((service, pid))
85 except OSError, x:
86- log.error("Unable to kill %s (%d) - %s",
87- service, pid, x.strerror)
88+ log.error(
89+ "Unable to SIGTERM %s (%d) - %s",
90+ service, pid, x.strerror)
91+ else:
92+ log.debug("No PID file for %s", service)
93+
94+ wait_for_pids(pids, options.wait, log)
95+
96+ # Anything that didn't die, kill harder with SIGKILL.
97+ for service, pid in pids:
98+ if not process_exists(pid):
99+ continue
100+ log.warn(
101+ "SIGTERM failed to kill %s (%d). Trying SIGKILL", service, pid)
102+ try:
103+ os.kill(pid, SIGKILL)
104+ except OSError, x:
105+ log.error(
106+ "Unable to SIGKILL %s (%d) - %s", service, pid, x.strerror)
107+
108+ wait_for_pids(pids, options.wait, log)
109+
110+ # Report anything still left running after a SIGKILL.
111+ for service, pid in pids:
112+ if process_exists(pid):
113+ log.error("SIGKILL didn't terminate %s (%d)", service, pid)
114+
115+ # Remove any pidfiles that didn't get cleaned up if there is no
116+ # corresponding process (from an unkillable process, or maybe some
117+ # other job has relaunched it while we were not looking).
118+ for service in services:
119+ pid = get_pid(service)
120+ if pid is not None and not process_exists(pid):
121 try:
122 remove_pidfile(service)
123 except OSError:
124 pass
125- else:
126- log.debug("No PID file for %s", service)
127+