Merge lp:~stub/launchpad/kill-harder into lp:launchpad

Proposed by Stuart Bishop
Status: Merged
Merged at revision: not available
Proposed branch: lp:~stub/launchpad/kill-harder
Merge into: lp:launchpad
Diff against target: None lines
To merge this branch: bzr merge lp:~stub/launchpad/kill-harder
Reviewer Review Type Date Requested Status
Eleanor Berger (community) Approve
Review via email: mp+11517@code.launchpad.net

This proposal supersedes a proposal from 2009-09-10.

To post a comment you must log in.
Revision history for this message
Stuart Bishop (stub) wrote : Posted in a previous version of this proposal

Addresses Bug #307447

If bin/killservice fails to kill a service with a SIGTERM, retry again with a SIGKILL.

To test, run 'make run' in a terminal. Then, in a different terminal, run 'bin/killservice librarian'. This will test the entire code path due to the way the librarian is spawned. Then, run 'bin/killservice launchpad' to demonstrate the normal code path where the process shuts down normally.

$ bin/killservice librarian
2009-09-10 11:13:58 INFO Killing librarian (31210)
2009-09-10 11:14:18 WARNING SIGTERM failed to kill librarian (31210). Trying SIGKILL
2009-09-10 11:14:38 ERROR SIGKILL didn't terminate librarian (31210)

$ bin/killservice launchpad
2009-09-10 11:14:47 INFO Killing launchpad (31200)

Revision history for this message
Stuart Bishop (stub) wrote :

Addresses Bug #307447

If bin/killservice fails to kill a service with a SIGTERM, retry again with a SIGKILL.

To test, run 'make run' in a terminal. Then, in a different terminal, run 'bin/killservice librarian'. This will test the entire code path due to the way the librarian is spawned. Then, run 'bin/killservice launchpad' to demonstrate the normal code path where the process shuts down normally.

$ bin/killservice librarian
2009-09-10 11:13:58 INFO Killing librarian (31210)
2009-09-10 11:14:18 WARNING SIGTERM failed to kill librarian (31210). Trying SIGKILL
2009-09-10 11:14:38 ERROR SIGKILL didn't terminate librarian (31210)

$ bin/killservice launchpad
2009-09-10 11:14:47 INFO Killing launchpad (31200)

Revision history for this message
Eleanor Berger (intellectronica) wrote :

r=me

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'lib/lp/scripts/utilities/killservice.py'
--- lib/lp/scripts/utilities/killservice.py 2009-07-24 12:32:28 +0000
+++ lib/lp/scripts/utilities/killservice.py 2009-09-10 11:20:16 +0000
@@ -8,7 +8,8 @@
8__metaclass__ = type8__metaclass__ = type
99
10import os, logging10import os, logging
11from signal import SIGTERM11from signal import SIGKILL, SIGTERM
12import time
12from optparse import OptionParser13from optparse import OptionParser
13from canonical.config import config14from canonical.config import config
14from canonical.lazr.pidfile import get_pid, pidfile_path, remove_pidfile15from canonical.lazr.pidfile import get_pid, pidfile_path, remove_pidfile
@@ -18,16 +19,26 @@
1819
19def main():20def main():
20 parser = OptionParser('Usage: %prog [options] [SERVICE ...]')21 parser = OptionParser('Usage: %prog [options] [SERVICE ...]')
22 parser.add_option("-w", "--wait", metavar="SECS",
23 default=20, type="int",
24 help="Wait up to SECS seconds for processes "
25 "to die before retrying with SIGKILL")
21 logger_options(parser, logging.INFO)26 logger_options(parser, logging.INFO)
22 (options, args) = parser.parse_args()27 (options, args) = parser.parse_args()
23 log = logger(options)28 log = logger(options)
24 if len(args) < 1:29 if len(args) < 1:
25 parser.error('No service name provided')30 parser.error('No service name provided')
26 for service in args:31
27 # Mailman is special, but only stop it if it was launched.32 pids = [] # List of pids we tried to kill.
28 if service == 'mailman' and config.mailman.launch:33 services = args[:]
34
35 # Mailman is special, but only stop it if it was launched.
36 if 'mailman' in services:
37 if config.mailman.launch:
29 stop_mailman()38 stop_mailman()
30 continue39 services.remove('mailman')
40
41 for service in services:
31 log.debug("PID file is %s", pidfile_path(service))42 log.debug("PID file is %s", pidfile_path(service))
32 try:43 try:
33 pid = get_pid(service)44 pid = get_pid(service)
@@ -38,12 +49,72 @@
38 log.info("Killing %s (%d)", service, pid)49 log.info("Killing %s (%d)", service, pid)
39 try:50 try:
40 os.kill(pid, SIGTERM)51 os.kill(pid, SIGTERM)
52 pids.append((service, pid))
41 except OSError, x:53 except OSError, x:
42 log.error("Unable to kill %s (%d) - %s",54 log.error(
43 service, pid, x.strerror)55 "Unable to SIGTERM %s (%d) - %s",
56 service, pid, x.strerror)
57 else:
58 log.debug("No PID file for %s", service)
59
60 wait_for_pids(pids, options.wait, log)
61
62 # Anything that didn't die, kill harder with SIGKILL.
63 for service, pid in pids:
64 if not process_exists(pid):
65 continue
66 log.warn(
67 "SIGTERM failed to kill %s (%d). Trying SIGKILL", service, pid)
68 try:
69 os.kill(pid, SIGKILL)
70 except OSError, x:
71 log.error(
72 "Unable to SIGKILL %s (%d) - %s", service, pid, x.strerror)
73
74 wait_for_pids(pids, options.wait, log)
75
76 # Report anything still left running after a SIGKILL.
77 for service, pid in pids:
78 if process_exists(pid):
79 log.error("SIGKILL didn't terminate %s (%d)", service, pid)
80
81 # Remove any pidfiles that didn't get cleaned up if there is no
82 # corresponding process (from an unkillable process, or maybe some
83 # other job has relaunched it while we were not looking).
84 for service in services:
85 pid = get_pid(service)
86 if pid is not None and not process_exists(pid):
44 try:87 try:
45 remove_pidfile(service)88 remove_pidfile(service)
46 except OSError:89 except OSError:
47 pass90 pass
48 else:91
49 log.debug("No PID file for %s", service)92
def process_exists(pid, log=None):
    """Return True if a process with the given pid appears to exist.

    The check calls os.getpgid(); only whether the call succeeds or
    fails is used, the returned process group id is discarded.

    :param pid: The process id to look up.

    :param log: Optional logger used to report unexpected errors. When
        omitted, the root logger from the logging module is used.

    :return: False if the lookup fails with ESRCH (no such process),
        True otherwise. Any other OSError is logged and the process is
        conservatively reported as still existing.
    """
    import errno

    try:
        os.getpgid(pid)
    except OSError as x:
        if x.errno == errno.ESRCH:
            return False
        # No logger is in scope here unless one was passed in; fall back
        # to the root logger rather than failing with a NameError.
        if log is None:
            log = logging.getLogger()
        log.error("Unknown exception from getpgid - %s", str(x))
    return True
102
103
def wait_for_pids(pids, wait, log):
    """
    Poll the signalled processes until none remain or the timeout expires.

    The list is modified in place: every entry whose process is found
    to be gone is removed, so whatever is left when this returns is
    still running (or was never checked because the timeout had
    already passed).

    :param pids: A list of (service, pid).

    :param wait: How many seconds to wait.

    :param log: A logger. Not used by this function.
    """
    deadline = time.time() + wait
    while pids and time.time() < deadline:
        # Walk a snapshot so entries can be dropped from the live list.
        for service, pid in list(pids):
            if process_exists(pid):
                continue
            pids.remove((service, pid))
        # Pause briefly between polling passes.
        time.sleep(0.1)