Merge lp:~julian-edwards/launchpad/double-build-bug-705342 into lp:launchpad

Proposed by Julian Edwards
Status: Merged
Approved by: Julian Edwards
Approved revision: no longer in the source branch.
Merged at revision: 12322
Proposed branch: lp:~julian-edwards/launchpad/double-build-bug-705342
Merge into: lp:launchpad
Prerequisite: lp:~jml/launchpad/failing-tests-bug-711209
Diff against target: 96 lines (+43/-5)
4 files modified
lib/canonical/widgets/__init__.py (+12/-0)
lib/lp/buildmaster/model/builder.py (+10/-0)
lib/lp/buildmaster/tests/mock_slaves.py (+2/-2)
lib/lp/buildmaster/tests/test_builder.py (+19/-3)
To merge this branch: bzr merge lp:~julian-edwards/launchpad/double-build-bug-705342
Reviewer Review Type Date Requested Status
Leonard Richardson (community) Approve
Review via email: mp+48219@code.launchpad.net

Commit message

[r=leonardr][ui=none] [r=leonardr][bug=705342] Fail nonvirtual builders in the ABORTED state rather than cleaning them, since we can't guarantee that there's no active build.

Description of the change

The attached bug describes how some build logs end up with two apparently separate builds in them. This happens when a buildd-admin kills the builder's processes off but doesn't kill everything quite properly for whatever reason. Normally the buildd-manager will handle this fine for virtual builders, as when it sees an ABORTED state it resets the builder.

However for nonvirtual builders we just try and clean the slave which is clamitous due to a bug in sbuild which won't kill the actual build process off, thus leaving the builder open to receiving a second build.

This change just fails the (nonvirtual) builder if it gets into this state, since there's nothing we can do on the manager side.

To post a comment you must log in.
Revision history for this message
Leonard Richardson (leonardr) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added directory 'lib/canonical/widgets'
2=== added file 'lib/canonical/widgets/__init__.py'
3--- lib/canonical/widgets/__init__.py 1970-01-01 00:00:00 +0000
4+++ lib/canonical/widgets/__init__.py 2011-02-02 21:39:04 +0000
5@@ -0,0 +1,12 @@
6+# Copyright 2009-2011 Canonical Ltd. This software is licensed under the
7+# GNU Affero General Public License version 3 (see the file LICENSE).
8+
9+# This is a stub to keep canonical.shipit operational. this module
10+# can delete when shipit is independent.
11+
12+from lp.app.widgets.itemswidgets import (
13+ CheckBoxMatrixWidget,
14+ LabeledMultiCheckBoxWidget,
15+ )
16+
17+
18
19=== modified file 'lib/lp/buildmaster/model/builder.py'
20--- lib/lp/buildmaster/model/builder.py 2010-12-06 04:41:45 +0000
21+++ lib/lp/buildmaster/model/builder.py 2011-02-03 16:31:19 +0000
22@@ -338,6 +338,16 @@
23 # communications with the slave and the build manager had to reset
24 # the job.
25 if status == 'BuilderStatus.ABORTED' and builder.currentjob is None:
26+ if not builder.virtualized:
27+ # We can't reset non-virtual builders reliably as the
28+ # abort() function doesn't kill the actual build job,
29+ # only the sbuild process! All we can do here is fail
30+ # the builder with a message indicating the problem and
31+ # wait for an admin to reboot it.
32+ builder.failBuilder(
33+ "Non-virtual builder in ABORTED state, requires admin to "
34+ "restart")
35+ return "dummy status"
36 if logger is not None:
37 logger.info(
38 "Builder '%s' being cleaned up from ABORTED" %
39
40=== modified file 'lib/lp/buildmaster/tests/mock_slaves.py'
41--- lib/lp/buildmaster/tests/mock_slaves.py 2010-11-29 14:51:07 +0000
42+++ lib/lp/buildmaster/tests/mock_slaves.py 2011-02-03 16:31:19 +0000
43@@ -239,11 +239,11 @@
44 """A mock slave that looks like it's aborted."""
45
46 def clean(self):
47- self.call_log.append('status')
48+ self.call_log.append('clean')
49 return defer.succeed(None)
50
51 def status(self):
52- self.call_log.append('clean')
53+ self.call_log.append('status')
54 return defer.succeed(('BuilderStatus.ABORTED', '1-1'))
55
56
57
58=== modified file 'lib/lp/buildmaster/tests/test_builder.py'
59--- lib/lp/buildmaster/tests/test_builder.py 2010-12-20 03:21:03 +0000
60+++ lib/lp/buildmaster/tests/test_builder.py 2011-02-03 16:31:19 +0000
61@@ -279,9 +279,10 @@
62 builder = removeSecurityProxy(self.factory.makeBuilder())
63 self.assertEqual(builder.url, builder.slave.url)
64
65- def test_recovery_of_aborted_slave(self):
66- # If a slave is in the ABORTED state, rescueBuilderIfLost should
67- # clean it if we don't think it's currently building anything.
68+ def test_recovery_of_aborted_virtual_slave(self):
69+ # If a virtual_slave is in the ABORTED state,
70+ # rescueBuilderIfLost should clean it if we don't think it's
71+ # currently building anything.
72 # See bug 463046.
73 aborted_slave = AbortedSlave()
74 builder = MockBuilder("mock_builder", aborted_slave)
75@@ -291,6 +292,21 @@
76 self.assertIn('clean', aborted_slave.call_log)
77 return d.addCallback(check_slave_calls)
78
79+ def test_recovery_of_aborted_nonvirtual_slave(self):
80+ # Nonvirtual slaves in the ABORTED state cannot be reliably
81+ # cleaned since the sbuild process doesn't properly kill the
82+ # build job. We test that the builder is marked failed.
83+ aborted_slave = AbortedSlave()
84+ builder = MockBuilder("mock_builder", aborted_slave)
85+ builder.currentjob = None
86+ builder.virtualized = False
87+ builder.builderok = True
88+ d = builder.rescueIfLost()
89+ def check_failed(ignored):
90+ self.assertFalse(builder.builderok)
91+ self.assertNotIn('clean', aborted_slave.call_log)
92+ return d.addCallback(check_failed)
93+
94 def test_recover_ok_slave(self):
95 # An idle slave is not rescued.
96 slave = OkSlave()