Merge ~wgrant/launchpad:buildd-manager-nicer-retries into launchpad:master

Proposed by William Grant
Status: Merged
Approved by: William Grant
Approved revision: 272d66a896f028186c79cf1474eab1505f6febfe
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~wgrant/launchpad:buildd-manager-nicer-retries
Merge into: launchpad:master
Prerequisite: ~wgrant/launchpad:buildd-manager-failure-metrics
Diff against target: 312 lines (+155/-6)
2 files modified
lib/lp/buildmaster/manager.py (+40/-0)
lib/lp/buildmaster/tests/test_manager.py (+115/-6)
Reviewer Review Type Date Requested Status
Colin Watson (community) Approve
Review via email: mp+454694@code.launchpad.net

Commit message

Cope more gracefully with intermittent builder glitches

Description of the change

buildd-manager would previously immediately count any single scan
failure against the builder and job. This meant that three glitches --
say, network timeouts -- over the course of a job would result in the
build being requeued. A builder's failure count is reset on successful
dispatch, but a job's deliberately isn't since we want to fail builds
that are repeatedly killing builders. This meant that a single network
glitch in the second attempt at a build would cause it to be failed.

This added layer of failure counting substantially reduces the
likelihood of those two scenarios, by requiring five consecutive
unsuccessful scans before a single failure is counted against a builder
or job. This means that brief network interruptions, or indeed temporary
insanity on buildd-manager's part, should no longer cause builds to be
requeued or failed at all.

The only significant downside of this change is that recovery from
legitimate failures will now take a few minutes longer. But that's much
less of a concern with the very large build farm we have nowadays.

To post a comment you must log in.
Revision history for this message
Colin Watson (cjwatson) wrote :

Very nice. Thanks!

review: Approve
Revision history for this message
Otto Co-Pilot (otto-copilot) wrote :

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
2index cf29548..13ace0c 100644
3--- a/lib/lp/buildmaster/manager.py
4+++ b/lib/lp/buildmaster/manager.py
5@@ -54,6 +54,10 @@ from lp.services.statsd.interfaces.statsd_client import IStatsdClient
6 BUILDD_MANAGER_LOG_NAME = "worker-scanner"
7
8
9+# The number of times the scan of a builder can fail before we start
10+# attributing it to the builder and job.
11+SCAN_FAILURE_THRESHOLD = 5
12+
13 # The number of times a builder can consecutively fail before we
14 # reset its current job.
15 JOB_RESET_THRESHOLD = 3
16@@ -515,6 +519,12 @@ class WorkerScanner:
17
18 self.can_retry = True
19
20+ # The build and job failure counts are persisted, but we only really
21+ # care about the consecutive scan failure count over the space of a
22+ # couple of minutes, so it's okay if it resets on buildd-manager
23+ # restart.
24+ self.scan_failure_count = 0
25+
26 # We cache the build cookie, keyed on the BuildQueue, to avoid
27 # hitting the DB on every scan.
28 self._cached_build_cookie = None
29@@ -553,6 +563,12 @@ class WorkerScanner:
30
31 try:
32 yield self.scan()
33+
34+ # We got through a scan without error, so reset the consecutive
35+ # failure count. We don't reset the persistent builder or job
36+ # failure counts, since the build might consistently break the
37+ # builder later in the build.
38+ self.scan_failure_count = 0
39 except Exception as e:
40 self._scanFailed(self.can_retry, e)
41
42@@ -591,6 +607,11 @@ class WorkerScanner:
43 "Scanning %s failed" % self.builder_name, exc_info=exc
44 )
45
46+ # Certain errors can't possibly be a glitch, and they can insta-fail
47+ # even if the scan phase would normally allow a retry.
48+ if isinstance(exc, (BuildDaemonIsolationError, CannotResumeHost)):
49+ retry = False
50+
51 # Decide if we need to terminate the job or reset/fail the builder.
52 vitals = self.builder_factory.getVitals(self.builder_name)
53 builder = self.builder_factory[self.builder_name]
54@@ -599,6 +620,24 @@ class WorkerScanner:
55 self.statsd_client.incr(
56 "builders.failure.scan_failed", labels=labels
57 )
58+ self.scan_failure_count += 1
59+
60+ # To avoid counting network glitches or similar against innocent
61+ # builds and jobs, we allow a scan to fail a few times in a row
62+ # without consequence, and just retry. If we exceed the threshold,
63+ # we then persistently record a single failure against the build
64+ # and job.
65+ if retry and self.scan_failure_count < SCAN_FAILURE_THRESHOLD:
66+ self.statsd_client.incr(
67+ "builders.failure.scan_retried",
68+ labels={
69+ "failures": str(self.scan_failure_count),
70+ **labels,
71+ },
72+ )
73+ return
74+
75+ self.scan_failure_count = 0
76
77 builder.gotFailure()
78 if builder.current_build is not None:
79@@ -612,6 +651,7 @@ class WorkerScanner:
80 "Miserable failure when trying to handle failure:\n",
81 exc_info=True,
82 )
83+ finally:
84 transaction.abort()
85
86 @defer.inlineCallbacks
87diff --git a/lib/lp/buildmaster/tests/test_manager.py b/lib/lp/buildmaster/tests/test_manager.py
88index 2d3b6d5..78d42df 100644
89--- a/lib/lp/buildmaster/tests/test_manager.py
90+++ b/lib/lp/buildmaster/tests/test_manager.py
91@@ -40,6 +40,7 @@ from lp.buildmaster.interfaces.buildqueue import IBuildQueueSet
92 from lp.buildmaster.manager import (
93 BUILDER_FAILURE_THRESHOLD,
94 JOB_RESET_THRESHOLD,
95+ SCAN_FAILURE_THRESHOLD,
96 BuilddManager,
97 BuilderFactory,
98 PrefetchedBuilderFactory,
99@@ -133,7 +134,13 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
100 self.assertEqual(job.specific_build.status, BuildStatus.BUILDING)
101 self.assertEqual(job.logtail, logtail)
102
103- def _getScanner(self, builder_name=None, clock=None, builder_factory=None):
104+ def _getScanner(
105+ self,
106+ builder_name=None,
107+ clock=None,
108+ builder_factory=None,
109+ scan_failure_count=0,
110+ ):
111 """Instantiate a WorkerScanner object.
112
113 Replace its default logging handler by a testing version.
114@@ -148,6 +155,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
115 builder_name, builder_factory, manager, BufferLogger(), clock=clock
116 )
117 scanner.logger.name = "worker-scanner"
118+ scanner.scan_failure_count = scan_failure_count
119
120 return scanner
121
122@@ -458,6 +466,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
123 @defer.inlineCallbacks
124 def _assertFailureCounting(
125 self,
126+ scan_count,
127 builder_count,
128 job_count,
129 expected_builder_count,
130@@ -470,7 +479,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
131 def failing_scan():
132 return defer.fail(Exception("fake exception"))
133
134- scanner = self._getScanner()
135+ scanner = self._getScanner(scan_failure_count=scan_count)
136 scanner.scan = failing_scan
137 from lp.buildmaster import manager as manager_module
138
139@@ -497,10 +506,98 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
140 )
141 self.assertEqual(1, manager_module.recover_failure.call_count)
142
143+ @defer.inlineCallbacks
144+ def test_scan_persistent_failure_counts(self):
145+ # The first few scan exceptions just result in retries, not penalties
146+ # for the involved parties. A builder or job failure is only recorded
147+ # after several scan failures.
148+ scanner = self._getScanner()
149+ builder = getUtility(IBuilderSet)[scanner.builder_name]
150+ transaction.commit()
151+
152+ self.patch(scanner, "scan", lambda: defer.fail(Exception("fake")))
153+
154+ # Rack up almost enough failures.
155+ for i in range(1, SCAN_FAILURE_THRESHOLD):
156+ yield scanner.singleCycle()
157+ self.assertEqual(i, scanner.scan_failure_count)
158+ self.assertEqual(0, builder.failure_count)
159+ self.assertEqual(0, builder.current_build.failure_count)
160+
161+ # Once we reach the consecutive failure threshold, the builder and
162+ # build each get a failure and the count is reset.
163+ yield scanner.singleCycle()
164+ self.assertEqual(0, scanner.scan_failure_count)
165+ self.assertEqual(1, builder.failure_count)
166+ self.assertEqual(1, builder.current_build.failure_count)
167+
168+ @defer.inlineCallbacks
169+ def test_scan_intermittent_failure_retries(self):
170+ # A successful scan after a few failures resets the failure count.
171+ scanner = self._getScanner()
172+ builder = getUtility(IBuilderSet)[scanner.builder_name]
173+ transaction.commit()
174+
175+ # Rack up a couple of failures.
176+ self.patch(scanner, "scan", lambda: defer.fail(Exception("fake")))
177+
178+ yield scanner.singleCycle()
179+ self.assertEqual(1, scanner.scan_failure_count)
180+ self.assertEqual(0, builder.failure_count)
181+ self.assertEqual(0, builder.current_build.failure_count)
182+ self.assertEqual(
183+ [
184+ mock.call(
185+ "builders.failure.scan_failed,arch=386,build=True,"
186+ "builder_name=bob,env=test,job_type=PACKAGEBUILD,region=,"
187+ "virtualized=False"
188+ ),
189+ mock.call(
190+ "builders.failure.scan_retried,arch=386,build=True,"
191+ "builder_name=bob,env=test,failures=1,"
192+ "job_type=PACKAGEBUILD,region=,virtualized=False"
193+ ),
194+ ],
195+ self.stats_client.incr.mock_calls,
196+ )
197+ self.stats_client.incr.reset_mock()
198+
199+ yield scanner.singleCycle()
200+ self.assertEqual(2, scanner.scan_failure_count)
201+ self.assertEqual(0, builder.failure_count)
202+ self.assertEqual(0, builder.current_build.failure_count)
203+ self.assertEqual(
204+ [
205+ mock.call(
206+ "builders.failure.scan_failed,arch=386,build=True,"
207+ "builder_name=bob,env=test,job_type=PACKAGEBUILD,"
208+ "region=,virtualized=False"
209+ ),
210+ mock.call(
211+ "builders.failure.scan_retried,arch=386,build=True,"
212+ "builder_name=bob,env=test,failures=2,"
213+ "job_type=PACKAGEBUILD,region=,virtualized=False"
214+ ),
215+ ],
216+ self.stats_client.incr.mock_calls,
217+ )
218+ self.stats_client.incr.reset_mock()
219+
220+ # Since we didn't meet SCAN_FAILURE_THRESHOLD, a success just resets
221+ # the count and no harm is done to innocent bystanders.
222+ self.patch(scanner, "scan", lambda: defer.succeed(None))
223+ yield scanner.singleCycle()
224+ self.assertEqual(0, scanner.scan_failure_count)
225+ self.assertEqual(0, builder.failure_count)
226+ self.assertEqual(0, builder.current_build.failure_count)
227+ self.assertEqual([], self.stats_client.incr.mock_calls)
228+ self.stats_client.incr.reset_mock()
229+
230 def test_scan_first_fail(self):
231 # The first failure of a job should result in the failure_count
232 # on the job and the builder both being incremented.
233 return self._assertFailureCounting(
234+ scan_count=SCAN_FAILURE_THRESHOLD - 1,
235 builder_count=0,
236 job_count=0,
237 expected_builder_count=1,
238@@ -511,6 +608,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
239 # The first failure of a job should result in the failure_count
240 # on the job and the builder both being incremented.
241 return self._assertFailureCounting(
242+ scan_count=SCAN_FAILURE_THRESHOLD - 1,
243 builder_count=1,
244 job_count=0,
245 expected_builder_count=2,
246@@ -521,6 +619,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
247 # The first failure of a job should result in the failure_count
248 # on the job and the builder both being incremented.
249 return self._assertFailureCounting(
250+ scan_count=SCAN_FAILURE_THRESHOLD - 1,
251 builder_count=0,
252 job_count=1,
253 expected_builder_count=1,
254@@ -532,7 +631,9 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
255 def failing_scan():
256 return defer.fail(Exception("fake exception"))
257
258- scanner = self._getScanner()
259+ scanner = self._getScanner(
260+ scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
261+ )
262 scanner.scan = failing_scan
263 builder = getUtility(IBuilderSet)[scanner.builder_name]
264 builder.failure_count = BUILDER_FAILURE_THRESHOLD
265@@ -547,7 +648,10 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
266 def failing_scan():
267 return defer.fail(Exception("fake exception"))
268
269- scanner = self._getScanner()
270+ # TODO: Add and test metrics for retried scan failures.
271+ scanner = self._getScanner(
272+ scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
273+ )
274 scanner.scan = failing_scan
275 builder = getUtility(IBuilderSet)[scanner.builder_name]
276 builder.failure_count = BUILDER_FAILURE_THRESHOLD
277@@ -680,7 +784,9 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
278 # Run 'scan' and check its results.
279 switch_dbuser(config.builddmaster.dbuser)
280 clock = task.Clock()
281- scanner = self._getScanner(clock=clock)
282+ scanner = self._getScanner(
283+ clock=clock, scan_failure_count=SCAN_FAILURE_THRESHOLD - 1
284+ )
285 yield scanner.scan()
286
287 # An abort request should be sent.
288@@ -689,6 +795,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
289
290 # Advance time a little. Nothing much should happen.
291 clock.advance(1)
292+ scanner.scan_failure_count = SCAN_FAILURE_THRESHOLD - 1
293 yield scanner.scan()
294 self.assertEqual(1, worker.call_log.count("abort"))
295 self.assertEqual(BuildStatus.CANCELLING, build.status)
296@@ -697,6 +804,7 @@ class TestWorkerScannerScan(StatsMixin, TestCaseWithFactory):
297 # we should have also called the resume() method on the worker that
298 # resets the virtual machine.
299 clock.advance(WorkerScanner.CANCEL_TIMEOUT)
300+ scanner.scan_failure_count = SCAN_FAILURE_THRESHOLD - 1
301 yield scanner.singleCycle()
302 self.assertEqual(1, worker.call_log.count("abort"))
303 self.assertEqual(BuilderCleanStatus.DIRTY, builder.clean_status)
304@@ -916,7 +1024,8 @@ class TestPrefetchedBuilderFactory(TestCaseWithFactory):
305 # with the non-prefetching BuilderFactory.
306 self.assertEqual(7, len(all_vitals))
307 self.assertEqual(
308- 4, len([v for v in all_vitals if v.build_queue is not None])
309+ SCAN_FAILURE_THRESHOLD - 1,
310+ len([v for v in all_vitals if v.build_queue is not None]),
311 )
312 self.assertContentEqual(BuilderFactory().iterVitals(), all_vitals)
313

Subscribers

People subscribed via source and target branches

to status/vote changes: