Merge ~wgrant/launchpad:buildd-manager-failure-refactor into launchpad:master

Proposed by William Grant
Status: Merged
Approved by: William Grant
Approved revision: a981c1869b22567c664c9805190e2323c396dacb
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~wgrant/launchpad:buildd-manager-failure-refactor
Merge into: launchpad:master
Diff against target: 135 lines (+28/-33)
1 file modified
lib/lp/buildmaster/manager.py (+28/-33)
Reviewer | Review Type | Date Requested | Status
Colin Watson (community) | | | Approve
Review via email: mp+454692@code.launchpad.net

Commit message

Refactor buildd-manager job dispatch error handling

manager.py now uses inlineCallbacks throughout, rather than explicit Deferred callback chains.

To post a comment you must log in.
Revision history for this message
Colin Watson (cjwatson):
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
2index 7f2b870..0c6d335 100644
3--- a/lib/lp/buildmaster/manager.py
4+++ b/lib/lp/buildmaster/manager.py
5@@ -11,7 +11,6 @@ __all__ = [
6 ]
7
8 import datetime
9-import functools
10 import logging
11 import os.path
12 import shutil
13@@ -502,6 +501,8 @@ class WorkerScanner:
14 self.date_cancel = None
15 self.date_scanned = None
16
17+ self.can_retry = True
18+
19 # We cache the build cookie, keyed on the BuildQueue, to avoid
20 # hitting the DB on every scan.
21 self._cached_build_cookie = None
22@@ -520,6 +521,7 @@ class WorkerScanner:
23 """Terminate the LoopingCall."""
24 self.loop.stop()
25
26+ @defer.inlineCallbacks
27 def singleCycle(self):
28 # Inhibit scanning if the BuilderFactory hasn't updated since
29 # the last run. This doesn't matter for the base BuilderFactory,
30@@ -533,22 +535,19 @@ class WorkerScanner:
31 self.logger.debug(
32 "Skipping builder %s (cache out of date)" % self.builder_name
33 )
34- return defer.succeed(None)
35+ return
36
37 self.logger.debug("Scanning builder %s" % self.builder_name)
38- # Errors should normally be able to be retried a few times. Bits
39- # of scan() which don't want retries will call _scanFailed
40- # directly.
41- d = self.scan()
42- d.addErrback(functools.partial(self._scanFailed, True))
43- d.addBoth(self._updateDateScanned)
44- return d
45
46- def _updateDateScanned(self, ignored):
47+ try:
48+ yield self.scan()
49+ except Exception as e:
50+ self._scanFailed(self.can_retry, e)
51+
52 self.logger.debug("Scan finished for builder %s" % self.builder_name)
53 self.date_scanned = datetime.datetime.utcnow()
54
55- def _scanFailed(self, retry, failure):
56+ def _scanFailed(self, retry, exc):
57 """Deal with failures encountered during the scan cycle.
58
59 1. Print the error in the log
60@@ -562,26 +561,22 @@ class WorkerScanner:
61
62 # If we don't recognise the exception include a stack trace with
63 # the error.
64- error_message = failure.getErrorMessage()
65- if failure.check(
66- BuildWorkerFailure,
67- CannotBuild,
68- CannotResumeHost,
69- BuildDaemonError,
70- CannotFetchFile,
71+ if isinstance(
72+ exc,
73+ (
74+ BuildWorkerFailure,
75+ CannotBuild,
76+ CannotResumeHost,
77+ BuildDaemonError,
78+ CannotFetchFile,
79+ ),
80 ):
81 self.logger.info(
82- "Scanning %s failed with: %s"
83- % (self.builder_name, error_message)
84+ "Scanning %s failed with: %r" % (self.builder_name, exc)
85 )
86 else:
87 self.logger.info(
88- "Scanning %s failed with: %s\n%s"
89- % (
90- self.builder_name,
91- failure.getErrorMessage(),
92- failure.getTraceback(),
93- )
94+ "Scanning %s failed" % self.builder_name, exc_info=exc
95 )
96
97 # Decide if we need to terminate the job or reset/fail the builder.
98@@ -602,7 +597,7 @@ class WorkerScanner:
99 else:
100 labels["build"] = False
101 self.statsd_client.incr("builders.judged_failed", labels=labels)
102- recover_failure(self.logger, vitals, builder, retry, failure.value)
103+ recover_failure(self.logger, vitals, builder, retry, exc)
104 transaction.commit()
105 except Exception:
106 # Catastrophic code failure! Not much we can do.
107@@ -687,6 +682,7 @@ class WorkerScanner:
108 vitals = self.builder_factory.getVitals(self.builder_name)
109 interactor = self.interactor_factory()
110 worker = self.worker_factory(vitals)
111+ self.can_retry = True
112
113 if vitals.build_queue is not None:
114 if vitals.clean_status != BuilderCleanStatus.DIRTY:
115@@ -759,15 +755,14 @@ class WorkerScanner:
116 "%s is in manual mode, not dispatching.", vitals.name
117 )
118 return
119- # Try to find and dispatch a job. If it fails, don't
120- # attempt to just retry the scan; we need to reset
121- # the job so the dispatch will be reattempted.
122+ # Try to find and dispatch a job. If it fails, don't attempt to
123+ # just retry the scan; we need to reset the job so the dispatch
124+ # will be reattempted.
125 builder = self.builder_factory[self.builder_name]
126- d = interactor.findAndStartJob(
127+ self.can_retry = False
128+ yield interactor.findAndStartJob(
129 vitals, builder, worker, self.builder_factory
130 )
131- d.addErrback(functools.partial(self._scanFailed, False))
132- yield d
133 if builder.currentjob is not None:
134 # After a successful dispatch we can reset the
135 # failure_count.

Subscribers

People subscribed via source and target branches

to status/vote changes: