Merge ~wgrant/launchpad:buildd-manager-failure-refactor into launchpad:master

Proposed by William Grant
Status: Merged
Approved by: William Grant
Approved revision: a981c1869b22567c664c9805190e2323c396dacb
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~wgrant/launchpad:buildd-manager-failure-refactor
Merge into: launchpad:master
Diff against target: 135 lines (+28/-33)
1 file modified
lib/lp/buildmaster/manager.py (+28/-33)
Reviewer | Review Type | Date Requested | Status
Colin Watson (community) | | | Approve
Review via email: mp+454692@code.launchpad.net

Commit message

Refactor buildd-manager job dispatch error handling

manager.py is now entirely inlineCallbacks.

To post a comment you must log in.
Revision history for this message
Colin Watson (cjwatson) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
diff --git a/lib/lp/buildmaster/manager.py b/lib/lp/buildmaster/manager.py
index 7f2b870..0c6d335 100644
--- a/lib/lp/buildmaster/manager.py
+++ b/lib/lp/buildmaster/manager.py
@@ -11,7 +11,6 @@ __all__ = [
 ]
 
 import datetime
-import functools
 import logging
 import os.path
 import shutil
@@ -502,6 +501,8 @@ class WorkerScanner:
         self.date_cancel = None
         self.date_scanned = None
 
+        self.can_retry = True
+
         # We cache the build cookie, keyed on the BuildQueue, to avoid
         # hitting the DB on every scan.
         self._cached_build_cookie = None
@@ -520,6 +521,7 @@ class WorkerScanner:
         """Terminate the LoopingCall."""
         self.loop.stop()
 
+    @defer.inlineCallbacks
     def singleCycle(self):
         # Inhibit scanning if the BuilderFactory hasn't updated since
         # the last run. This doesn't matter for the base BuilderFactory,
@@ -533,22 +535,19 @@ class WorkerScanner:
             self.logger.debug(
                 "Skipping builder %s (cache out of date)" % self.builder_name
             )
-            return defer.succeed(None)
+            return
 
         self.logger.debug("Scanning builder %s" % self.builder_name)
-        # Errors should normally be able to be retried a few times. Bits
-        # of scan() which don't want retries will call _scanFailed
-        # directly.
-        d = self.scan()
-        d.addErrback(functools.partial(self._scanFailed, True))
-        d.addBoth(self._updateDateScanned)
-        return d
 
-    def _updateDateScanned(self, ignored):
+        try:
+            yield self.scan()
+        except Exception as e:
+            self._scanFailed(self.can_retry, e)
+
         self.logger.debug("Scan finished for builder %s" % self.builder_name)
         self.date_scanned = datetime.datetime.utcnow()
 
-    def _scanFailed(self, retry, failure):
+    def _scanFailed(self, retry, exc):
         """Deal with failures encountered during the scan cycle.
 
         1. Print the error in the log
@@ -562,26 +561,22 @@ class WorkerScanner:
 
         # If we don't recognise the exception include a stack trace with
         # the error.
-        error_message = failure.getErrorMessage()
-        if failure.check(
-            BuildWorkerFailure,
-            CannotBuild,
-            CannotResumeHost,
-            BuildDaemonError,
-            CannotFetchFile,
+        if isinstance(
+            exc,
+            (
+                BuildWorkerFailure,
+                CannotBuild,
+                CannotResumeHost,
+                BuildDaemonError,
+                CannotFetchFile,
+            ),
         ):
             self.logger.info(
-                "Scanning %s failed with: %s"
-                % (self.builder_name, error_message)
+                "Scanning %s failed with: %r" % (self.builder_name, exc)
             )
         else:
             self.logger.info(
-                "Scanning %s failed with: %s\n%s"
-                % (
-                    self.builder_name,
-                    failure.getErrorMessage(),
-                    failure.getTraceback(),
-                )
+                "Scanning %s failed" % self.builder_name, exc_info=exc
             )
 
         # Decide if we need to terminate the job or reset/fail the builder.
@@ -602,7 +597,7 @@ class WorkerScanner:
             else:
                 labels["build"] = False
             self.statsd_client.incr("builders.judged_failed", labels=labels)
-            recover_failure(self.logger, vitals, builder, retry, failure.value)
+            recover_failure(self.logger, vitals, builder, retry, exc)
             transaction.commit()
         except Exception:
             # Catastrophic code failure! Not much we can do.
@@ -687,6 +682,7 @@ class WorkerScanner:
         vitals = self.builder_factory.getVitals(self.builder_name)
         interactor = self.interactor_factory()
         worker = self.worker_factory(vitals)
+        self.can_retry = True
 
         if vitals.build_queue is not None:
             if vitals.clean_status != BuilderCleanStatus.DIRTY:
@@ -759,15 +755,14 @@ class WorkerScanner:
                     "%s is in manual mode, not dispatching.", vitals.name
                 )
                 return
-            # Try to find and dispatch a job. If it fails, don't
-            # attempt to just retry the scan; we need to reset
-            # the job so the dispatch will be reattempted.
+            # Try to find and dispatch a job. If it fails, don't attempt to
+            # just retry the scan; we need to reset the job so the dispatch
+            # will be reattempted.
             builder = self.builder_factory[self.builder_name]
-            d = interactor.findAndStartJob(
+            self.can_retry = False
+            yield interactor.findAndStartJob(
                 vitals, builder, worker, self.builder_factory
             )
-            d.addErrback(functools.partial(self._scanFailed, False))
-            yield d
             if builder.currentjob is not None:
                 # After a successful dispatch we can reset the
                 # failure_count.

Subscribers

People subscribed via source and target branches

to status/vote changes: