Merge lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad

Proposed by Michael Hudson-Doyle
Status: Merged
Approved by: Michael Hudson-Doyle
Approved revision: no longer in the source branch.
Merged at revision: not available
Proposed branch: lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637
Merge into: lp:launchpad
Diff against target: 176 lines (+64/-24)
3 files modified
lib/lp/code/interfaces/codeimportjob.py (+12/-3)
lib/lp/code/model/codeimportjob.py (+27/-21)
lib/lp/code/model/tests/test_codeimportjob.py (+25/-0)
To merge this branch: bzr merge lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637
Reviewer Review Type Date Requested Status
Tim Penhey (community) Approve
Review via email: mp+21411@code.launchpad.net

Commit message

Exponentially back off trying failing code imports before marking them failed.

Description of the change

This is a re-propose of https://code.edge.launchpad.net/~mwhudson/launchpad/back-off-failing-imports-bug-413637/+merge/21408

Hi there,

This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.

Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me.

Cheers,
mwh

To post a comment you must log in.
Revision history for this message
Tim Penhey (thumper) wrote :

> def newJob(code_import, interval=datetime.timedelta(0)):

Shouldn't we default the interval to None? Having objects as default args just makes me feel funny.
Actually you do use interval=None in the model class. Can you update the interface to match please?

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'lib/lp/code/interfaces/codeimportjob.py'
--- lib/lp/code/interfaces/codeimportjob.py 2010-02-22 08:10:03 +0000
+++ lib/lp/code/interfaces/codeimportjob.py 2010-03-16 01:37:25 +0000
@@ -16,6 +16,8 @@
16 'ICodeImportJobWorkflow',16 'ICodeImportJobWorkflow',
17 ]17 ]
1818
19import datetime
20
19from zope.interface import Interface21from zope.interface import Interface
20from zope.schema import Choice, Datetime, Int, Object, Text22from zope.schema import Choice, Datetime, Int, Object, Text
2123
@@ -137,13 +139,18 @@
137class ICodeImportJobWorkflow(Interface):139class ICodeImportJobWorkflow(Interface):
138 """Utility to manage `CodeImportJob` objects through their life cycle."""140 """Utility to manage `CodeImportJob` objects through their life cycle."""
139141
140 def newJob(code_import):142 def newJob(code_import, interval=None):
141 """Create a `CodeImportJob` associated with a reviewed `CodeImport`.143 """Create a `CodeImportJob` associated with a reviewed `CodeImport`.
142144
143 Call this method from `CodeImport.updateFromData` when the145 Call this method from `CodeImport.updateFromData` when the
144 review_status of `code_import` changes to REVIEWED.146 review_status of `code_import` changes to REVIEWED.
145147
146 :param code_import: `CodeImport` object.148 :param code_import: `CodeImport` object.
149 :param interval: Schedule the job this far ahead of the start of the
150 last update of this import. Defaults to
151 ``code_import.effective_update_interval``. This parameter is
152 ignored and the job scheduled for right now if this is the first
153 run of the import.
147 :precondition: `code_import` has REVIEWED review_status.154 :precondition: `code_import` has REVIEWED review_status.
148 :precondition: `code_import` has no associated `CodeImportJob`.155 :precondition: `code_import` has no associated `CodeImportJob`.
149 :return: A new `CodeImportJob` object associated to `code_import`.156 :return: A new `CodeImportJob` object associated to `code_import`.
@@ -218,9 +225,11 @@
218 display for diagnostics. May be None.225 display for diagnostics. May be None.
219 :precondition: `import_job`.state == RUNNING.226 :precondition: `import_job`.state == RUNNING.
220 :postcondition: `import_job` is deleted.227 :postcondition: `import_job` is deleted.
221 :postcondition: `code_import.import_job` is not None.228 :postcondition: `code_import.import_job` is not None unless the job
229 has failed more than consecutive_failure_limit times in a row.
222 :postcondition: `code_import.import_job.date_due` is230 :postcondition: `code_import.import_job.date_due` is
223 import_job.date_due + code_import.effective_update_interval`.231 import_job.date_due + code_import.effective_update_interval`, with
232 scaling to retry failing imports less often.
224 :postcondition: A `CodeImportResult` was created.233 :postcondition: A `CodeImportResult` was created.
225 :postcondition: A FINISH `CodeImportEvent` was created.234 :postcondition: A FINISH `CodeImportEvent` was created.
226 """235 """
227236
=== modified file 'lib/lp/code/model/codeimportjob.py'
--- lib/lp/code/model/codeimportjob.py 2010-02-24 10:18:16 +0000
+++ lib/lp/code/model/codeimportjob.py 2010-03-16 01:37:25 +0000
@@ -10,6 +10,8 @@
10 'CodeImportJobWorkflow',10 'CodeImportJobWorkflow',
11 ]11 ]
1212
13import datetime
14
13from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol15from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
1416
15from zope.component import getUtility17from zope.component import getUtility
@@ -140,7 +142,7 @@
140142
141 implements(ICodeImportJobWorkflow)143 implements(ICodeImportJobWorkflow)
142144
143 def newJob(self, code_import, date_due=None):145 def newJob(self, code_import, interval=None):
144 """See `ICodeImportJobWorkflow`."""146 """See `ICodeImportJobWorkflow`."""
145 assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (147 assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (
146 "Review status of %s is not REVIEWED: %s" % (148 "Review status of %s is not REVIEWED: %s" % (
@@ -149,23 +151,22 @@
149 "Already associated to a CodeImportJob: %s" % (151 "Already associated to a CodeImportJob: %s" % (
150 code_import.branch.unique_name))152 code_import.branch.unique_name))
151153
154 if interval is None:
155 interval = code_import.effective_update_interval
156
152 job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)157 job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)
153158
154 if date_due is None:159 # Find the most recent CodeImportResult for this CodeImport. We
155 # Find the most recent CodeImportResult for this CodeImport. We160 # sort by date_created because we do not have an index on
156 # sort by date_created because we do not have an index on161 # date_job_started in the database, and that should give the same
157 # date_job_started in the database, and that should give the same162 # sort order.
158 # sort order.163 most_recent_result_list = list(CodeImportResult.selectBy(
159 most_recent_result_list = list(CodeImportResult.selectBy(164 code_import=code_import).orderBy(['-date_created']).limit(1))
160 code_import=code_import).orderBy(['-date_created']).limit(1))
161165
162 if len(most_recent_result_list) != 0:166 if len(most_recent_result_list) != 0:
163 [most_recent_result] = most_recent_result_list167 [most_recent_result] = most_recent_result_list
164 interval = code_import.effective_update_interval168 date_due = most_recent_result.date_job_started + interval
165 date_due = most_recent_result.date_job_started + interval169 job.date_due = max(job.date_due, date_due)
166 job.date_due = max(job.date_due, date_due)
167 else:
168 job.date_due = date_due
169170
170 return job171 return job
171172
@@ -281,15 +282,20 @@
281 # If the import has failed too many times in a row, mark it as282 # If the import has failed too many times in a row, mark it as
282 # FAILING.283 # FAILING.
283 failure_limit = config.codeimport.consecutive_failure_limit284 failure_limit = config.codeimport.consecutive_failure_limit
284 if code_import.consecutive_failure_count >= failure_limit:285 failure_count = code_import.consecutive_failure_count
286 if failure_count >= failure_limit:
285 code_import.updateFromData(287 code_import.updateFromData(
286 dict(review_status=CodeImportReviewStatus.FAILING), None)288 dict(review_status=CodeImportReviewStatus.FAILING), None)
289 elif status == CodeImportResultStatus.SUCCESS_PARTIAL:
290 interval = datetime.timedelta(0)
291 elif failure_count > 0:
292 interval = (code_import.effective_update_interval *
293 (2 ** (failure_count - 1)))
294 else:
295 interval = code_import.effective_update_interval
287 # Only start a new one if the import is still in the REVIEWED state.296 # Only start a new one if the import is still in the REVIEWED state.
288 if code_import.review_status == CodeImportReviewStatus.REVIEWED:297 if code_import.review_status == CodeImportReviewStatus.REVIEWED:
289 extra = {}298 self.newJob(code_import, interval=interval)
290 if status == CodeImportResultStatus.SUCCESS_PARTIAL:
291 extra['date_due'] = UTC_NOW
292 self.newJob(code_import, **extra)
293 # If the status was successful, update date_last_successful.299 # If the status was successful, update date_last_successful.
294 if status in [CodeImportResultStatus.SUCCESS,300 if status in [CodeImportResultStatus.SUCCESS,
295 CodeImportResultStatus.SUCCESS_NOCHANGE]:301 CodeImportResultStatus.SUCCESS_NOCHANGE]:
@@ -321,7 +327,7 @@
321 import_job, CodeImportResultStatus.RECLAIMED, None)327 import_job, CodeImportResultStatus.RECLAIMED, None)
322 # 3)328 # 3)
323 if code_import.review_status == CodeImportReviewStatus.REVIEWED:329 if code_import.review_status == CodeImportReviewStatus.REVIEWED:
324 self.newJob(code_import, UTC_NOW)330 self.newJob(code_import, datetime.timedelta(0))
325 # 4)331 # 4)
326 getUtility(ICodeImportEventSet).newReclaim(332 getUtility(ICodeImportEventSet).newReclaim(
327 code_import, machine, job_id)333 code_import, machine, job_id)
328334
=== modified file 'lib/lp/code/model/tests/test_codeimportjob.py'
--- lib/lp/code/model/tests/test_codeimportjob.py 2010-03-14 20:15:34 +0000
+++ lib/lp/code/model/tests/test_codeimportjob.py 2010-03-16 01:37:25 +0000
@@ -778,6 +778,31 @@
778 self.assertEqual(new_job.machine, None)778 self.assertEqual(new_job.machine, None)
779 self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)779 self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)
780780
781 def test_failures_back_off(self):
782 # We wait for longer and longer between retrying failing imports, to
783 # make it less likely that an import is marked failing just because
784 # someone's DNS went down for a day.
785 running_job = self.makeRunningJob()
786 intervals = []
787 interval = running_job.code_import.effective_update_interval
788 expected_intervals = []
789 for i in range(config.codeimport.consecutive_failure_limit - 1):
790 expected_intervals.append(interval)
791 interval *= 2
792 # Fail an import a bunch of times and record how far in the future the
793 # next job was scheduled.
794 for i in range(config.codeimport.consecutive_failure_limit - 1):
795 code_import = running_job.code_import
796 getUtility(ICodeImportJobWorkflow).finishJob(
797 running_job, CodeImportResultStatus.FAILURE, None)
798 intervals.append(
799 code_import.import_job.date_due -
800 code_import.results[-1].date_job_started)
801 running_job = code_import.import_job
802 getUtility(ICodeImportJobWorkflow).startJob(
803 running_job, self.machine)
804 self.assertEqual(expected_intervals, intervals)
805
781 def test_doesntCreateNewJobIfCodeImportNotReviewed(self):806 def test_doesntCreateNewJobIfCodeImportNotReviewed(self):
782 # finishJob() creates a new CodeImportJob for the given CodeImport,807 # finishJob() creates a new CodeImportJob for the given CodeImport,
783 # unless the CodeImport has been suspended or marked invalid.808 # unless the CodeImport has been suspended or marked invalid.