Merge lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad

Proposed by Michael Hudson-Doyle
Status: Merged
Approved by: Michael Hudson-Doyle
Approved revision: no longer in the source branch.
Merged at revision: not available
Proposed branch: lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637
Merge into: lp:launchpad
Diff against target: 176 lines (+64/-24)
3 files modified
lib/lp/code/interfaces/codeimportjob.py (+12/-3)
lib/lp/code/model/codeimportjob.py (+27/-21)
lib/lp/code/model/tests/test_codeimportjob.py (+25/-0)
To merge this branch: bzr merge lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637
Reviewer Review Type Date Requested Status
Tim Penhey (community) Approve
Review via email: mp+21411@code.launchpad.net

Commit message

Exponentially back off trying failing code imports before marking them failed.

Description of the change

This is a re-propose of https://code.edge.launchpad.net/~mwhudson/launchpad/back-off-failing-imports-bug-413637/+merge/21408

Hi there,

This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.

Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me.

Cheers,
mwh

To post a comment you must log in.
Revision history for this message
Tim Penhey (thumper) wrote :

> def newJob(code_import, interval=datetime.timedelta(0)):

Shouldn't we default the interval to None? Having objects as default args just makes me feel funny.
Actually you do use interval=None in the model class. Can you update the interface to match please?

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/code/interfaces/codeimportjob.py'
2--- lib/lp/code/interfaces/codeimportjob.py 2010-02-22 08:10:03 +0000
3+++ lib/lp/code/interfaces/codeimportjob.py 2010-03-16 01:37:25 +0000
4@@ -16,6 +16,8 @@
5 'ICodeImportJobWorkflow',
6 ]
7
8+import datetime
9+
10 from zope.interface import Interface
11 from zope.schema import Choice, Datetime, Int, Object, Text
12
13@@ -137,13 +139,18 @@
14 class ICodeImportJobWorkflow(Interface):
15 """Utility to manage `CodeImportJob` objects through their life cycle."""
16
17- def newJob(code_import):
18+ def newJob(code_import, interval=None):
19 """Create a `CodeImportJob` associated with a reviewed `CodeImport`.
20
21 Call this method from `CodeImport.updateFromData` when the
22 review_status of `code_import` changes to REVIEWED.
23
24 :param code_import: `CodeImport` object.
25+ :param interval: Schedule the job this far ahead of the start of the
26+ last update of this import. Defaults to
27+ ``code_import.effective_update_interval``. This parameter is
28+ ignored and the job scheduled for right now if this is the first
29+ run of the import.
30 :precondition: `code_import` has REVIEWED review_status.
31 :precondition: `code_import` has no associated `CodeImportJob`.
32 :return: A new `CodeImportJob` object associated to `code_import`.
33@@ -218,9 +225,11 @@
34 display for diagnostics. May be None.
35 :precondition: `import_job`.state == RUNNING.
36 :postcondition: `import_job` is deleted.
37- :postcondition: `code_import.import_job` is not None.
38+ :postcondition: `code_import.import_job` is not None unless the job
39+ has failed more than consecutive_failure_limit times in a row.
40 :postcondition: `code_import.import_job.date_due` is
41- import_job.date_due + code_import.effective_update_interval`.
42+ import_job.date_due + code_import.effective_update_interval`, with
43+ scaling to retry failing imports less often.
44 :postcondition: A `CodeImportResult` was created.
45 :postcondition: A FINISH `CodeImportEvent` was created.
46 """
47
48=== modified file 'lib/lp/code/model/codeimportjob.py'
49--- lib/lp/code/model/codeimportjob.py 2010-02-24 10:18:16 +0000
50+++ lib/lp/code/model/codeimportjob.py 2010-03-16 01:37:25 +0000
51@@ -10,6 +10,8 @@
52 'CodeImportJobWorkflow',
53 ]
54
55+import datetime
56+
57 from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
58
59 from zope.component import getUtility
60@@ -140,7 +142,7 @@
61
62 implements(ICodeImportJobWorkflow)
63
64- def newJob(self, code_import, date_due=None):
65+ def newJob(self, code_import, interval=None):
66 """See `ICodeImportJobWorkflow`."""
67 assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (
68 "Review status of %s is not REVIEWED: %s" % (
69@@ -149,23 +151,22 @@
70 "Already associated to a CodeImportJob: %s" % (
71 code_import.branch.unique_name))
72
73+ if interval is None:
74+ interval = code_import.effective_update_interval
75+
76 job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)
77
78- if date_due is None:
79- # Find the most recent CodeImportResult for this CodeImport. We
80- # sort by date_created because we do not have an index on
81- # date_job_started in the database, and that should give the same
82- # sort order.
83- most_recent_result_list = list(CodeImportResult.selectBy(
84- code_import=code_import).orderBy(['-date_created']).limit(1))
85+ # Find the most recent CodeImportResult for this CodeImport. We
86+ # sort by date_created because we do not have an index on
87+ # date_job_started in the database, and that should give the same
88+ # sort order.
89+ most_recent_result_list = list(CodeImportResult.selectBy(
90+ code_import=code_import).orderBy(['-date_created']).limit(1))
91
92- if len(most_recent_result_list) != 0:
93- [most_recent_result] = most_recent_result_list
94- interval = code_import.effective_update_interval
95- date_due = most_recent_result.date_job_started + interval
96- job.date_due = max(job.date_due, date_due)
97- else:
98- job.date_due = date_due
99+ if len(most_recent_result_list) != 0:
100+ [most_recent_result] = most_recent_result_list
101+ date_due = most_recent_result.date_job_started + interval
102+ job.date_due = max(job.date_due, date_due)
103
104 return job
105
106@@ -281,15 +282,20 @@
107 # If the import has failed too many times in a row, mark it as
108 # FAILING.
109 failure_limit = config.codeimport.consecutive_failure_limit
110- if code_import.consecutive_failure_count >= failure_limit:
111+ failure_count = code_import.consecutive_failure_count
112+ if failure_count >= failure_limit:
113 code_import.updateFromData(
114 dict(review_status=CodeImportReviewStatus.FAILING), None)
115+ elif status == CodeImportResultStatus.SUCCESS_PARTIAL:
116+ interval = datetime.timedelta(0)
117+ elif failure_count > 0:
118+ interval = (code_import.effective_update_interval *
119+ (2 ** (failure_count - 1)))
120+ else:
121+ interval = code_import.effective_update_interval
122 # Only start a new one if the import is still in the REVIEWED state.
123 if code_import.review_status == CodeImportReviewStatus.REVIEWED:
124- extra = {}
125- if status == CodeImportResultStatus.SUCCESS_PARTIAL:
126- extra['date_due'] = UTC_NOW
127- self.newJob(code_import, **extra)
128+ self.newJob(code_import, interval=interval)
129 # If the status was successful, update date_last_successful.
130 if status in [CodeImportResultStatus.SUCCESS,
131 CodeImportResultStatus.SUCCESS_NOCHANGE]:
132@@ -321,7 +327,7 @@
133 import_job, CodeImportResultStatus.RECLAIMED, None)
134 # 3)
135 if code_import.review_status == CodeImportReviewStatus.REVIEWED:
136- self.newJob(code_import, UTC_NOW)
137+ self.newJob(code_import, datetime.timedelta(0))
138 # 4)
139 getUtility(ICodeImportEventSet).newReclaim(
140 code_import, machine, job_id)
141
142=== modified file 'lib/lp/code/model/tests/test_codeimportjob.py'
143--- lib/lp/code/model/tests/test_codeimportjob.py 2010-03-14 20:15:34 +0000
144+++ lib/lp/code/model/tests/test_codeimportjob.py 2010-03-16 01:37:25 +0000
145@@ -778,6 +778,31 @@
146 self.assertEqual(new_job.machine, None)
147 self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)
148
149+ def test_failures_back_off(self):
150+ # We wait for longer and longer between retrying failing imports, to
151+ # make it less likely that an import is marked failing just because
152+ # someone's DNS went down for a day.
153+ running_job = self.makeRunningJob()
154+ intervals = []
155+ interval = running_job.code_import.effective_update_interval
156+ expected_intervals = []
157+ for i in range(config.codeimport.consecutive_failure_limit - 1):
158+ expected_intervals.append(interval)
159+ interval *= 2
160+ # Fail an import a bunch of times and record how far in the future the
161+ # next job was scheduled.
162+ for i in range(config.codeimport.consecutive_failure_limit - 1):
163+ code_import = running_job.code_import
164+ getUtility(ICodeImportJobWorkflow).finishJob(
165+ running_job, CodeImportResultStatus.FAILURE, None)
166+ intervals.append(
167+ code_import.import_job.date_due -
168+ code_import.results[-1].date_job_started)
169+ running_job = code_import.import_job
170+ getUtility(ICodeImportJobWorkflow).startJob(
171+ running_job, self.machine)
172+ self.assertEqual(expected_intervals, intervals)
173+
174 def test_doesntCreateNewJobIfCodeImportNotReviewed(self):
175 # finishJob() creates a new CodeImportJob for the given CodeImport,
176 # unless the CodeImport has been suspended or marked invalid.