Jonathan Lange wrote:
> On Mon, Mar 15, 2010 at 10:22 PM, Michael Hudson
> <email address hidden> wrote:
>> Michael Hudson has proposed merging lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad.
>>
>> Requested reviews:
>> Canonical Launchpad Engineering (launchpad)
>> Related bugs:
>> #413637 imports disabled too rapidly
>> https://bugs.launchpad.net/bugs/413637
>>
>>
>> Hi there,
>>
>> This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.
>>
>
> I think the branch is bigger than you intended. It's got a bunch of
> build farm job stuff in the diff.
>> Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me.
>>
>
> Sounds fine to me.
>
> ...
>> === modified file 'lib/lp/code/model/codeimportjob.py'
>> --- lib/lp/code/model/codeimportjob.py 2010-02-24 10:18:16 +0000
>> +++ lib/lp/code/model/codeimportjob.py 2010-03-15 22:22:20 +0000
>> @@ -10,6 +10,8 @@
>> 'CodeImportJobWorkflow',
>> ]
>>
>> +import datetime
>> +
>> from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
>>
>> from zope.component import getUtility
>> @@ -140,7 +142,7 @@
>>
>> implements(ICodeImportJobWorkflow)
>>
>> - def newJob(self, code_import, date_due=None):
>> + def newJob(self, code_import, interval=None):
>> """See `ICodeImportJobWorkflow`."""
>> assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (
>> "Review status of %s is not REVIEWED: %s" % (
>> @@ -149,23 +151,22 @@
>> "Already associated to a CodeImportJob: %s" % (
>> code_import.branch.unique_name))
>>
>> + if interval is None:
>> + interval = code_import.effective_update_interval
>> +
>> job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)
>>
>> - if date_due is None:
>> - # Find the most recent CodeImportResult for this CodeImport. We
>> - # sort by date_created because we do not have an index on
>> - # date_job_started in the database, and that should give the same
>> - # sort order.
>> - most_recent_result_list = list(CodeImportResult.selectBy(
>> - code_import=code_import).orderBy(['-date_created']).limit(1))
>> + # Find the most recent CodeImportResult for this CodeImport. We
>> + # sort by date_created because we do not have an index on
>> + # date_job_started in the database, and that should give the same
>> + # sort order.
>> + most_recent_result_list = list(CodeImportResult.selectBy(
>> + code_import=code_import).orderBy(['-date_created']).limit(1))
>>
>
> Why aren't you using Storm, with its 'one()' notation for this?
Because all I did to this code was dedent it. I can change it to storm
if you like...
>> @@ -281,15 +282,19 @@
>> # If the import has failed too many times in a row, mark it as
>> # FAILING.
>> failure_limit = config.codeimport.consecutive_failure_limit
>> - if code_import.consecutive_failure_count >= failure_limit:
>> + failure_count = code_import.consecutive_failure_count
>> + if failure_count >= failure_limit:
>> code_import.updateFromData(
>> dict(review_status=CodeImportReviewStatus.FAILING), None)
>> + elif status == CodeImportResultStatus.SUCCESS_PARTIAL:
>> + interval = datetime.timedelta(0)
>> + elif failure_count > 0:
>> + interval = code_import.effective_update_interval * (2**(failure_count - 1))
>
> PEP 8 says you need more spaces here, I think.
Yeah, and the line was too long. Fixed.
>> + else:
>> + interval = code_import.effective_update_interval
>> # Only start a new one if the import is still in the REVIEWED state.
>> if code_import.review_status == CodeImportReviewStatus.REVIEWED:
>> - extra = {}
>> - if status == CodeImportResultStatus.SUCCESS_PARTIAL:
>> - extra['date_due'] = UTC_NOW
>> - self.newJob(code_import, **extra)
>> + self.newJob(code_import, interval=interval)
>> # If the status was successful, update date_last_successful.
>> if status in [CodeImportResultStatus.SUCCESS,
>> CodeImportResultStatus.SUCCESS_NOCHANGE]:
>> @@ -321,7 +326,7 @@
>> import_job, CodeImportResultStatus.RECLAIMED, None)
>> # 3)
>> if code_import.review_status == CodeImportReviewStatus.REVIEWED:
>> - self.newJob(code_import, UTC_NOW)
>> + self.newJob(code_import, datetime.timedelta(0))
>
> It's a shame that datetime.timedelta(0) isn't a constant.
Yes.
>> # 4)
>> getUtility(ICodeImportEventSet).newReclaim(
>> code_import, machine, job_id)
>>
>
>> === modified file 'lib/lp/code/model/tests/test_codeimportjob.py'
>> --- lib/lp/code/model/tests/test_codeimportjob.py 2010-03-14 20:15:34 +0000
>> +++ lib/lp/code/model/tests/test_codeimportjob.py 2010-03-15 22:22:20 +0000
>> @@ -778,6 +778,31 @@
>> self.assertEqual(new_job.machine, None)
>> self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)
>>
>> + def test_failures_back_off(self):
>> + # We wait for longer and longer between retrying failing imports, to
>> + # make it less likely that an import is marked failing just because
>> + # someone's DNS went down for a day.
>> + running_job = self.makeRunningJob()
>> + intervals = []
>> + interval = running_job.code_import.effective_update_interval
>> + expected_intervals = []
>> + for i in range(config.codeimport.consecutive_failure_limit - 1):
>> + expected_intervals.append(interval)
>> + interval *= 2
>> + # Fail an import a bunch of times and record how far in the future the
>> + # next job was scheduled.
>> + for i in range(config.codeimport.consecutive_failure_limit - 1):
>> + code_import = running_job.code_import
>> + getUtility(ICodeImportJobWorkflow).finishJob(
>> + running_job, CodeImportResultStatus.FAILURE, None)
>> + intervals.append(
>> + code_import.import_job.date_due -
>> + code_import.results[-1].date_job_started)
>> + running_job = code_import.import_job
>> + getUtility(ICodeImportJobWorkflow).startJob(
>> + running_job, self.machine)
>> + self.assertEqual(expected_intervals, intervals)
>> +
>
> The test is a bit big & slightly hard to follow, but I'm not sure what
> could be done about it. Perhaps one test that checks one failure,
> another that checks two failures. I don't know -- you don't have to
> change it.
OK, I won't. I agree that it's a bit complicated, lots of them in this
file are (in their defense, they are testing fairly complicated things).
> Thanks though, great to see this change in code.
Thanks! Did you avoid voting because of the wrong target issue?
Jonathan Lange wrote: /bugs.launchpad .net/bugs/ 413637
> On Mon, Mar 15, 2010 at 10:22 PM, Michael Hudson
> <email address hidden> wrote:
>> Michael Hudson has proposed merging lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad.
>>
>> Requested reviews:
>> Canonical Launchpad Engineering (launchpad)
>> Related bugs:
>> #413637 imports disabled too rapidly
>> https:/
>>
>>
>> Hi there,
>>
>> This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.
>>
>
> I think the branch is bigger than you intended. It's got a bunch of
> build farm job stuff in the diff.
Grar grar, it's proposed into db-devel of course.
I proposed again: /code.edge. launchpad. net/~mwhudson/ launchpad/ back-off- failing- imports- bug-413637/ +merge/ 21411
https:/
>> Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me. code/model/ codeimportjob. py' code/model/ codeimportjob. py 2010-02-24 10:18:16 +0000 code/model/ codeimportjob. py 2010-03-15 22:22:20 +0000 orkflow' , ICodeImportJobW orkflow) Workflow` .""" review_ status == CodeImportRevie wStatus. REVIEWED, ( branch. unique_ name)) effective_ update_ interval code_import= code_import, date_due=UTC_NOW) result_ list = list(CodeImport Result. selectBy( code_import) .orderBy( ['-date_ created' ]).limit( 1)) result_ list = list(CodeImport Result. selectBy( code_import) .orderBy( ['-date_ created' ]).limit( 1))
>>
>
> Sounds fine to me.
>
> ...
>> === modified file 'lib/lp/
>> --- lib/lp/
>> +++ lib/lp/
>> @@ -10,6 +10,8 @@
>> 'CodeImportJobW
>> ]
>>
>> +import datetime
>> +
>> from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
>>
>> from zope.component import getUtility
>> @@ -140,7 +142,7 @@
>>
>> implements(
>>
>> - def newJob(self, code_import, date_due=None):
>> + def newJob(self, code_import, interval=None):
>> """See `ICodeImportJob
>> assert code_import.
>> "Review status of %s is not REVIEWED: %s" % (
>> @@ -149,23 +151,22 @@
>> "Already associated to a CodeImportJob: %s" % (
>> code_import.
>>
>> + if interval is None:
>> + interval = code_import.
>> +
>> job = CodeImportJob(
>>
>> - if date_due is None:
>> - # Find the most recent CodeImportResult for this CodeImport. We
>> - # sort by date_created because we do not have an index on
>> - # date_job_started in the database, and that should give the same
>> - # sort order.
>> - most_recent_
>> - code_import=
>> + # Find the most recent CodeImportResult for this CodeImport. We
>> + # sort by date_created because we do not have an index on
>> + # date_job_started in the database, and that should give the same
>> + # sort order.
>> + most_recent_
>> + code_import=
>>
>
> Why aren't you using Storm, with its 'one()' notation for this?
Because all I did to this code was dedent it. I can change it to storm
if you like...
>> @@ -281,15 +282,19 @@ codeimport. consecutive_ failure_ limit consecutive_ failure_ count >= failure_limit: consecutive_ failure_ count updateFromData( status= CodeImportRevie wStatus. FAILING) , None) tStatus. SUCCESS_ PARTIAL: timedelta( 0) effective_ update_ interval * (2**(failure_count - 1))
>> # If the import has failed too many times in a row, mark it as
>> # FAILING.
>> failure_limit = config.
>> - if code_import.
>> + failure_count = code_import.
>> + if failure_count >= failure_limit:
>> code_import.
>> dict(review_
>> + elif status == CodeImportResul
>> + interval = datetime.
>> + elif failure_count > 0:
>> + interval = code_import.
>
> PEP 8 says you need more spaces here, I think.
Yeah, and the line was too long. Fixed.
>> + else: effective_ update_ interval review_ status == CodeImportRevie wStatus. REVIEWED: tStatus. SUCCESS_ PARTIAL: code_import, **extra) code_import, interval=interval) successful. ltStatus. SUCCESS, tStatus. SUCCESS_ NOCHANGE] : tStatus. RECLAIMED, None) review_ status == CodeImportRevie wStatus. REVIEWED: code_import, UTC_NOW) code_import, datetime. timedelta( 0)) timedelta( 0) isn't a constant.
>> + interval = code_import.
>> # Only start a new one if the import is still in the REVIEWED state.
>> if code_import.
>> - extra = {}
>> - if status == CodeImportResul
>> - extra['date_due'] = UTC_NOW
>> - self.newJob(
>> + self.newJob(
>> # If the status was successful, update date_last_
>> if status in [CodeImportResu
>> CodeImportResul
>> @@ -321,7 +326,7 @@
>> import_job, CodeImportResul
>> # 3)
>> if code_import.
>> - self.newJob(
>> + self.newJob(
>
> It's a shame that datetime.
Yes.
>> # 4) ICodeImportEven tSet).newReclai m( code/model/ tests/test_ codeimportjob. py' code/model/ tests/test_ codeimportjob. py 2010-03-14 20:15:34 +0000 code/model/ tests/test_ codeimportjob. py 2010-03-15 22:22:20 +0000 l(new_job. machine, None) ttributeEqualsD ate(new_ job, 'date_due', UTC_NOW) back_off( self): gJob() job.code_ import. effective_ update_ interval codeimport. consecutive_ failure_ limit - 1): intervals. append( interval) codeimport. consecutive_ failure_ limit - 1): job.code_ import ICodeImportJobW orkflow) .finishJob( tStatus. FAILURE, None) import_ job.date_ due - results[ -1].date_ job_started) import_ job ICodeImportJobW orkflow) .startJob( l(expected_ intervals, intervals)
>> getUtility(
>> code_import, machine, job_id)
>>
>
>> === modified file 'lib/lp/
>> --- lib/lp/
>> +++ lib/lp/
>> @@ -778,6 +778,31 @@
>> self.assertEqua
>> self.assertSqlA
>>
>> + def test_failures_
>> + # We wait for longer and longer between retrying failing imports, to
>> + # make it less likely that an import is marked failing just because
>> + # someone's DNS went down for a day.
>> + running_job = self.makeRunnin
>> + intervals = []
>> + interval = running_
>> + expected_intervals = []
>> + for i in range(config.
>> + expected_
>> + interval *= 2
>> + # Fail an import a bunch of times and record how far in the future the
>> + # next job was scheduled.
>> + for i in range(config.
>> + code_import = running_
>> + getUtility(
>> + running_job, CodeImportResul
>> + intervals.append(
>> + code_import.
>> + code_import.
>> + running_job = code_import.
>> + getUtility(
>> + running_job, self.machine)
>> + self.assertEqua
>> +
>
> The test is a bit big & slightly hard to follow, but I'm not sure what
> could be done about it. Perhaps one test that checks one failure,
> another that checks two failures. I don't know -- you don't have to
> change it.
OK, I won't. I agree that it's a bit complicated, lots of them in this
file are (in their defense, they are testing fairly complicated things).
> Thanks though, great to see this change in code.
Thanks! Did you avoid voting because of the wrong target issue?
Cheers,
mwh