On Mon, Mar 15, 2010 at 10:22 PM, Michael Hudson
<email address hidden> wrote:
> Michael Hudson has proposed merging lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad.
>
> Requested reviews:
> Canonical Launchpad Engineering (launchpad)
> Related bugs:
> #413637 imports disabled too rapidly
> https://bugs.launchpad.net/bugs/413637
>
>
> Hi there,
>
> This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.
>
I think the branch is bigger than you intended. It's got a bunch of
build farm job stuff in the diff.
> Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me.
>
Sounds fine to me.
...
> === modified file 'lib/lp/code/model/codeimportjob.py'
> --- lib/lp/code/model/codeimportjob.py 2010-02-24 10:18:16 +0000
> +++ lib/lp/code/model/codeimportjob.py 2010-03-15 22:22:20 +0000
> @@ -10,6 +10,8 @@
> 'CodeImportJobWorkflow',
> ]
>
> +import datetime
> +
> from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
>
> from zope.component import getUtility
> @@ -140,7 +142,7 @@
>
> implements(ICodeImportJobWorkflow)
>
> - def newJob(self, code_import, date_due=None):
> + def newJob(self, code_import, interval=None):
> """See `ICodeImportJobWorkflow`."""
> assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (
> "Review status of %s is not REVIEWED: %s" % (
> @@ -149,23 +151,22 @@
> "Already associated to a CodeImportJob: %s" % (
> code_import.branch.unique_name))
>
> + if interval is None:
> + interval = code_import.effective_update_interval
> +
> job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)
>
> - if date_due is None:
> - # Find the most recent CodeImportResult for this CodeImport. We
> - # sort by date_created because we do not have an index on
> - # date_job_started in the database, and that should give the same
> - # sort order.
> - most_recent_result_list = list(CodeImportResult.selectBy(
> - code_import=code_import).orderBy(['-date_created']).limit(1))
> + # Find the most recent CodeImportResult for this CodeImport. We
> + # sort by date_created because we do not have an index on
> + # date_job_started in the database, and that should give the same
> + # sort order.
> + most_recent_result_list = list(CodeImportResult.selectBy(
> + code_import=code_import).orderBy(['-date_created']).limit(1))
>
Why aren't you using Storm, with its 'one()' notation for this?
> @@ -281,15 +282,19 @@
> # If the import has failed too many times in a row, mark it as
> # FAILING.
> failure_limit = config.codeimport.consecutive_failure_limit
> - if code_import.consecutive_failure_count >= failure_limit:
> + failure_count = code_import.consecutive_failure_count
> + if failure_count >= failure_limit:
> code_import.updateFromData(
> dict(review_status=CodeImportReviewStatus.FAILING), None)
> + elif status == CodeImportResultStatus.SUCCESS_PARTIAL:
> + interval = datetime.timedelta(0)
> + elif failure_count > 0:
> + interval = code_import.effective_update_interval * (2**(failure_count - 1))
PEP 8 says you need more spaces here, I think.
> + else:
> + interval = code_import.effective_update_interval
> # Only start a new one if the import is still in the REVIEWED state.
> if code_import.review_status == CodeImportReviewStatus.REVIEWED:
> - extra = {}
> - if status == CodeImportResultStatus.SUCCESS_PARTIAL:
> - extra['date_due'] = UTC_NOW
> - self.newJob(code_import, **extra)
> + self.newJob(code_import, interval=interval)
> # If the status was successful, update date_last_successful.
> if status in [CodeImportResultStatus.SUCCESS,
> CodeImportResultStatus.SUCCESS_NOCHANGE]:
> @@ -321,7 +326,7 @@
> import_job, CodeImportResultStatus.RECLAIMED, None)
> # 3)
> if code_import.review_status == CodeImportReviewStatus.REVIEWED:
> - self.newJob(code_import, UTC_NOW)
> + self.newJob(code_import, datetime.timedelta(0))
It's a shame that datetime.timedelta(0) isn't a constant.
> === modified file 'lib/lp/code/model/tests/test_codeimportjob.py'
> --- lib/lp/code/model/tests/test_codeimportjob.py 2010-03-14 20:15:34 +0000
> +++ lib/lp/code/model/tests/test_codeimportjob.py 2010-03-15 22:22:20 +0000
> @@ -778,6 +778,31 @@
> self.assertEqual(new_job.machine, None)
> self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)
>
> + def test_failures_back_off(self):
> + # We wait for longer and longer between retrying failing imports, to
> + # make it less likely that an import is marked failing just because
> + # someone's DNS went down for a day.
> + running_job = self.makeRunningJob()
> + intervals = []
> + interval = running_job.code_import.effective_update_interval
> + expected_intervals = []
> + for i in range(config.codeimport.consecutive_failure_limit - 1):
> + expected_intervals.append(interval)
> + interval *= 2
> + # Fail an import a bunch of times and record how far in the future the
> + # next job was scheduled.
> + for i in range(config.codeimport.consecutive_failure_limit - 1):
> + code_import = running_job.code_import
> + getUtility(ICodeImportJobWorkflow).finishJob(
> + running_job, CodeImportResultStatus.FAILURE, None)
> + intervals.append(
> + code_import.import_job.date_due -
> + code_import.results[-1].date_job_started)
> + running_job = code_import.import_job
> + getUtility(ICodeImportJobWorkflow).startJob(
> + running_job, self.machine)
> + self.assertEqual(expected_intervals, intervals)
> +
The test is a bit big & slightly hard to follow, but I'm not sure what
could be done about it. Perhaps one test that checks one failure,
another that checks two failures. I don't know -- you don't have to
change it.
Thanks though, great to see this change in code.
jml
On Mon, Mar 15, 2010 at 10:22 PM, Michael Hudson /bugs.launchpad .net/bugs/ 413637
<email address hidden> wrote:
> Michael Hudson has proposed merging lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad.
>
> Requested reviews:
> Canonical Launchpad Engineering (launchpad)
> Related bugs:
> #413637 imports disabled too rapidly
> https:/
>
>
> Hi there,
>
> This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.
>
I think the branch is bigger than you intended. It's got a bunch of
build farm job stuff in the diff.
> Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me.
>
Sounds fine to me.
... code/model/ codeimportjob. py' code/model/ codeimportjob. py 2010-02-24 10:18:16 +0000 code/model/ codeimportjob. py 2010-03-15 22:22:20 +0000 orkflow' , ICodeImportJobW orkflow) Workflow` .""" review_ status == CodeImportRevie wStatus. REVIEWED, ( branch. unique_ name)) effective_ update_ interval code_import= code_import, date_due=UTC_NOW) result_ list = list(CodeImport Result. selectBy( code_import) .orderBy( ['-date_ created' ]).limit( 1)) result_ list = list(CodeImport Result. selectBy( code_import) .orderBy( ['-date_ created' ]).limit( 1))
> === modified file 'lib/lp/
> --- lib/lp/
> +++ lib/lp/
> @@ -10,6 +10,8 @@
> 'CodeImportJobW
> ]
>
> +import datetime
> +
> from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
>
> from zope.component import getUtility
> @@ -140,7 +142,7 @@
>
> implements(
>
> - def newJob(self, code_import, date_due=None):
> + def newJob(self, code_import, interval=None):
> """See `ICodeImportJob
> assert code_import.
> "Review status of %s is not REVIEWED: %s" % (
> @@ -149,23 +151,22 @@
> "Already associated to a CodeImportJob: %s" % (
> code_import.
>
> + if interval is None:
> + interval = code_import.
> +
> job = CodeImportJob(
>
> - if date_due is None:
> - # Find the most recent CodeImportResult for this CodeImport. We
> - # sort by date_created because we do not have an index on
> - # date_job_started in the database, and that should give the same
> - # sort order.
> - most_recent_
> - code_import=
> + # Find the most recent CodeImportResult for this CodeImport. We
> + # sort by date_created because we do not have an index on
> + # date_job_started in the database, and that should give the same
> + # sort order.
> + most_recent_
> + code_import=
>
Why aren't you using Storm, with its 'one()' notation for this?
> @@ -281,15 +282,19 @@ codeimport. consecutive_ failure_ limit consecutive_ failure_ count >= failure_limit: consecutive_ failure_ count updateFromData( status= CodeImportRevie wStatus. FAILING) , None) tStatus. SUCCESS_ PARTIAL: timedelta( 0) effective_ update_ interval * (2**(failure_count - 1))
> # If the import has failed too many times in a row, mark it as
> # FAILING.
> failure_limit = config.
> - if code_import.
> + failure_count = code_import.
> + if failure_count >= failure_limit:
> code_import.
> dict(review_
> + elif status == CodeImportResul
> + interval = datetime.
> + elif failure_count > 0:
> + interval = code_import.
PEP 8 says you need more spaces here, I think.
> + else: effective_ update_ interval review_ status == CodeImportRevie wStatus. REVIEWED: tStatus. SUCCESS_ PARTIAL: code_import, **extra) code_import, interval=interval) successful. ltStatus. SUCCESS, tStatus. SUCCESS_ NOCHANGE] : tStatus. RECLAIMED, None) review_ status == CodeImportRevie wStatus. REVIEWED: code_import, UTC_NOW) code_import, datetime. timedelta( 0))
> + interval = code_import.
> # Only start a new one if the import is still in the REVIEWED state.
> if code_import.
> - extra = {}
> - if status == CodeImportResul
> - extra['date_due'] = UTC_NOW
> - self.newJob(
> + self.newJob(
> # If the status was successful, update date_last_
> if status in [CodeImportResu
> CodeImportResul
> @@ -321,7 +326,7 @@
> import_job, CodeImportResul
> # 3)
> if code_import.
> - self.newJob(
> + self.newJob(
It's a shame that datetime. timedelta( 0) isn't a constant.
> # 4) ICodeImportEven tSet).newReclai m(
> getUtility(
> code_import, machine, job_id)
>
> === modified file 'lib/lp/ code/model/ tests/test_ codeimportjob. py' code/model/ tests/test_ codeimportjob. py 2010-03-14 20:15:34 +0000 code/model/ tests/test_ codeimportjob. py 2010-03-15 22:22:20 +0000 l(new_job. machine, None) ttributeEqualsD ate(new_ job, 'date_due', UTC_NOW) back_off( self): gJob() job.code_ import. effective_ update_ interval codeimport. consecutive_ failure_ limit - 1): intervals. append( interval) codeimport. consecutive_ failure_ limit - 1): job.code_ import ICodeImportJobW orkflow) .finishJob( tStatus. FAILURE, None) import_ job.date_ due - results[ -1].date_ job_started) import_ job ICodeImportJobW orkflow) .startJob( l(expected_ intervals, intervals)
> --- lib/lp/
> +++ lib/lp/
> @@ -778,6 +778,31 @@
> self.assertEqua
> self.assertSqlA
>
> + def test_failures_
> + # We wait for longer and longer between retrying failing imports, to
> + # make it less likely that an import is marked failing just because
> + # someone's DNS went down for a day.
> + running_job = self.makeRunnin
> + intervals = []
> + interval = running_
> + expected_intervals = []
> + for i in range(config.
> + expected_
> + interval *= 2
> + # Fail an import a bunch of times and record how far in the future the
> + # next job was scheduled.
> + for i in range(config.
> + code_import = running_
> + getUtility(
> + running_job, CodeImportResul
> + intervals.append(
> + code_import.
> + code_import.
> + running_job = code_import.
> + getUtility(
> + running_job, self.machine)
> + self.assertEqua
> +
The test is a bit big & slightly hard to follow, but I'm not sure what
could be done about it. Perhaps one test that checks one failure,
another that checks two failures. I don't know -- you don't have to
change it.
Thanks though, great to see this change in code.
jml