Launchpad itself

Code review comment for lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637

back-off-failing-imports-bug-413637
Merge into db-devel

Revision history for this message

Jonathan Lange (jml) wrote on 2010-03-15:

On Mon, Mar 15, 2010 at 10:22 PM, Michael Hudson
<email address hidden> wrote:
> Michael Hudson has proposed merging lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad.
>
> Requested reviews:
> Canonical Launchpad Engineering (launchpad)
> Related bugs:
> #413637 imports disabled too rapidly
> https://bugs.launchpad.net/bugs/413637
>
>
> Hi there,
>
> This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.
>

I think the branch is bigger than you intended. It's got a bunch of
build farm job stuff in the diff.

> Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough? It seems OK, though perhaps a bit on the low side, to me.
>

Sounds fine to me.

...
> === modified file 'lib/lp/code/model/codeimportjob.py'
> --- lib/lp/code/model/codeimportjob.py 2010-02-24 10:18:16 +0000
> +++ lib/lp/code/model/codeimportjob.py 2010-03-15 22:22:20 +0000
> @@ -10,6 +10,8 @@
> 'CodeImportJobWorkflow',
> ]
>
> +import datetime
> +
> from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
>
> from zope.component import getUtility
> @@ -140,7 +142,7 @@
>
> implements(ICodeImportJobWorkflow)
>
> - def newJob(self, code_import, date_due=None):
> + def newJob(self, code_import, interval=None):
> """See `ICodeImportJobWorkflow`."""
> assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (
> "Review status of %s is not REVIEWED: %s" % (
> @@ -149,23 +151,22 @@
> "Already associated to a CodeImportJob: %s" % (
> code_import.branch.unique_name))
>
> + if interval is None:
> + interval = code_import.effective_update_interval
> +
> job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)
>
> - if date_due is None:
> - # Find the most recent CodeImportResult for this CodeImport. We
> - # sort by date_created because we do not have an index on
> - # date_job_started in the database, and that should give the same
> - # sort order.
> - most_recent_result_list = list(CodeImportResult.selectBy(
> - code_import=code_import).orderBy(['-date_created']).limit(1))
> + # Find the most recent CodeImportResult for this CodeImport. We
> + # sort by date_created because we do not have an index on
> + # date_job_started in the database, and that should give the same
> + # sort order.
> + most_recent_result_list = list(CodeImportResult.selectBy(
> + code_import=code_import).orderBy(['-date_created']).limit(1))
>

Why aren't you using Storm, with its 'one()' notation for this?

> @@ -281,15 +282,19 @@
> # If the import has failed too many times in a row, mark it as
> # FAILING.
> failure_limit = config.codeimport.consecutive_failure_limit
> - if code_import.consecutive_failure_count >= failure_limit:
> + failure_count = code_import.consecutive_failure_count
> + if failure_count >= failure_limit:
> code_import.updateFromData(
> dict(review_status=CodeImportReviewStatus.FAILING), None)
> + elif status == CodeImportResultStatus.SUCCESS_PARTIAL:
> + interval = datetime.timedelta(0)
> + elif failure_count > 0:
> + interval = code_import.effective_update_interval * (2**(failure_count - 1))

PEP 8 says you need more spaces here, I think.

> + else:
> + interval = code_import.effective_update_interval
> # Only start a new one if the import is still in the REVIEWED state.
> if code_import.review_status == CodeImportReviewStatus.REVIEWED:
> - extra = {}
> - if status == CodeImportResultStatus.SUCCESS_PARTIAL:
> - extra['date_due'] = UTC_NOW
> - self.newJob(code_import, **extra)
> + self.newJob(code_import, interval=interval)
> # If the status was successful, update date_last_successful.
> if status in [CodeImportResultStatus.SUCCESS,
> CodeImportResultStatus.SUCCESS_NOCHANGE]:
> @@ -321,7 +326,7 @@
> import_job, CodeImportResultStatus.RECLAIMED, None)
> # 3)
> if code_import.review_status == CodeImportReviewStatus.REVIEWED:
> - self.newJob(code_import, UTC_NOW)
> + self.newJob(code_import, datetime.timedelta(0))

It's a shame that datetime.timedelta(0) isn't a constant.

> # 4)
> getUtility(ICodeImportEventSet).newReclaim(
> code_import, machine, job_id)
>

> === modified file 'lib/lp/code/model/tests/test_codeimportjob.py'
> --- lib/lp/code/model/tests/test_codeimportjob.py 2010-03-14 20:15:34 +0000
> +++ lib/lp/code/model/tests/test_codeimportjob.py 2010-03-15 22:22:20 +0000
> @@ -778,6 +778,31 @@
> self.assertEqual(new_job.machine, None)
> self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)
>
> + def test_failures_back_off(self):
> + # We wait for longer and longer between retrying failing imports, to
> + # make it less likely that an import is marked failing just because
> + # someone's DNS went down for a day.
> + running_job = self.makeRunningJob()
> + intervals = []
> + interval = running_job.code_import.effective_update_interval
> + expected_intervals = []
> + for i in range(config.codeimport.consecutive_failure_limit - 1):
> + expected_intervals.append(interval)
> + interval *= 2
> + # Fail an import a bunch of times and record how far in the future the
> + # next job was scheduled.
> + for i in range(config.codeimport.consecutive_failure_limit - 1):
> + code_import = running_job.code_import
> + getUtility(ICodeImportJobWorkflow).finishJob(
> + running_job, CodeImportResultStatus.FAILURE, None)
> + intervals.append(
> + code_import.import_job.date_due -
> + code_import.results[-1].date_job_started)
> + running_job = code_import.import_job
> + getUtility(ICodeImportJobWorkflow).startJob(
> + running_job, self.machine)
> + self.assertEqual(expected_intervals, intervals)
> +

The test is a bit big & slightly hard to follow, but I'm not sure what
could be done about it. Perhaps one test that checks one failure,
another that checks two failures. I don't know -- you don't have to
change it.

Thanks though, great to see this change in code.
jml

On Mon, Mar 15, 2010 at 10:22 PM, Michael Hudson
<michael.hudson@canonical.com> wrote:
> Michael Hudson has proposed merging lp:~mwhudson/launchpad/back-off-failing-imports-bug-413637 into lp:launchpad.
>
> Requested reviews:
>  Canonical Launchpad Engineering (launchpad)
> Related bugs:
>  #413637 imports disabled too rapidly
>  https://bugs.launchpad.net/bugs/413637
>
>
> Hi there,
>
> This small branch changes the code import system to exponentially back off on retrying code imports -- see the linked bug for more.
>

I think the branch is bigger than you intended. It's got a bunch of
build farm job stuff in the diff.

> Currently it backs off so that it tries imports at 6, 12, 24 and 48 hours (for subversion) for a total of three and a bit days of trying before marking an import failed -- is that enough?  It seems OK, though perhaps a bit on the low side, to me.
>

Sounds fine to me.

...
> === modified file 'lib/lp/code/model/codeimportjob.py'
> --- lib/lp/code/model/codeimportjob.py  2010-02-24 10:18:16 +0000
> +++ lib/lp/code/model/codeimportjob.py  2010-03-15 22:22:20 +0000
> @@ -10,6 +10,8 @@
>     'CodeImportJobWorkflow',
>     ]
>
> +import datetime
> +
>  from sqlobject import ForeignKey, IntCol, SQLObjectNotFound, StringCol
>
>  from zope.component import getUtility
> @@ -140,7 +142,7 @@
>
>     implements(ICodeImportJobWorkflow)
>
> -    def newJob(self, code_import, date_due=None):
> +    def newJob(self, code_import, interval=None):
>         """See `ICodeImportJobWorkflow`."""
>         assert code_import.review_status == CodeImportReviewStatus.REVIEWED, (
>             "Review status of %s is not REVIEWED: %s" % (
> @@ -149,23 +151,22 @@
>             "Already associated to a CodeImportJob: %s" % (
>             code_import.branch.unique_name))
>
> +        if interval is None:
> +            interval = code_import.effective_update_interval
> +
>         job = CodeImportJob(code_import=code_import, date_due=UTC_NOW)
>
> -        if date_due is None:
> -            # Find the most recent CodeImportResult for this CodeImport. We
> -            # sort by date_created because we do not have an index on
> -            # date_job_started in the database, and that should give the same
> -            # sort order.
> -            most_recent_result_list = list(CodeImportResult.selectBy(
> -                code_import=code_import).orderBy(['-date_created']).limit(1))
> +        # Find the most recent CodeImportResult for this CodeImport. We
> +        # sort by date_created because we do not have an index on
> +        # date_job_started in the database, and that should give the same
> +        # sort order.
> +        most_recent_result_list = list(CodeImportResult.selectBy(
> +            code_import=code_import).orderBy(['-date_created']).limit(1))
>

Why aren't you using Storm, with its 'one()' notation for this?

> @@ -281,15 +282,19 @@
>         # If the import has failed too many times in a row, mark it as
>         # FAILING.
>         failure_limit = config.codeimport.consecutive_failure_limit
> -        if code_import.consecutive_failure_count >= failure_limit:
> +        failure_count = code_import.consecutive_failure_count
> +        if failure_count >= failure_limit:
>             code_import.updateFromData(
>                 dict(review_status=CodeImportReviewStatus.FAILING), None)
> +        elif status == CodeImportResultStatus.SUCCESS_PARTIAL:
> +            interval = datetime.timedelta(0)
> +        elif failure_count > 0:
> +            interval = code_import.effective_update_interval * (2**(failure_count - 1))

PEP 8 says you need more spaces here, I think.

> +        else:
> +            interval = code_import.effective_update_interval
>         # Only start a new one if the import is still in the REVIEWED state.
>         if code_import.review_status == CodeImportReviewStatus.REVIEWED:
> -            extra = {}
> -            if status == CodeImportResultStatus.SUCCESS_PARTIAL:
> -                extra['date_due'] = UTC_NOW
> -            self.newJob(code_import, **extra)
> +            self.newJob(code_import, interval=interval)
>         # If the status was successful, update date_last_successful.
>         if status in [CodeImportResultStatus.SUCCESS,
>                       CodeImportResultStatus.SUCCESS_NOCHANGE]:
> @@ -321,7 +326,7 @@
>             import_job, CodeImportResultStatus.RECLAIMED, None)
>         # 3)
>         if code_import.review_status == CodeImportReviewStatus.REVIEWED:
> -            self.newJob(code_import, UTC_NOW)
> +            self.newJob(code_import, datetime.timedelta(0))

It's a shame that datetime.timedelta(0) isn't a constant.

>         # 4)
>         getUtility(ICodeImportEventSet).newReclaim(
>             code_import, machine, job_id)
>

> === modified file 'lib/lp/code/model/tests/test_codeimportjob.py'
> --- lib/lp/code/model/tests/test_codeimportjob.py       2010-03-14 20:15:34 +0000
> +++ lib/lp/code/model/tests/test_codeimportjob.py       2010-03-15 22:22:20 +0000
> @@ -778,6 +778,31 @@
>         self.assertEqual(new_job.machine, None)
>         self.assertSqlAttributeEqualsDate(new_job, 'date_due', UTC_NOW)
>
> +    def test_failures_back_off(self):
> +        # We wait for longer and longer between retrying failing imports, to
> +        # make it less likely that an import is marked failing just because
> +        # someone's DNS went down for a day.
> +        running_job = self.makeRunningJob()
> +        intervals = []
> +        interval = running_job.code_import.effective_update_interval
> +        expected_intervals = []
> +        for i in range(config.codeimport.consecutive_failure_limit - 1):
> +            expected_intervals.append(interval)
> +            interval *= 2
> +        # Fail an import a bunch of times and record how far in the future the
> +        # next job was scheduled.
> +        for i in range(config.codeimport.consecutive_failure_limit - 1):
> +            code_import = running_job.code_import
> +            getUtility(ICodeImportJobWorkflow).finishJob(
> +                running_job, CodeImportResultStatus.FAILURE, None)
> +            intervals.append(
> +                code_import.import_job.date_due -
> +                code_import.results[-1].date_job_started)
> +            running_job = code_import.import_job
> +            getUtility(ICodeImportJobWorkflow).startJob(
> +                running_job, self.machine)
> +        self.assertEqual(expected_intervals, intervals)
> +

Thanks though, great to see this change in code.
jml

« Back to merge proposal