Merge lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad

Proposed by William Grant
Status: Merged
Merged at revision: 18113
Proposed branch: lp:~wgrant/launchpad/gc-dupe-lighter
Merge into: lp:launchpad
Diff against target: 149 lines (+42/-38)
2 files modified
lib/lp/services/librarianserver/librariangc.py (+26/-33)
lib/lp/services/librarianserver/tests/test_gc.py (+16/-5)
To merge this branch: bzr merge lp:~wgrant/launchpad/gc-dupe-lighter
Reviewer: Colin Watson (community)
Status: Approve
Review via email: mp+297878@code.launchpad.net

Commit message

Don't hash duplicates in librarian-gc.

Description of the change

Don't hash duplicates in librarian-gc.

Instead of comparing every duplicate byte-for-byte, just check that the original file's hash and size match the DB and trust that the rest haven't collided. Pedantically verifying that every duplicate matches the original bit by bit is very slow now that we have a lot of duplicates and most of them have to be downloaded from Swift.
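
As a minimal sketch of the new verification step (Python 2, matching the branch; the chunk size value here is an assumed placeholder, and open_stream() is the existing librarian helper):

    import hashlib

    STREAM_CHUNK_SIZE = 256 * 1024  # assumed value, for illustration only

    def sha1_file(content_id):
        # Stream the file through SHA-1 so nothing large is held in
        # memory; also count bytes, so truncation is caught as well
        # as corruption.
        stream = open_stream(content_id)
        hasher = hashlib.sha1()
        length = 0
        for chunk in iter(lambda: stream.read(STREAM_CHUNK_SIZE), ''):
            hasher.update(chunk)
            length += len(chunk)
        return hasher.hexdigest(), length

Only the first (most recent) duplicate is downloaded and checked against the sha1 and filesize recorded in LibraryFileContent; the other rows were selected precisely because they share that (sha1, filesize) pair, so they are trusted rather than re-fetched.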

Colin Watson (cjwatson) wrote:

I guess we have little alternative if librarian-gc can't keep up.

We really need to move to SHA-256 here sooner rather than later, though.

review: Approve

Preview Diff

=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py	2016-05-05 06:10:32 +0000
+++ lib/lp/services/librarianserver/librariangc.py	2016-06-20 08:26:14 +0000
@@ -10,6 +10,7 @@
     timedelta,
     )
 import errno
+import hashlib
 import os
 import re
 import sys
@@ -91,17 +92,15 @@
     return None  # File not found.
 
 
-def same_file(content_id_1, content_id_2):
-    file1 = open_stream(content_id_1)
-    file2 = open_stream(content_id_2)
-
-    chunks_iter = iter(
-        lambda: (file1.read(STREAM_CHUNK_SIZE), file2.read(STREAM_CHUNK_SIZE)),
-        ('', ''))
-    for chunk1, chunk2 in chunks_iter:
-        if chunk1 != chunk2:
-            return False
-    return True
+def sha1_file(content_id):
+    file = open_stream(content_id)
+    chunks_iter = iter(lambda: file.read(STREAM_CHUNK_SIZE), '')
+    length = 0
+    hasher = hashlib.sha1()
+    for chunk in chunks_iter:
+        hasher.update(chunk)
+        length += len(chunk)
+    return hasher.hexdigest(), length
 
 
 def confirm_no_clock_skew(con):
@@ -222,18 +221,18 @@
         # most likely to exist on the staging server (it should be
         # irrelevant on production).
         cur.execute("""
-            SELECT id
+            SELECT id, sha1, filesize
             FROM LibraryFileContent
             WHERE sha1=%(sha1)s AND filesize=%(filesize)s
             ORDER BY datecreated DESC
             """, vars())
-        dupes = [row[0] for row in cur.fetchall()]
+        dupes = cur.fetchall()
 
         if debug:
             log.debug("Found duplicate LibraryFileContents")
             # Spit out more info in case it helps work out where
             # dupes are coming from.
-            for dupe_id in dupes:
+            for dupe_id, _, _ in dupes:
                 cur.execute("""
                     SELECT id, filename, mimetype FROM LibraryFileAlias
                     WHERE content = %(dupe_id)s
@@ -246,7 +245,7 @@
         # and cope - just report and skip. However, on staging this will
         # be more common because database records has been synced from
         # production but the actual librarian contents has not.
-        dupe1_id = dupes[0]
+        dupe1_id = dupes[0][0]
         if not file_exists(dupe1_id):
             if config.instance_name == 'staging':
                 log.debug(
@@ -256,31 +255,25 @@
                     "LibraryFileContent %d data is missing", dupe1_id)
                 continue
 
-        # Do a manual check that they really are identical, because we
-        # employ paranoids. And we might as well cope with someone breaking
-        # SHA1 enough that it becomes possible to create a SHA1 collision
-        # with an identical filesize to an existing file. Which is pretty
-        # unlikely. Where did I leave my tin foil hat?
-        for dupe2_id in (dupe for dupe in dupes[1:]):
-            # Check paths exist, because on staging they may not!
-            if (file_exists(dupe2_id) and not same_file(dupe1_id, dupe2_id)):
-                log.error(
-                    "SHA-1 collision found. LibraryFileContent %d and "
-                    "%d have the same SHA1 and filesize, but are not "
-                    "byte-for-byte identical.",
-                    dupe1_id, dupe2_id
-                    )
-                sys.exit(1)
+        # Check that the first file is intact. Don't want to delete
+        # dupes if we might need them to recover the original.
+        actual_sha1, actual_size = sha1_file(dupe1_id)
+        if actual_sha1 != dupes[0][1] or actual_size != dupes[0][2]:
+            log.error(
+                "Corruption found. LibraryFileContent %d has SHA-1 %s and "
+                "size %d, expected %s and %d.", dupes[0][0],
+                actual_sha1, actual_size, dupes[0][1], dupes[0][2])
+            sys.exit(1)
 
         # Update all the LibraryFileAlias entries to point to a single
         # LibraryFileContent
-        prime_id = dupes[0]
-        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
+        prime_id = dupes[0][0]
+        other_ids = ', '.join(str(dupe) for dupe, _, _ in dupes[1:])
         log.debug(
             "Making LibraryFileAliases referencing %s reference %s instead",
             other_ids, prime_id
             )
-        for other_id in dupes[1:]:
+        for other_id, _, _ in dupes[1:]:
             cur.execute("""
                 UPDATE LibraryFileAlias SET content=%(prime_id)s
                 WHERE content = %(other_id)s

=== modified file 'lib/lp/services/librarianserver/tests/test_gc.py'
--- lib/lp/services/librarianserver/tests/test_gc.py	2015-01-14 06:02:50 +0000
+++ lib/lp/services/librarianserver/tests/test_gc.py	2016-06-20 08:26:14 +0000
@@ -851,16 +851,27 @@
         self.unexpired_blob_id = cur.fetchone()[0]
         self.layer.txn.commit()
 
-        # Make sure all the librarian files actually exist on disk
+        # Make sure all the librarian files actually exist on disk with
+        # hashes matching the DB. We use the hash as the new file
+        # content, to preserve existing duplicate relationships.
+        switch_dbuser('testadmin')
         cur = cursor()
-        cur.execute("SELECT id FROM LibraryFileContent")
-        for content_id in (row[0] for row in cur.fetchall()):
+        cur.execute("SELECT id, sha1 FROM LibraryFileContent")
+        for content_id, sha1 in cur.fetchall():
             path = librariangc.get_file_path(content_id)
             if not os.path.exists(path):
                 if not os.path.exists(os.path.dirname(path)):
                     os.makedirs(os.path.dirname(path))
-                open(path, 'w').write('whatever')
-        self.layer.txn.abort()
+                data = sha1
+                open(path, 'w').write(data)
+                cur.execute(
+                    "UPDATE LibraryFileContent "
+                    "SET md5 = %s, sha1 = %s, sha256 = %s, filesize = %s "
+                    "WHERE id = %s",
+                    (hashlib.md5(data).hexdigest(),
+                     hashlib.sha1(data).hexdigest(),
+                     hashlib.sha256(data).hexdigest(), len(data), content_id))
+        self.layer.txn.commit()
 
         switch_dbuser(config.librarian_gc.dbuser)
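
A note on the test fixture change above: rows that are duplicates of each other share a sha1, so writing each missing file's own sha1 hex digest as its content keeps former duplicates byte-identical on disk, and the UPDATE then re-records the hashes and size of those new bytes so the gc's new integrity check passes. A hypothetical spot-check of the invariant the fixture establishes (path, db_sha1 and db_filesize are stand-ins for a file path and the values read back from the DB):

    import hashlib

    data = open(path).read()
    assert hashlib.sha1(data).hexdigest() == db_sha1
    assert len(data) == db_filesize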