Merge lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad

Proposed by William Grant
Status: Merged
Merged at revision: 18113
Proposed branch: lp:~wgrant/launchpad/gc-dupe-lighter
Merge into: lp:launchpad
Diff against target: 149 lines (+42/-38)
2 files modified
lib/lp/services/librarianserver/librariangc.py (+26/-33)
lib/lp/services/librarianserver/tests/test_gc.py (+16/-5)
To merge this branch: bzr merge lp:~wgrant/launchpad/gc-dupe-lighter
Reviewer: Colin Watson (community)
Review status: Approve
Review via email: mp+297878@code.launchpad.net

Commit message

Don't hash duplicates in librarian-gc.

Description of the change

Don't hash duplicates in librarian-gc.

Instead of comparing everything, just check that the original file's hash matches the DB and trust that the rest haven't collided. Pedantically verifying that every duplicate file matches the original bit by bit is very slow now that we have a lot of duplicates, most of which have to be downloaded from Swift.
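
In outline, the new check amounts to a single-pass helper like this (an illustrative sketch, not the branch's literal code; the real implementation streams via the librarian's open_stream() and STREAM_CHUNK_SIZE, and the chunk size here is made up):

    import hashlib

    def is_original_intact(stream, expected_sha1, expected_size,
                           chunk_size=256 * 1024):
        # Hash and measure the file in one streaming pass (Python 2,
        # matching the branch), then compare against the DB row.
        # Duplicates are never read at all.
        hasher = hashlib.sha1()
        length = 0
        for chunk in iter(lambda: stream.read(chunk_size), ''):
            hasher.update(chunk)
            length += len(chunk)
        return hasher.hexdigest() == expected_sha1 and length == expected_size

For N duplicates this turns N+1 full downloads into a single read of the original.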

Revision history for this message
Colin Watson (cjwatson) wrote :

I guess we have little alternative if librarian-gc can't keep up.

We really need to move to SHA-256 here sooner rather than later, though.

review: Approve
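
A move to SHA-256 could keep the same single-pass shape; as an illustrative sketch (not part of this branch), computing several digests per read would let sha256 be verified or backfilled with no extra downloads:

    import hashlib

    def hash_file(stream, algorithms=('sha1', 'sha256'),
                  chunk_size=256 * 1024):
        # Hypothetical migration aid: one streaming pass feeding
        # several hashers, so sha256 costs no additional I/O while
        # sha1 remains the column the dupe query keys on.
        hashers = [hashlib.new(name) for name in algorithms]
        length = 0
        for chunk in iter(lambda: stream.read(chunk_size), ''):
            for hasher in hashers:
                hasher.update(chunk)
            length += len(chunk)
        return dict(zip(algorithms, (h.hexdigest() for h in hashers))), length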

Preview Diff

=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py 2016-05-05 06:10:32 +0000
+++ lib/lp/services/librarianserver/librariangc.py 2016-06-20 08:26:14 +0000
@@ -10,6 +10,7 @@
     timedelta,
     )
 import errno
+import hashlib
 import os
 import re
 import sys
@@ -91,17 +92,15 @@
     return None  # File not found.
 
 
-def same_file(content_id_1, content_id_2):
-    file1 = open_stream(content_id_1)
-    file2 = open_stream(content_id_2)
-
-    chunks_iter = iter(
-        lambda: (file1.read(STREAM_CHUNK_SIZE), file2.read(STREAM_CHUNK_SIZE)),
-        ('', ''))
-    for chunk1, chunk2 in chunks_iter:
-        if chunk1 != chunk2:
-            return False
-    return True
+def sha1_file(content_id):
+    file = open_stream(content_id)
+    chunks_iter = iter(lambda: file.read(STREAM_CHUNK_SIZE), '')
+    length = 0
+    hasher = hashlib.sha1()
+    for chunk in chunks_iter:
+        hasher.update(chunk)
+        length += len(chunk)
+    return hasher.hexdigest(), length
 
 
 def confirm_no_clock_skew(con):
@@ -222,18 +221,18 @@
         # most likely to exist on the staging server (it should be
         # irrelevant on production).
         cur.execute("""
-            SELECT id
+            SELECT id, sha1, filesize
             FROM LibraryFileContent
            WHERE sha1=%(sha1)s AND filesize=%(filesize)s
             ORDER BY datecreated DESC
             """, vars())
-        dupes = [row[0] for row in cur.fetchall()]
+        dupes = cur.fetchall()
 
         if debug:
             log.debug("Found duplicate LibraryFileContents")
             # Spit out more info in case it helps work out where
             # dupes are coming from.
-            for dupe_id in dupes:
+            for dupe_id, _, _ in dupes:
                 cur.execute("""
                     SELECT id, filename, mimetype FROM LibraryFileAlias
                     WHERE content = %(dupe_id)s
@@ -246,7 +245,7 @@
         # and cope - just report and skip. However, on staging this will
         # be more common because database records has been synced from
         # production but the actual librarian contents has not.
-        dupe1_id = dupes[0]
+        dupe1_id = dupes[0][0]
         if not file_exists(dupe1_id):
             if config.instance_name == 'staging':
                 log.debug(
@@ -256,31 +255,25 @@
                     "LibraryFileContent %d data is missing", dupe1_id)
             continue
 
-        # Do a manual check that they really are identical, because we
-        # employ paranoids. And we might as well cope with someone breaking
-        # SHA1 enough that it becomes possible to create a SHA1 collision
-        # with an identical filesize to an existing file. Which is pretty
-        # unlikely. Where did I leave my tin foil hat?
-        for dupe2_id in (dupe for dupe in dupes[1:]):
-            # Check paths exist, because on staging they may not!
-            if (file_exists(dupe2_id) and not same_file(dupe1_id, dupe2_id)):
-                log.error(
-                    "SHA-1 collision found. LibraryFileContent %d and "
-                    "%d have the same SHA1 and filesize, but are not "
-                    "byte-for-byte identical.",
-                    dupe1_id, dupe2_id
-                    )
-                sys.exit(1)
+        # Check that the first file is intact. Don't want to delete
+        # dupes if we might need them to recover the original.
+        actual_sha1, actual_size = sha1_file(dupe1_id)
+        if actual_sha1 != dupes[0][1] or actual_size != dupes[0][2]:
+            log.error(
+                "Corruption found. LibraryFileContent %d has SHA-1 %s and "
+                "size %d, expected %s and %d.", dupes[0][0],
+                actual_sha1, actual_size, dupes[0][1], dupes[0][2])
+            sys.exit(1)
 
         # Update all the LibraryFileAlias entries to point to a single
         # LibraryFileContent
-        prime_id = dupes[0]
-        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
+        prime_id = dupes[0][0]
+        other_ids = ', '.join(str(dupe) for dupe, _, _ in dupes[1:])
         log.debug(
             "Making LibraryFileAliases referencing %s reference %s instead",
             other_ids, prime_id
             )
-        for other_id in dupes[1:]:
+        for other_id, _, _ in dupes[1:]:
             cur.execute("""
                 UPDATE LibraryFileAlias SET content=%(prime_id)s
                 WHERE content = %(other_id)s
 
=== modified file 'lib/lp/services/librarianserver/tests/test_gc.py'
--- lib/lp/services/librarianserver/tests/test_gc.py 2015-01-14 06:02:50 +0000
+++ lib/lp/services/librarianserver/tests/test_gc.py 2016-06-20 08:26:14 +0000
@@ -851,16 +851,27 @@
         self.unexpired_blob_id = cur.fetchone()[0]
         self.layer.txn.commit()
 
-        # Make sure all the librarian files actually exist on disk
+        # Make sure all the librarian files actually exist on disk with
+        # hashes matching the DB. We use the hash as the new file
+        # content, to preserve existing duplicate relationships.
+        switch_dbuser('testadmin')
         cur = cursor()
-        cur.execute("SELECT id FROM LibraryFileContent")
-        for content_id in (row[0] for row in cur.fetchall()):
+        cur.execute("SELECT id, sha1 FROM LibraryFileContent")
+        for content_id, sha1 in cur.fetchall():
             path = librariangc.get_file_path(content_id)
             if not os.path.exists(path):
                 if not os.path.exists(os.path.dirname(path)):
                     os.makedirs(os.path.dirname(path))
-                open(path, 'w').write('whatever')
-        self.layer.txn.abort()
+                data = sha1
+                open(path, 'w').write(data)
+                cur.execute(
+                    "UPDATE LibraryFileContent "
+                    "SET md5 = %s, sha1 = %s, sha256 = %s, filesize = %s "
+                    "WHERE id = %s",
+                    (hashlib.md5(data).hexdigest(),
+                     hashlib.sha1(data).hexdigest(),
+                     hashlib.sha256(data).hexdigest(), len(data), content_id))
+        self.layer.txn.commit()
 
         switch_dbuser(config.librarian_gc.dbuser)
 
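
The fixture change above is subtle: the row's old sha1 hex digest becomes the new file body, so LibraryFileContent rows that were duplicates (same sha1) still end up byte-identical on disk, while distinct rows stay distinct. A minimal standalone illustration of that invariant (hypothetical helper and values, not the test's real data):

    import hashlib

    def replacement_for(old_sha1):
        # Mirror the test fixture: reuse the old sha1 hex digest as the
        # new content, then recompute the DB columns from it.
        data = old_sha1
        return data, {
            'md5': hashlib.md5(data).hexdigest(),
            'sha1': hashlib.sha1(data).hexdigest(),
            'sha256': hashlib.sha256(data).hexdigest(),
            'filesize': len(data),
        }

    # Rows that shared a sha1 (i.e. duplicates) collapse to identical
    # bytes again, so merge_duplicates() still sees them as dupes.
    data_a, _ = replacement_for('aa' * 20)
    data_b, _ = replacement_for('aa' * 20)
    assert data_a == data_b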