Merge lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad

Proposed by William Grant
Status: Merged
Merged at revision: 18113
Proposed branch: lp:~wgrant/launchpad/gc-dupe-lighter
Merge into: lp:launchpad
Diff against target: 149 lines (+42/-38)
2 files modified
lib/lp/services/librarianserver/librariangc.py (+26/-33)
lib/lp/services/librarianserver/tests/test_gc.py (+16/-5)
To merge this branch: bzr merge lp:~wgrant/launchpad/gc-dupe-lighter
Reviewer: Colin Watson (community)
Status: Approve
Review via email: mp+297878@code.launchpad.net

Commit message

Don't hash duplicates in librarian-gc.

Description of the change

Don't hash duplicates in librarian-gc.

Instead of comparing every duplicate byte-for-byte, just check that the original file's hash and size match the DB and trust that the rest haven't collided. Pedantically verifying that every duplicate matches the original bit by bit is very slow now that we have a lot of duplicates and most of them have to be downloaded from Swift.
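
As a minimal sketch of the new verification step (Python 2, matching the branch; the chunk size value here is an assumed placeholder, and open_stream() is the existing librarian helper):

    import hashlib

    STREAM_CHUNK_SIZE = 256 * 1024  # assumed value, for illustration only

    def sha1_file(content_id):
        # Stream the file through SHA-1 so nothing large is held in
        # memory; also count bytes, so truncation is caught as well
        # as corruption.
        stream = open_stream(content_id)
        hasher = hashlib.sha1()
        length = 0
        for chunk in iter(lambda: stream.read(STREAM_CHUNK_SIZE), ''):
            hasher.update(chunk)
            length += len(chunk)
        return hasher.hexdigest(), length

Only the first (most recent) duplicate is downloaded and checked against the sha1 and filesize recorded in LibraryFileContent; the other rows were selected precisely because they share that (sha1, filesize) pair, so they are trusted rather than re-fetched.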

Colin Watson (cjwatson) wrote:

I guess we have little alternative if librarian-gc can't keep up.

We really need to move to SHA-256 here sooner rather than later, though.

review: Approve

Preview Diff

=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py	2016-05-05 06:10:32 +0000
+++ lib/lp/services/librarianserver/librariangc.py	2016-06-20 08:26:14 +0000
@@ -10,6 +10,7 @@
     timedelta,
     )
 import errno
+import hashlib
 import os
 import re
 import sys
@@ -91,17 +92,15 @@
     return None  # File not found.
 
 
-def same_file(content_id_1, content_id_2):
-    file1 = open_stream(content_id_1)
-    file2 = open_stream(content_id_2)
-
-    chunks_iter = iter(
-        lambda: (file1.read(STREAM_CHUNK_SIZE), file2.read(STREAM_CHUNK_SIZE)),
-        ('', ''))
-    for chunk1, chunk2 in chunks_iter:
-        if chunk1 != chunk2:
-            return False
-    return True
+def sha1_file(content_id):
+    file = open_stream(content_id)
+    chunks_iter = iter(lambda: file.read(STREAM_CHUNK_SIZE), '')
+    length = 0
+    hasher = hashlib.sha1()
+    for chunk in chunks_iter:
+        hasher.update(chunk)
+        length += len(chunk)
+    return hasher.hexdigest(), length
 
 
 def confirm_no_clock_skew(con):
@@ -222,18 +221,18 @@
         # most likely to exist on the staging server (it should be
         # irrelevant on production).
         cur.execute("""
-            SELECT id
+            SELECT id, sha1, filesize
             FROM LibraryFileContent
             WHERE sha1=%(sha1)s AND filesize=%(filesize)s
             ORDER BY datecreated DESC
             """, vars())
-        dupes = [row[0] for row in cur.fetchall()]
+        dupes = cur.fetchall()
 
         if debug:
             log.debug("Found duplicate LibraryFileContents")
             # Spit out more info in case it helps work out where
             # dupes are coming from.
-            for dupe_id in dupes:
+            for dupe_id, _, _ in dupes:
                 cur.execute("""
                     SELECT id, filename, mimetype FROM LibraryFileAlias
                     WHERE content = %(dupe_id)s
@@ -246,7 +245,7 @@
         # and cope - just report and skip. However, on staging this will
         # be more common because database records has been synced from
         # production but the actual librarian contents has not.
-        dupe1_id = dupes[0]
+        dupe1_id = dupes[0][0]
         if not file_exists(dupe1_id):
             if config.instance_name == 'staging':
                 log.debug(
@@ -256,31 +255,25 @@
                     "LibraryFileContent %d data is missing", dupe1_id)
                 continue
 
-        # Do a manual check that they really are identical, because we
-        # employ paranoids. And we might as well cope with someone breaking
-        # SHA1 enough that it becomes possible to create a SHA1 collision
-        # with an identical filesize to an existing file. Which is pretty
-        # unlikely. Where did I leave my tin foil hat?
-        for dupe2_id in (dupe for dupe in dupes[1:]):
-            # Check paths exist, because on staging they may not!
-            if (file_exists(dupe2_id) and not same_file(dupe1_id, dupe2_id)):
-                log.error(
-                    "SHA-1 collision found. LibraryFileContent %d and "
-                    "%d have the same SHA1 and filesize, but are not "
-                    "byte-for-byte identical.",
-                    dupe1_id, dupe2_id
-                    )
-                sys.exit(1)
+        # Check that the first file is intact. Don't want to delete
+        # dupes if we might need them to recover the original.
+        actual_sha1, actual_size = sha1_file(dupe1_id)
+        if actual_sha1 != dupes[0][1] or actual_size != dupes[0][2]:
+            log.error(
+                "Corruption found. LibraryFileContent %d has SHA-1 %s and "
+                "size %d, expected %s and %d.", dupes[0][0],
+                actual_sha1, actual_size, dupes[0][1], dupes[0][2])
+            sys.exit(1)
 
         # Update all the LibraryFileAlias entries to point to a single
         # LibraryFileContent
-        prime_id = dupes[0]
-        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
+        prime_id = dupes[0][0]
+        other_ids = ', '.join(str(dupe) for dupe, _, _ in dupes[1:])
         log.debug(
             "Making LibraryFileAliases referencing %s reference %s instead",
             other_ids, prime_id
             )
-        for other_id in dupes[1:]:
+        for other_id, _, _ in dupes[1:]:
             cur.execute("""
                 UPDATE LibraryFileAlias SET content=%(prime_id)s
                 WHERE content = %(other_id)s

=== modified file 'lib/lp/services/librarianserver/tests/test_gc.py'
--- lib/lp/services/librarianserver/tests/test_gc.py	2015-01-14 06:02:50 +0000
+++ lib/lp/services/librarianserver/tests/test_gc.py	2016-06-20 08:26:14 +0000
@@ -851,16 +851,27 @@
         self.unexpired_blob_id = cur.fetchone()[0]
         self.layer.txn.commit()
 
-        # Make sure all the librarian files actually exist on disk
+        # Make sure all the librarian files actually exist on disk with
+        # hashes matching the DB. We use the hash as the new file
+        # content, to preserve existing duplicate relationships.
+        switch_dbuser('testadmin')
         cur = cursor()
-        cur.execute("SELECT id FROM LibraryFileContent")
-        for content_id in (row[0] for row in cur.fetchall()):
+        cur.execute("SELECT id, sha1 FROM LibraryFileContent")
+        for content_id, sha1 in cur.fetchall():
             path = librariangc.get_file_path(content_id)
             if not os.path.exists(path):
                 if not os.path.exists(os.path.dirname(path)):
                     os.makedirs(os.path.dirname(path))
-                open(path, 'w').write('whatever')
-        self.layer.txn.abort()
+                data = sha1
+                open(path, 'w').write(data)
+                cur.execute(
+                    "UPDATE LibraryFileContent "
+                    "SET md5 = %s, sha1 = %s, sha256 = %s, filesize = %s "
+                    "WHERE id = %s",
+                    (hashlib.md5(data).hexdigest(),
+                     hashlib.sha1(data).hexdigest(),
+                     hashlib.sha256(data).hexdigest(), len(data), content_id))
+        self.layer.txn.commit()
 
         switch_dbuser(config.librarian_gc.dbuser)
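
A note on the test fixture change above: rows that are duplicates of each other share a sha1, so writing each missing file's own sha1 hex digest as its content keeps former duplicates byte-identical on disk, and the UPDATE then re-records the hashes and size of those new bytes so the gc's new integrity check passes. A hypothetical spot-check of the invariant the fixture establishes (path, db_sha1 and db_filesize are stand-ins for a file path and the values read back from the DB):

    import hashlib

    data = open(path).read()
    assert hashlib.sha1(data).hexdigest() == db_sha1
    assert len(data) == db_filesize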