Merge lp:~wgrant/launchpad/gc-dupe-lighter into lp:launchpad

Proposed by William Grant
Status: Merged
Merged at revision: 18113
Proposed branch: lp:~wgrant/launchpad/gc-dupe-lighter
Merge into: lp:launchpad
Diff against target: 149 lines (+42/-38)
2 files modified
lib/lp/services/librarianserver/librariangc.py (+26/-33)
lib/lp/services/librarianserver/tests/test_gc.py (+16/-5)
To merge this branch: bzr merge lp:~wgrant/launchpad/gc-dupe-lighter
Reviewer: Colin Watson (community)
Review status: Approve
Review via email: mp+297878@code.launchpad.net

Commit message

Don't hash duplicates in librarian-gc.

Description of the change

Don't hash duplicates in librarian-gc.

Instead of comparing everything, just check that the original file's hash matches the DB and trust that the rest haven't collided. Pedantically verifying that every duplicate file matches the original bit by bit is very slow now that we have a lot of duplicates, most of which have to be downloaded from Swift.
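
In outline, the new check amounts to a single-pass helper like this (an illustrative sketch, not the branch's literal code; the real implementation streams via the librarian's open_stream() and STREAM_CHUNK_SIZE, and the chunk size here is made up):

    import hashlib

    def is_original_intact(stream, expected_sha1, expected_size,
                           chunk_size=256 * 1024):
        # Hash and measure the file in one streaming pass (Python 2,
        # matching the branch), then compare against the DB row.
        # Duplicates are never read at all.
        hasher = hashlib.sha1()
        length = 0
        for chunk in iter(lambda: stream.read(chunk_size), ''):
            hasher.update(chunk)
            length += len(chunk)
        return hasher.hexdigest() == expected_sha1 and length == expected_size

For N duplicates this turns N+1 full downloads into a single read of the original.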

Revision history for this message
Colin Watson (cjwatson) wrote :

I guess we have little alternative if librarian-gc can't keep up.

We really need to move to SHA-256 here sooner rather than later, though.

review: Approve
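
A move to SHA-256 could keep the same single-pass shape; as an illustrative sketch (not part of this branch), computing several digests per read would let sha256 be verified or backfilled with no extra downloads:

    import hashlib

    def hash_file(stream, algorithms=('sha1', 'sha256'),
                  chunk_size=256 * 1024):
        # Hypothetical migration aid: one streaming pass feeding
        # several hashers, so sha256 costs no additional I/O while
        # sha1 remains the column the dupe query keys on.
        hashers = [hashlib.new(name) for name in algorithms]
        length = 0
        for chunk in iter(lambda: stream.read(chunk_size), ''):
            for hasher in hashers:
                hasher.update(chunk)
            length += len(chunk)
        return dict(zip(algorithms, (h.hexdigest() for h in hashers))), length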

Preview Diff

=== modified file 'lib/lp/services/librarianserver/librariangc.py'
--- lib/lp/services/librarianserver/librariangc.py 2016-05-05 06:10:32 +0000
+++ lib/lp/services/librarianserver/librariangc.py 2016-06-20 08:26:14 +0000
@@ -10,6 +10,7 @@
     timedelta,
     )
 import errno
+import hashlib
 import os
 import re
 import sys
@@ -91,17 +92,15 @@
     return None  # File not found.
 
 
-def same_file(content_id_1, content_id_2):
-    file1 = open_stream(content_id_1)
-    file2 = open_stream(content_id_2)
-
-    chunks_iter = iter(
-        lambda: (file1.read(STREAM_CHUNK_SIZE), file2.read(STREAM_CHUNK_SIZE)),
-        ('', ''))
-    for chunk1, chunk2 in chunks_iter:
-        if chunk1 != chunk2:
-            return False
-    return True
+def sha1_file(content_id):
+    file = open_stream(content_id)
+    chunks_iter = iter(lambda: file.read(STREAM_CHUNK_SIZE), '')
+    length = 0
+    hasher = hashlib.sha1()
+    for chunk in chunks_iter:
+        hasher.update(chunk)
+        length += len(chunk)
+    return hasher.hexdigest(), length
 
 
 def confirm_no_clock_skew(con):
@@ -222,18 +221,18 @@
         # most likely to exist on the staging server (it should be
         # irrelevant on production).
         cur.execute("""
-            SELECT id
+            SELECT id, sha1, filesize
             FROM LibraryFileContent
            WHERE sha1=%(sha1)s AND filesize=%(filesize)s
             ORDER BY datecreated DESC
             """, vars())
-        dupes = [row[0] for row in cur.fetchall()]
+        dupes = cur.fetchall()
 
         if debug:
             log.debug("Found duplicate LibraryFileContents")
             # Spit out more info in case it helps work out where
             # dupes are coming from.
-            for dupe_id in dupes:
+            for dupe_id, _, _ in dupes:
                 cur.execute("""
                     SELECT id, filename, mimetype FROM LibraryFileAlias
                     WHERE content = %(dupe_id)s
@@ -246,7 +245,7 @@
         # and cope - just report and skip. However, on staging this will
         # be more common because database records has been synced from
         # production but the actual librarian contents has not.
-        dupe1_id = dupes[0]
+        dupe1_id = dupes[0][0]
         if not file_exists(dupe1_id):
             if config.instance_name == 'staging':
                 log.debug(
@@ -256,31 +255,25 @@
                     "LibraryFileContent %d data is missing", dupe1_id)
             continue
 
-        # Do a manual check that they really are identical, because we
-        # employ paranoids. And we might as well cope with someone breaking
-        # SHA1 enough that it becomes possible to create a SHA1 collision
-        # with an identical filesize to an existing file. Which is pretty
-        # unlikely. Where did I leave my tin foil hat?
-        for dupe2_id in (dupe for dupe in dupes[1:]):
-            # Check paths exist, because on staging they may not!
-            if (file_exists(dupe2_id) and not same_file(dupe1_id, dupe2_id)):
-                log.error(
-                    "SHA-1 collision found. LibraryFileContent %d and "
-                    "%d have the same SHA1 and filesize, but are not "
-                    "byte-for-byte identical.",
-                    dupe1_id, dupe2_id
-                    )
-                sys.exit(1)
+        # Check that the first file is intact. Don't want to delete
+        # dupes if we might need them to recover the original.
+        actual_sha1, actual_size = sha1_file(dupe1_id)
+        if actual_sha1 != dupes[0][1] or actual_size != dupes[0][2]:
+            log.error(
+                "Corruption found. LibraryFileContent %d has SHA-1 %s and "
+                "size %d, expected %s and %d.", dupes[0][0],
+                actual_sha1, actual_size, dupes[0][1], dupes[0][2])
+            sys.exit(1)
 
         # Update all the LibraryFileAlias entries to point to a single
         # LibraryFileContent
-        prime_id = dupes[0]
-        other_ids = ', '.join(str(dupe) for dupe in dupes[1:])
+        prime_id = dupes[0][0]
+        other_ids = ', '.join(str(dupe) for dupe, _, _ in dupes[1:])
         log.debug(
             "Making LibraryFileAliases referencing %s reference %s instead",
             other_ids, prime_id
             )
-        for other_id in dupes[1:]:
+        for other_id, _, _ in dupes[1:]:
             cur.execute("""
                 UPDATE LibraryFileAlias SET content=%(prime_id)s
                 WHERE content = %(other_id)s
 
=== modified file 'lib/lp/services/librarianserver/tests/test_gc.py'
--- lib/lp/services/librarianserver/tests/test_gc.py 2015-01-14 06:02:50 +0000
+++ lib/lp/services/librarianserver/tests/test_gc.py 2016-06-20 08:26:14 +0000
@@ -851,16 +851,27 @@
         self.unexpired_blob_id = cur.fetchone()[0]
         self.layer.txn.commit()
 
-        # Make sure all the librarian files actually exist on disk
+        # Make sure all the librarian files actually exist on disk with
+        # hashes matching the DB. We use the hash as the new file
+        # content, to preserve existing duplicate relationships.
+        switch_dbuser('testadmin')
         cur = cursor()
-        cur.execute("SELECT id FROM LibraryFileContent")
-        for content_id in (row[0] for row in cur.fetchall()):
+        cur.execute("SELECT id, sha1 FROM LibraryFileContent")
+        for content_id, sha1 in cur.fetchall():
             path = librariangc.get_file_path(content_id)
             if not os.path.exists(path):
                 if not os.path.exists(os.path.dirname(path)):
                     os.makedirs(os.path.dirname(path))
-                open(path, 'w').write('whatever')
-        self.layer.txn.abort()
+                data = sha1
+                open(path, 'w').write(data)
+                cur.execute(
+                    "UPDATE LibraryFileContent "
+                    "SET md5 = %s, sha1 = %s, sha256 = %s, filesize = %s "
+                    "WHERE id = %s",
+                    (hashlib.md5(data).hexdigest(),
+                     hashlib.sha1(data).hexdigest(),
+                     hashlib.sha256(data).hexdigest(), len(data), content_id))
+        self.layer.txn.commit()
 
         switch_dbuser(config.librarian_gc.dbuser)
 
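
The fixture change above is subtle: the row's old sha1 hex digest becomes the new file body, so LibraryFileContent rows that were duplicates (same sha1) still end up byte-identical on disk, while distinct rows stay distinct. A minimal standalone illustration of that invariant (hypothetical helper and values, not the test's real data):

    import hashlib

    def replacement_for(old_sha1):
        # Mirror the test fixture: reuse the old sha1 hex digest as the
        # new content, then recompute the DB columns from it.
        data = old_sha1
        return data, {
            'md5': hashlib.md5(data).hexdigest(),
            'sha1': hashlib.sha1(data).hexdigest(),
            'sha256': hashlib.sha256(data).hexdigest(),
            'filesize': len(data),
        }

    # Rows that shared a sha1 (i.e. duplicates) collapse to identical
    # bytes again, so merge_duplicates() still sees them as dupes.
    data_a, _ = replacement_for('aa' * 20)
    data_b, _ = replacement_for('aa' * 20)
    assert data_a == data_b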