Merge lp:~spiv/bzr/insert-stream-check-chk-root into lp:bzr/2.0

Proposed by Andrew Bennetts
Status: Merged
Merged at revision: not available
Proposed branch: lp:~spiv/bzr/insert-stream-check-chk-root
Merge into: lp:bzr/2.0
Diff against target: 408 lines
To merge this branch: bzr merge lp:~spiv/bzr/insert-stream-check-chk-root
Reviewer Review Type Date Requested Status
Robert Collins (community) Approve
Review via email: mp+11033@code.launchpad.net
To post a comment you must log in.
Revision history for this message
Andrew Bennetts (spiv) wrote :

This is a partial fix for bug 406687, and an incremental step towards a full fix. It adds a check to RepositoryPackCollection._commit_write_group that verifies every revision added by that write group has a corresponding inventory, and that for every corresponding inventory (plus any present parent inventories) the chk root entries are present — and presence in a fallback does not count. At the same time, ghost parent inventories are still allowed. This fix is sufficient at least to catch some bugs we had in the past where no chk records were transferred at all in some cases.

(The full fix, which I am working on, will involve checking that all relevant chk records are present, and that the text versions named by those relevant records are also present.)

I experimented with not requiring chk root records in a stacked repository for inventories that have not changed, because in principle they aren't required (just comparing the key names is enough to show that the delta between those revs is empty), but bzr currently fails on such a repository, so allowing this would allow data incompatible with e.g. 2.0rc1, which I think is undesirable.

I am a little worried about the possible performance impact of this change. I don't think the code is particularly wasteful (except perhaps in holding collections of keys that are larger or more long-lived than strictly necessary), but it is fundamentally more work to perform on every 2a repository write.

Revision history for this message
Robert Collins (lifeless) wrote :

tweak

I think that it would be clearer to use method overriding to handle pack vs chk rather than a check on self in the pack code: the bulk of the method is chk specific.

[though pack repos want their inventories too, so that suggests either duplicate, or missing coverage]

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bzrlib/groupcompress.py'
2--- bzrlib/groupcompress.py 2009-08-26 16:47:51 +0000
3+++ bzrlib/groupcompress.py 2009-09-02 03:35:25 +0000
4@@ -1714,7 +1714,7 @@
5
6 def __init__(self, graph_index, is_locked, parents=True,
7 add_callback=None, track_external_parent_refs=False,
8- inconsistency_fatal=True):
9+ inconsistency_fatal=True, track_new_keys=False):
10 """Construct a _GCGraphIndex on a graph_index.
11
12 :param graph_index: An implementation of bzrlib.index.GraphIndex.
13@@ -1740,7 +1740,8 @@
14 self._is_locked = is_locked
15 self._inconsistency_fatal = inconsistency_fatal
16 if track_external_parent_refs:
17- self._key_dependencies = knit._KeyRefs()
18+ self._key_dependencies = knit._KeyRefs(
19+ track_new_keys=track_new_keys)
20 else:
21 self._key_dependencies = None
22
23@@ -1800,10 +1801,14 @@
24 result.append((key, value))
25 records = result
26 key_dependencies = self._key_dependencies
27- if key_dependencies is not None and self._parents:
28- for key, value, refs in records:
29- parents = refs[0]
30- key_dependencies.add_references(key, parents)
31+ if key_dependencies is not None:
32+ if self._parents:
33+ for key, value, refs in records:
34+ parents = refs[0]
35+ key_dependencies.add_references(key, parents)
36+ else:
37+ for key, value, refs in records:
38+ new_keys.add_key(key)
39 self._add_callback(records)
40
41 def _check_read(self):
42@@ -1866,7 +1871,7 @@
43 """Return the keys of missing parents."""
44 # Copied from _KnitGraphIndex.get_missing_parents
45 # We may have false positives, so filter those out.
46- self._key_dependencies.add_keys(
47+ self._key_dependencies.satisfy_refs_for_keys(
48 self.get_parent_map(self._key_dependencies.get_unsatisfied_refs()))
49 return frozenset(self._key_dependencies.get_unsatisfied_refs())
50
51@@ -1926,17 +1931,17 @@
52
53 This allows this _GCGraphIndex to keep track of any missing
54 compression parents we may want to have filled in to make those
55- indices valid.
56+ indices valid. It also allows _GCGraphIndex to track any new keys.
57
58 :param graph_index: A GraphIndex
59 """
60- if self._key_dependencies is not None:
61- # Add parent refs from graph_index (and discard parent refs that
62- # the graph_index has).
63- add_refs = self._key_dependencies.add_references
64- for node in graph_index.iter_all_entries():
65- add_refs(node[1], node[3][0])
66-
67+ key_dependencies = self._key_dependencies
68+ if key_dependencies is None:
69+ return
70+ for node in graph_index.iter_all_entries():
71+ # Add parent refs from graph_index (and discard parent refs
72+ # that the graph_index has).
73+ key_dependencies.add_references(node[1], node[3][0])
74
75
76 from bzrlib._groupcompress_py import (
77
78=== modified file 'bzrlib/knit.py'
79--- bzrlib/knit.py 2009-08-26 16:44:27 +0000
80+++ bzrlib/knit.py 2009-09-02 03:35:25 +0000
81@@ -2777,9 +2777,19 @@
82
83 class _KeyRefs(object):
84
85- def __init__(self):
86+ def __init__(self, track_new_keys=False):
87 # dict mapping 'key' to 'set of keys referring to that key'
88 self.refs = {}
89+ if track_new_keys:
90+ self.new_keys = set()
91+ else:
92+ self.new_keys = None
93+
94+ def clear(self):
95+ if self.refs:
96+ self.refs.clear()
97+ if self.new_keys:
98+ self.new_keys.clear()
99
100 def add_references(self, key, refs):
101 # Record the new references
102@@ -2792,19 +2802,28 @@
103 # Discard references satisfied by the new key
104 self.add_key(key)
105
106+ def get_new_keys(self):
107+ return self.new_keys
108+
109 def get_unsatisfied_refs(self):
110 return self.refs.iterkeys()
111
112- def add_key(self, key):
113+ def _satisfy_refs_for_key(self, key):
114 try:
115 del self.refs[key]
116 except KeyError:
117 # No keys depended on this key. That's ok.
118 pass
119
120- def add_keys(self, keys):
121+ def add_key(self, key):
122+ # satisfy refs for key, and remember that we've seen this key.
123+ self._satisfy_refs_for_key(key)
124+ if self.new_keys is not None:
125+ self.new_keys.add(key)
126+
127+ def satisfy_refs_for_keys(self, keys):
128 for key in keys:
129- self.add_key(key)
130+ self._satisfy_refs_for_key(key)
131
132 def get_referrers(self):
133 result = set()
134@@ -2972,7 +2991,7 @@
135 # If updating this, you should also update
136 # groupcompress._GCGraphIndex.get_missing_parents
137 # We may have false positives, so filter those out.
138- self._key_dependencies.add_keys(
139+ self._key_dependencies.satisfy_refs_for_keys(
140 self.get_parent_map(self._key_dependencies.get_unsatisfied_refs()))
141 return frozenset(self._key_dependencies.get_unsatisfied_refs())
142
143
144=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
145--- bzrlib/repofmt/groupcompress_repo.py 2009-08-24 19:34:13 +0000
146+++ bzrlib/repofmt/groupcompress_repo.py 2009-09-02 03:35:25 +0000
147@@ -651,7 +651,7 @@
148 _GCGraphIndex(self._pack_collection.revision_index.combined_index,
149 add_callback=self._pack_collection.revision_index.add_callback,
150 parents=True, is_locked=self.is_locked,
151- track_external_parent_refs=True),
152+ track_external_parent_refs=True, track_new_keys=True),
153 access=self._pack_collection.revision_index.data_access,
154 delta=False)
155 self.signatures = GroupCompressVersionedFiles(
156
157=== modified file 'bzrlib/repofmt/pack_repo.py'
158--- bzrlib/repofmt/pack_repo.py 2009-08-14 11:11:29 +0000
159+++ bzrlib/repofmt/pack_repo.py 2009-09-02 03:35:25 +0000
160@@ -2063,6 +2063,53 @@
161 self._remove_pack_indices(resumed_pack)
162 del self._resumed_packs[:]
163
164+ def _check_new_inventories(self):
165+ """Detect missing inventories or chk root entries for the new revisions
166+ in this write group.
167+
168+ :returns: set of missing keys. Note that not every missing key is
169+ guaranteed to be reported.
170+ """
171+ if getattr(self.repo, 'chk_bytes', None) is None:
172+ return set()
173+ # Ensure that all revisions added in this write group have:
174+ # - corresponding inventories,
175+ # - chk root entries for those inventories,
176+ # - and any present parent inventories have their chk root
177+ # entries too.
178+ # And all this should be independent of any fallback repository.
179+ key_deps = self.repo.revisions._index._key_dependencies
180+ new_revisions_keys = key_deps.get_new_keys()
181+ no_fallback_inv_index = self.repo.inventories._index
182+ no_fallback_chk_bytes_index = self.repo.chk_bytes._index
183+ inv_parent_map = no_fallback_inv_index.get_parent_map(
184+ new_revisions_keys)
185+ # Are any inventories for corresponding to the new revisions missing?
186+ corresponding_invs = set(inv_parent_map)
187+ missing_corresponding = set(new_revisions_keys)
188+ missing_corresponding.difference_update(corresponding_invs)
189+ if missing_corresponding:
190+ return [('inventories', key) for key in missing_corresponding]
191+ # Are any chk root entries missing for any inventories? This includes
192+ # any present parent inventories, which may be used when calculating
193+ # deltas for streaming.
194+ all_inv_keys = set(corresponding_invs)
195+ for parent_inv_keys in inv_parent_map.itervalues():
196+ all_inv_keys.update(parent_inv_keys)
197+ # Filter out ghost parents.
198+ all_inv_keys.intersection_update(
199+ no_fallback_inv_index.get_parent_map(all_inv_keys))
200+ all_missing = set()
201+ inv_ids = [key[-1] for key in all_inv_keys]
202+ for inv in self.repo.iter_inventories(inv_ids, 'unordered'):
203+ root_keys = set([inv.id_to_entry.key()])
204+ if inv.parent_id_basename_to_file_id is not None:
205+ root_keys.add(inv.parent_id_basename_to_file_id.key())
206+ present = no_fallback_chk_bytes_index.get_parent_map(root_keys)
207+ missing = root_keys.difference(present)
208+ all_missing.update([('chk_bytes',) + key for key in missing])
209+ return all_missing
210+
211 def _commit_write_group(self):
212 all_missing = set()
213 for prefix, versioned_file in (
214@@ -2073,6 +2120,7 @@
215 ):
216 missing = versioned_file.get_missing_compression_parent_keys()
217 all_missing.update([(prefix,) + key for key in missing])
218+ all_missing.update(self._check_new_inventories())
219 if all_missing:
220 raise errors.BzrCheckError(
221 "Repository %s has missing compression parent(s) %r "
222@@ -2222,7 +2270,7 @@
223 % (self._format, self.bzrdir.transport.base))
224
225 def _abort_write_group(self):
226- self.revisions._index._key_dependencies.refs.clear()
227+ self.revisions._index._key_dependencies.clear()
228 self._pack_collection._abort_write_group()
229
230 def _get_source(self, to_format):
231@@ -2242,13 +2290,14 @@
232 self._pack_collection._start_write_group()
233
234 def _commit_write_group(self):
235- self.revisions._index._key_dependencies.refs.clear()
236- return self._pack_collection._commit_write_group()
237+ hint = self._pack_collection._commit_write_group()
238+ self.revisions._index._key_dependencies.clear()
239+ return hint
240
241 def suspend_write_group(self):
242 # XXX check self._write_group is self.get_transaction()?
243 tokens = self._pack_collection._suspend_write_group()
244- self.revisions._index._key_dependencies.refs.clear()
245+ self.revisions._index._key_dependencies.clear()
246 self._write_group = None
247 return tokens
248
249
250=== modified file 'bzrlib/repository.py'
251--- bzrlib/repository.py 2009-08-30 22:02:45 +0000
252+++ bzrlib/repository.py 2009-09-02 03:35:25 +0000
253@@ -1604,7 +1604,7 @@
254 # but at the moment we're only checking for texts referenced by
255 # inventories at the graph's edge.
256 key_deps = self.revisions._index._key_dependencies
257- key_deps.add_keys(present_inventories)
258+ key_deps.satisfy_refs_for_keys(present_inventories)
259 referrers = frozenset(r[0] for r in key_deps.get_referrers())
260 file_ids = self.fileids_altered_by_revision_ids(referrers)
261 missing_texts = set()
262
263=== modified file 'bzrlib/tests/per_repository/test_write_group.py'
264--- bzrlib/tests/per_repository/test_write_group.py 2009-08-17 04:18:57 +0000
265+++ bzrlib/tests/per_repository/test_write_group.py 2009-09-02 03:35:25 +0000
266@@ -361,6 +361,143 @@
267 sink.insert_stream((), repo._format, tokens)
268 self.assertEqual([True], call_log)
269
270+ def test_missing_chk_root_for_inventory(self):
271+ """commit_write_group fails with BzrCheckError when the chk root record
272+ for a new inventory is missing.
273+ """
274+ builder = self.make_branch_builder('simple-branch')
275+ builder.build_snapshot('A-id', None, [
276+ ('add', ('', 'root-id', 'directory', None)),
277+ ('add', ('file', 'file-id', 'file', 'content\n'))])
278+ b = builder.get_branch()
279+ if not b.repository._format.supports_chks:
280+ raise TestNotApplicable('requires repository with chk_bytes')
281+ b.lock_read()
282+ self.addCleanup(b.unlock)
283+ repo = self.make_repository('damaged-repo')
284+ repo.lock_write()
285+ repo.start_write_group()
286+ # Now, add the objects manually
287+ text_keys = [('file-id', 'A-id'), ('root-id', 'A-id')]
288+ # Directly add the texts, inventory, and revision object for 'A-id' --
289+ # but don't add the chk_bytes.
290+ src_repo = b.repository
291+ repo.texts.insert_record_stream(src_repo.texts.get_record_stream(
292+ text_keys, 'unordered', True))
293+ repo.inventories.insert_record_stream(
294+ src_repo.inventories.get_record_stream(
295+ [('A-id',)], 'unordered', True))
296+ repo.revisions.insert_record_stream(
297+ src_repo.revisions.get_record_stream(
298+ [('A-id',)], 'unordered', True))
299+ # Make sure the presence of the missing data in a fallback does not
300+ # avoid the error.
301+ repo.add_fallback_repository(b.repository)
302+ self.assertRaises(errors.BzrCheckError, repo.commit_write_group)
303+ reopened_repo = self.reopen_repo_and_resume_write_group(repo)
304+ self.assertRaises(
305+ errors.BzrCheckError, reopened_repo.commit_write_group)
306+ reopened_repo.abort_write_group()
307+
308+ def test_missing_chk_root_for_unchanged_inventory(self):
309+ """commit_write_group fails with BzrCheckError when the chk root record
310+ for a new inventory is missing, even if the parent inventory is present
311+ and has identical content (i.e. the same chk root).
312+
313+ A stacked repository containing only a revision with an identical
314+ inventory to its parent will still have the chk root records for those
315+ inventories.
316+
317+ (In principle the chk records are unnecessary in this case, but in
318+ practice bzr 2.0rc1 (at least) expects to find them.)
319+ """
320+ # Make a branch where the last two revisions have identical
321+ # inventories.
322+ builder = self.make_branch_builder('simple-branch')
323+ builder.build_snapshot('A-id', None, [
324+ ('add', ('', 'root-id', 'directory', None)),
325+ ('add', ('file', 'file-id', 'file', 'content\n'))])
326+ builder.build_snapshot('B-id', None, [])
327+ builder.build_snapshot('C-id', None, [])
328+ b = builder.get_branch()
329+ if not b.repository._format.supports_chks:
330+ raise TestNotApplicable('requires repository with chk_bytes')
331+ b.lock_read()
332+ self.addCleanup(b.unlock)
333+ # check our setup: B-id and C-id should have identical chk root keys.
334+ inv_b = b.repository.get_inventory('B-id')
335+ inv_c = b.repository.get_inventory('C-id')
336+ self.assertEqual(inv_b.id_to_entry.key(), inv_c.id_to_entry.key())
337+ # Now, manually insert objects for a stacked repo with only revision
338+ # C-id:
339+ # We need ('revisions', 'C-id'), ('inventories', 'C-id'),
340+ # ('inventories', 'B-id'), and the corresponding chk roots for those
341+ # inventories.
342+ repo = self.make_repository('damaged-repo')
343+ repo.lock_write()
344+ repo.start_write_group()
345+ src_repo = b.repository
346+ repo.inventories.insert_record_stream(
347+ src_repo.inventories.get_record_stream(
348+ [('B-id',), ('C-id',)], 'unordered', True))
349+ repo.revisions.insert_record_stream(
350+ src_repo.revisions.get_record_stream(
351+ [('C-id',)], 'unordered', True))
352+ # Make sure the presence of the missing data in a fallback does not
353+ # avoid the error.
354+ repo.add_fallback_repository(b.repository)
355+ self.assertRaises(errors.BzrCheckError, repo.commit_write_group)
356+ reopened_repo = self.reopen_repo_and_resume_write_group(repo)
357+ self.assertRaises(
358+ errors.BzrCheckError, reopened_repo.commit_write_group)
359+ reopened_repo.abort_write_group()
360+
361+ def test_missing_chk_root_for_parent_inventory(self):
362+ """commit_write_group fails with BzrCheckError when the chk root record
363+ for a parent inventory of a new revision is missing.
364+ """
365+ builder = self.make_branch_builder('simple-branch')
366+ builder.build_snapshot('A-id', None, [
367+ ('add', ('', 'root-id', 'directory', None)),
368+ ('add', ('file', 'file-id', 'file', 'content\n'))])
369+ builder.build_snapshot('B-id', None, [])
370+ builder.build_snapshot('C-id', None, [
371+ ('modify', ('file-id', 'new-content'))])
372+ b = builder.get_branch()
373+ if not b.repository._format.supports_chks:
374+ raise TestNotApplicable('requires repository with chk_bytes')
375+ b.lock_read()
376+ self.addCleanup(b.unlock)
377+ # Now, manually insert objects for a stacked repo with only revision
378+ # C-id, *except* the chk root entry for the parent inventory.
379+ # We need ('revisions', 'C-id'), ('inventories', 'C-id'),
380+ # ('inventories', 'B-id'), and the corresponding chk roots for those
381+ # inventories.
382+ inv_c = b.repository.get_inventory('C-id')
383+ chk_keys_for_c_only = [
384+ inv_c.id_to_entry.key(), inv_c.parent_id_basename_to_file_id.key()]
385+ repo = self.make_repository('damaged-repo')
386+ repo.lock_write()
387+ repo.start_write_group()
388+ src_repo = b.repository
389+ repo.chk_bytes.insert_record_stream(
390+ src_repo.chk_bytes.get_record_stream(
391+ chk_keys_for_c_only, 'unordered', True))
392+ repo.inventories.insert_record_stream(
393+ src_repo.inventories.get_record_stream(
394+ [('B-id',), ('C-id',)], 'unordered', True))
395+ repo.revisions.insert_record_stream(
396+ src_repo.revisions.get_record_stream(
397+ [('C-id',)], 'unordered', True))
398+ # Make sure the presence of the missing data in a fallback does not
399+ # avoid the error.
400+ repo.add_fallback_repository(b.repository)
401+ self.assertRaises(errors.BzrCheckError, repo.commit_write_group)
402+ reopened_repo = self.reopen_repo_and_resume_write_group(repo)
403+ self.assertRaises(
404+ errors.BzrCheckError, reopened_repo.commit_write_group)
405+ reopened_repo.abort_write_group()
406+
407
408 class TestResumeableWriteGroup(TestCaseWithRepository):
409

Subscribers

People subscribed via source and target branches