It seems like you should have a news entry before you merge.

=== modified file 'bzrlib/groupcompress.py'
--- bzrlib/groupcompress.py	2009-09-08 02:09:09 +0000
+++ bzrlib/groupcompress.py	2009-09-08 02:30:52 +0000
@@ -1182,6 +1182,12 @@
         self._group_cache = LRUSizeCache(max_size=50*1024*1024)
         self._fallback_vfs = []

+    def without_fallbacks(self):

OK, it's fairly obvious, but a docstring would still be good.

+        gcvf = GroupCompressVersionedFiles(
+            self._index, self._access, self._delta)
+        gcvf._unadded_refs = dict(self._unadded_refs)
+        return gcvf
+

It seems potentially problematic to poke this in post-construction: if
someone later changes the constructor so that it actually starts opening
things, this will break. How about instead adding a private parameter to
the constructor and passing this in?
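Roughly what I have in mind is the following (an untested sketch only; the
_unadded_refs parameter name is just a suggestion), which also picks up the
docstring:

    def __init__(self, index, access, delta=True, _unadded_refs=None):
        """Create a GroupCompressVersionedFiles object.

        :param _unadded_refs: private parameter, do not use outside this
            class.
        """
        # ... existing initialisation as before, then:
        if _unadded_refs is None:
            _unadded_refs = {}
        self._unadded_refs = _unadded_refs

    def without_fallbacks(self):
        """Return a clone of this object without any fallbacks configured."""
        return GroupCompressVersionedFiles(
            self._index, self._access, self._delta,
            _unadded_refs=dict(self._unadded_refs))

That way the clone never depends on being able to mutate a
freshly-constructed instance.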
     def add_lines(self, key, parents, lines, parent_texts=None,
         left_matching_blocks=None, nostore_sha=None, random_id=False,
         check_content=True):

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- bzrlib/repofmt/groupcompress_repo.py	2009-09-08 02:09:09 +0000
+++ bzrlib/repofmt/groupcompress_repo.py	2009-09-08 02:30:52 +0000
@@ -591,8 +591,6 @@
         :returns: set of missing keys.  Note that not every missing key is
             guaranteed to be reported.
         """
-        if getattr(self.repo, 'chk_bytes', None) is None:
-            return set()
         # Ensure that all revisions added in this write group have:
         #   - corresponding inventories,
         #   - chk root entries for those inventories,
@@ -603,6 +601,7 @@
         new_revisions_keys = key_deps.get_new_keys()
         no_fallback_inv_index = self.repo.inventories._index
         no_fallback_chk_bytes_index = self.repo.chk_bytes._index
+        no_fallback_texts_index = self.repo.texts._index
         inv_parent_map = no_fallback_inv_index.get_parent_map(
             new_revisions_keys)
         # Are any inventories for corresponding to the new revisions missing?
@@ -610,7 +609,9 @@
         missing_corresponding = set(new_revisions_keys)
         missing_corresponding.difference_update(corresponding_invs)
         if missing_corresponding:
-            return [('inventories', key) for key in missing_corresponding]
+            raise errors.BzrCheckError(
+                "Repository %s missing inventories for new revisions %r "
+                % (self.repo, sorted(missing_corresponding)))

As Robert says, it seems like a step backwards to be raising an exception
directly rather than returning a list of problems - is there a reason why
you have to?

         # Are any chk root entries missing for any inventories?  This includes
         # any present parent inventories, which may be used when calculating
         # deltas for streaming.
@@ -620,17 +621,57 @@
         # Filter out ghost parents.
         all_inv_keys.intersection_update(
             no_fallback_inv_index.get_parent_map(all_inv_keys))
+        parent_invs_only_keys = all_inv_keys.symmetric_difference(
+            corresponding_invs)
         all_missing = set()
         inv_ids = [key[-1] for key in all_inv_keys]
-        for inv in self.repo.iter_inventories(inv_ids, 'unordered'):
-            root_keys = set([inv.id_to_entry.key()])
-            if inv.parent_id_basename_to_file_id is not None:
-                root_keys.add(inv.parent_id_basename_to_file_id.key())
-            present = no_fallback_chk_bytes_index.get_parent_map(root_keys)
-            missing = root_keys.difference(present)
-            all_missing.update([('chk_bytes',) + key for key in missing])
-        return all_missing
-
+        parent_invs_only_ids = [key[-1] for key in parent_invs_only_keys]
+        root_key_info = _build_interesting_key_sets(
+            self.repo, inv_ids, parent_invs_only_ids)
+        expected_chk_roots = root_key_info.all_keys()
+        present_chk_roots = no_fallback_chk_bytes_index.get_parent_map(
+            expected_chk_roots)
+        missing_chk_roots = expected_chk_roots.difference(present_chk_roots)
+        if missing_chk_roots:
+            # Don't bother checking any further.
+            raise errors.BzrCheckError(
+                "Repository %s missing chk root keys %r for new revisions"
+                % (self.repo, sorted(missing_chk_roots)))
+        # Find all interesting chk_bytes records, and make sure they are
+        # present, as well as the text keys they reference.
+        chk_bytes_no_fallbacks = self.repo.chk_bytes.without_fallbacks()
+        chk_bytes_no_fallbacks._search_key_func = \
+            self.repo.chk_bytes._search_key_func
+        chk_diff = chk_map.iter_interesting_nodes(
+            chk_bytes_no_fallbacks, root_key_info.interesting_root_keys,
+            root_key_info.uninteresting_root_keys)
+        bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
+        text_keys = set()
+        try:
+            for record in _filter_text_keys(chk_diff, text_keys, bytes_to_info):
+                pass
+        except errors.NoSuchRevision, e:
+            # XXX: It would be nice if we could give a more precise error here.
+            raise errors.BzrCheckError(
+                "Repository %s missing chk node(s) for new revisions."
+                % (self.repo,))
+        chk_diff = chk_map.iter_interesting_nodes(
+            chk_bytes_no_fallbacks, root_key_info.interesting_pid_root_keys,
+            root_key_info.uninteresting_pid_root_keys)
+        try:
+            for interesting_rec, interesting_map in chk_diff:
+                pass
+        except errors.NoSuchRevision, e:
+            raise errors.BzrCheckError(
+                "Repository %s missing chk node(s) for new revisions."
+                % (self.repo,))
+        present_text_keys = no_fallback_texts_index.get_parent_map(text_keys)
+        missing_text_keys = text_keys.difference(present_text_keys)
+        if missing_text_keys:
+            raise errors.BzrCheckError(
+                "Repository %s missing text keys %r for new revisions"
+                % (self.repo, sorted(missing_text_keys)))

The checks look reasonable, but again it seems like we should return
results rather than raise errors?  A sketch of what I mean follows.
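Something along these lines (untested; it keeps the method's documented
contract of returning a set of missing keys, reusing the key shapes the old
code returned - the ('texts',) prefix is by analogy and is my invention):

    missing_keys = set()
    if missing_corresponding:
        missing_keys.update(
            ('inventories', key) for key in missing_corresponding)
    # ... and similarly for the chk root and text checks:
    if missing_chk_roots:
        missing_keys.update(
            ('chk_bytes',) + key for key in missing_chk_roots)
    if missing_text_keys:
        missing_keys.update(
            ('texts',) + key for key in missing_text_keys)
    return missing_keys

The caller can then decide whether to raise, and callers that want to report
all problems at once still can.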
     def _execute_pack_operations(self, pack_operations,
                                  _packer_class=GCCHKPacker,
                                  reload_func=None):
@@ -1111,6 +1144,54 @@
             yield stream_info


+class _InterestingKeyInfo(object):
+    def __init__(self):
+        self.interesting_root_keys = set()
+        self.interesting_pid_root_keys = set()
+        self.uninteresting_root_keys = set()
+        self.uninteresting_pid_root_keys = set()
+
+    def all_interesting(self):
+        return self.interesting_root_keys.union(self.interesting_pid_root_keys)
+
+    def all_uninteresting(self):
+        return self.uninteresting_root_keys.union(
+            self.uninteresting_pid_root_keys)
+
+    def all_keys(self):
+        return self.all_interesting().union(self.all_uninteresting())
+
+
+def _build_interesting_key_sets(repo, inventory_ids, parent_only_inv_ids):
+    result = _InterestingKeyInfo()
+    for inv in repo.iter_inventories(inventory_ids, 'unordered'):
+        root_key = inv.id_to_entry.key()
+        pid_root_key = inv.parent_id_basename_to_file_id.key()
+        if inv.revision_id in parent_only_inv_ids:
+            result.uninteresting_root_keys.add(root_key)
+            result.uninteresting_pid_root_keys.add(pid_root_key)
+        else:
+            result.interesting_root_keys.add(root_key)
+            result.interesting_pid_root_keys.add(pid_root_key)
+    return result

I always wonder whether this kind of thing should be a constructor, a
factory classmethod, or just a function as you have here.
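If it were a classmethod, it would look something like this (just a sketch;
from_repo is a name I made up):

    @classmethod
    def from_repo(cls, repo, inventory_ids, parent_only_inv_ids):
        """Build an _InterestingKeyInfo from the given inventories."""
        self = cls()
        for inv in repo.iter_inventories(inventory_ids, 'unordered'):
            root_key = inv.id_to_entry.key()
            pid_root_key = inv.parent_id_basename_to_file_id.key()
            if inv.revision_id in parent_only_inv_ids:
                self.uninteresting_root_keys.add(root_key)
                self.uninteresting_pid_root_keys.add(pid_root_key)
            else:
                self.interesting_root_keys.add(root_key)
                self.interesting_pid_root_keys.add(pid_root_key)
        return self

and the call site would become
_InterestingKeyInfo.from_repo(self.repo, inv_ids, parent_invs_only_ids).
That keeps the construction logic next to the attributes it fills in, but a
function is just as easy to test, so I don't feel strongly either way.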
""" + repo = self.make_repository('damaged-repo') + if not repo._format.supports_chks: + raise TestNotApplicable('requires repository with chk_bytes') # Make a branch where the last two revisions have identical # inventories. builder = self.make_branch_builder('simple-branch') @@ -420,8 +423,6 @@ builder.build_snapshot('B-id', None, []) builder.build_snapshot('C-id', None, []) b = builder.get_branch() - if not b.repository._format.supports_chks: - raise TestNotApplicable('requires repository with chk_bytes') b.lock_read() self.addCleanup(b.unlock) # check our setup: B-id and C-id should have identical chk root keys. @@ -433,10 +434,71 @@ # We need ('revisions', 'C-id'), ('inventories', 'C-id'), # ('inventories', 'B-id'), and the corresponding chk roots for those # inventories. + repo.lock_write() + repo.start_write_group() + src_repo = b.repository + repo.inventories.insert_record_stream( + src_repo.inventories.get_record_stream( + [('B-id',), ('C-id',)], 'unordered', True)) + repo.revisions.insert_record_stream( + src_repo.revisions.get_record_stream( + [('C-id',)], 'unordered', True)) + # Make sure the presence of the missing data in a fallback does not + # avoid the error. + repo.add_fallback_repository(b.repository) + self.assertRaises(errors.BzrCheckError, repo.commit_write_group) + reopened_repo = self.reopen_repo_and_resume_write_group(repo) + self.assertRaises( + errors.BzrCheckError, reopened_repo.commit_write_group) + reopened_repo.abort_write_group() + ok + def test_missing_chk_leaf_for_inventory(self): + """commit_write_group fails with BzrCheckError when the chk root record + for a parent inventory of a new revision is missing. + """ repo = self.make_repository('damaged-repo') + if not repo._format.supports_chks: + raise TestNotApplicable('requires repository with chk_bytes') + builder = self.make_branch_builder('simple-branch') + # add and modify files with very long file-ids, so that the chk map + # will need more than just a root node. + file_adds = [] + file_modifies = [] + for char in 'abc': + name = char * 10000 + file_adds.append( + ('add', ('file-' + name, 'file-%s-id' % name, 'file', + 'content %s\n' % name))) + file_modifies.append( + ('modify', ('file-%s-id' % name, 'new content %s\n' % name))) + builder.build_snapshot('A-id', None, [ + ('add', ('', 'root-id', 'directory', None))] + + file_adds) + builder.build_snapshot('B-id', None, []) + builder.build_snapshot('C-id', None, file_modifies) + b = builder.get_branch() This seems redundant with make_branch_with_multiple_chk_nodes...? I think it is a very good candidate to be lifted out. The other tests look pretty clear though there does seem to be a bit of duplication...