It seems like you should have a news entry before you merge.

=== modified file 'bzrlib/groupcompress.py'
--- bzrlib/groupcompress.py	2009-09-08 02:09:09 +0000
+++ bzrlib/groupcompress.py	2009-09-08 02:30:52 +0000
@@ -1182,6 +1182,12 @@
         self._group_cache = LRUSizeCache(max_size=50*1024*1024)
         self._fallback_vfs = []

+    def without_fallbacks(self):

OK, it's fairly obvious, but a docstring would still be good.

+        gcvf = GroupCompressVersionedFiles(
+            self._index, self._access, self._delta)
+        gcvf._unadded_refs = dict(self._unadded_refs)
+        return gcvf
+

It seems potentially problematic to poke this in post-construction: if
someone later changes the constructor so that it actually starts opening
things, this will break. How about instead adding a private parameter to
the constructor and passing this in?
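Roughly what I have in mind is the following (an untested sketch only; the
_unadded_refs parameter name is just a suggestion), which also picks up the
docstring:

    def __init__(self, index, access, delta=True, _unadded_refs=None):
        """Create a GroupCompressVersionedFiles object.

        :param _unadded_refs: private parameter, do not use outside this
            class.
        """
        # ... existing initialisation as before, then:
        if _unadded_refs is None:
            _unadded_refs = {}
        self._unadded_refs = _unadded_refs

    def without_fallbacks(self):
        """Return a clone of this object without any fallbacks configured."""
        return GroupCompressVersionedFiles(
            self._index, self._access, self._delta,
            _unadded_refs=dict(self._unadded_refs))

That way the clone never depends on being able to mutate a
freshly-constructed instance.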
     def add_lines(self, key, parents, lines, parent_texts=None,
         left_matching_blocks=None, nostore_sha=None, random_id=False,
         check_content=True):

=== modified file 'bzrlib/repofmt/groupcompress_repo.py'
--- bzrlib/repofmt/groupcompress_repo.py	2009-09-08 02:09:09 +0000
+++ bzrlib/repofmt/groupcompress_repo.py	2009-09-08 02:30:52 +0000
@@ -591,8 +591,6 @@
         :returns: set of missing keys.  Note that not every missing key is
             guaranteed to be reported.
         """
-        if getattr(self.repo, 'chk_bytes', None) is None:
-            return set()
         # Ensure that all revisions added in this write group have:
         #   - corresponding inventories,
         #   - chk root entries for those inventories,
@@ -603,6 +601,7 @@
         new_revisions_keys = key_deps.get_new_keys()
         no_fallback_inv_index = self.repo.inventories._index
         no_fallback_chk_bytes_index = self.repo.chk_bytes._index
+        no_fallback_texts_index = self.repo.texts._index
         inv_parent_map = no_fallback_inv_index.get_parent_map(
             new_revisions_keys)
         # Are any inventories for corresponding to the new revisions missing?
@@ -610,7 +609,9 @@
         missing_corresponding = set(new_revisions_keys)
         missing_corresponding.difference_update(corresponding_invs)
         if missing_corresponding:
-            return [('inventories', key) for key in missing_corresponding]
+            raise errors.BzrCheckError(
+                "Repository %s missing inventories for new revisions %r "
+                % (self.repo, sorted(missing_corresponding)))

As Robert says, it seems like a step backwards to be raising an exception
directly rather than returning a list of problems - is there a reason why
you have to?

         # Are any chk root entries missing for any inventories?  This includes
         # any present parent inventories, which may be used when calculating
         # deltas for streaming.
@@ -620,17 +621,57 @@
         # Filter out ghost parents.
         all_inv_keys.intersection_update(
             no_fallback_inv_index.get_parent_map(all_inv_keys))
+        parent_invs_only_keys = all_inv_keys.symmetric_difference(
+            corresponding_invs)
         all_missing = set()
         inv_ids = [key[-1] for key in all_inv_keys]
-        for inv in self.repo.iter_inventories(inv_ids, 'unordered'):
-            root_keys = set([inv.id_to_entry.key()])
-            if inv.parent_id_basename_to_file_id is not None:
-                root_keys.add(inv.parent_id_basename_to_file_id.key())
-            present = no_fallback_chk_bytes_index.get_parent_map(root_keys)
-            missing = root_keys.difference(present)
-            all_missing.update([('chk_bytes',) + key for key in missing])
-        return all_missing
-
+        parent_invs_only_ids = [key[-1] for key in parent_invs_only_keys]
+        root_key_info = _build_interesting_key_sets(
+            self.repo, inv_ids, parent_invs_only_ids)
+        expected_chk_roots = root_key_info.all_keys()
+        present_chk_roots = no_fallback_chk_bytes_index.get_parent_map(
+            expected_chk_roots)
+        missing_chk_roots = expected_chk_roots.difference(present_chk_roots)
+        if missing_chk_roots:
+            # Don't bother checking any further.
+            raise errors.BzrCheckError(
+                "Repository %s missing chk root keys %r for new revisions"
+                % (self.repo, sorted(missing_chk_roots)))
+        # Find all interesting chk_bytes records, and make sure they are
+        # present, as well as the text keys they reference.
+        chk_bytes_no_fallbacks = self.repo.chk_bytes.without_fallbacks()
+        chk_bytes_no_fallbacks._search_key_func = \
+            self.repo.chk_bytes._search_key_func
+        chk_diff = chk_map.iter_interesting_nodes(
+            chk_bytes_no_fallbacks, root_key_info.interesting_root_keys,
+            root_key_info.uninteresting_root_keys)
+        bytes_to_info = inventory.CHKInventory._bytes_to_utf8name_key
+        text_keys = set()
+        try:
+            for record in _filter_text_keys(chk_diff, text_keys, bytes_to_info):
+                pass
+        except errors.NoSuchRevision, e:
+            # XXX: It would be nice if we could give a more precise error here.
+            raise errors.BzrCheckError(
+                "Repository %s missing chk node(s) for new revisions."
+                % (self.repo,))
+        chk_diff = chk_map.iter_interesting_nodes(
+            chk_bytes_no_fallbacks, root_key_info.interesting_pid_root_keys,
+            root_key_info.uninteresting_pid_root_keys)
+        try:
+            for interesting_rec, interesting_map in chk_diff:
+                pass
+        except errors.NoSuchRevision, e:
+            raise errors.BzrCheckError(
+                "Repository %s missing chk node(s) for new revisions."
+                % (self.repo,))
+        present_text_keys = no_fallback_texts_index.get_parent_map(text_keys)
+        missing_text_keys = text_keys.difference(present_text_keys)
+        if missing_text_keys:
+            raise errors.BzrCheckError(
+                "Repository %s missing text keys %r for new revisions"
+                % (self.repo, sorted(missing_text_keys)))

The checks look reasonable, but again it seems like we should return
results rather than raise errors?  A sketch of what I mean follows.
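Something along these lines (untested; it keeps the method's documented
contract of returning a set of missing keys, reusing the key shapes the old
code returned - the ('texts',) prefix is by analogy and is my invention):

    missing_keys = set()
    if missing_corresponding:
        missing_keys.update(
            ('inventories', key) for key in missing_corresponding)
    # ... and similarly for the chk root and text checks:
    if missing_chk_roots:
        missing_keys.update(
            ('chk_bytes',) + key for key in missing_chk_roots)
    if missing_text_keys:
        missing_keys.update(
            ('texts',) + key for key in missing_text_keys)
    return missing_keys

The caller can then decide whether to raise, and callers that want to report
all problems at once still can.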
     def _execute_pack_operations(self, pack_operations,
                                  _packer_class=GCCHKPacker,
                                  reload_func=None):
@@ -1111,6 +1144,54 @@
             yield stream_info


+class _InterestingKeyInfo(object):
+    def __init__(self):
+        self.interesting_root_keys = set()
+        self.interesting_pid_root_keys = set()
+        self.uninteresting_root_keys = set()
+        self.uninteresting_pid_root_keys = set()
+
+    def all_interesting(self):
+        return self.interesting_root_keys.union(self.interesting_pid_root_keys)
+
+    def all_uninteresting(self):
+        return self.uninteresting_root_keys.union(
+            self.uninteresting_pid_root_keys)
+
+    def all_keys(self):
+        return self.all_interesting().union(self.all_uninteresting())
+
+
+def _build_interesting_key_sets(repo, inventory_ids, parent_only_inv_ids):
+    result = _InterestingKeyInfo()
+    for inv in repo.iter_inventories(inventory_ids, 'unordered'):
+        root_key = inv.id_to_entry.key()
+        pid_root_key = inv.parent_id_basename_to_file_id.key()
+        if inv.revision_id in parent_only_inv_ids:
+            result.uninteresting_root_keys.add(root_key)
+            result.uninteresting_pid_root_keys.add(pid_root_key)
+        else:
+            result.interesting_root_keys.add(root_key)
+            result.interesting_pid_root_keys.add(pid_root_key)
+    return result

I always wonder whether this kind of thing should be a constructor, a
factory classmethod, or just a function as you have here.
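If it were a classmethod, it would look something like this (just a sketch;
from_repo is a name I made up):

    @classmethod
    def from_repo(cls, repo, inventory_ids, parent_only_inv_ids):
        """Build an _InterestingKeyInfo from the given inventories."""
        self = cls()
        for inv in repo.iter_inventories(inventory_ids, 'unordered'):
            root_key = inv.id_to_entry.key()
            pid_root_key = inv.parent_id_basename_to_file_id.key()
            if inv.revision_id in parent_only_inv_ids:
                self.uninteresting_root_keys.add(root_key)
                self.uninteresting_pid_root_keys.add(pid_root_key)
            else:
                self.interesting_root_keys.add(root_key)
                self.interesting_pid_root_keys.add(pid_root_key)
        return self

and the call site would become
_InterestingKeyInfo.from_repo(self.repo, inv_ids, parent_invs_only_ids).
That keeps the construction logic next to the attributes it fills in, but a
function is just as easy to test, so I don't feel strongly either way.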
""" + repo = self.make_repository('damaged-repo') + if not repo._format.supports_chks: + raise TestNotApplicable('requires repository with chk_bytes') # Make a branch where the last two revisions have identical # inventories. builder = self.make_branch_builder('simple-branch') @@ -420,8 +423,6 @@ builder.build_snapshot('B-id', None, []) builder.build_snapshot('C-id', None, []) b = builder.get_branch() - if not b.repository._format.supports_chks: - raise TestNotApplicable('requires repository with chk_bytes') b.lock_read() self.addCleanup(b.unlock) # check our setup: B-id and C-id should have identical chk root keys. @@ -433,10 +434,71 @@ # We need ('revisions', 'C-id'), ('inventories', 'C-id'), # ('inventories', 'B-id'), and the corresponding chk roots for those # inventories. + repo.lock_write() + repo.start_write_group() + src_repo = b.repository + repo.inventories.insert_record_stream( + src_repo.inventories.get_record_stream( + [('B-id',), ('C-id',)], 'unordered', True)) + repo.revisions.insert_record_stream( + src_repo.revisions.get_record_stream( + [('C-id',)], 'unordered', True)) + # Make sure the presence of the missing data in a fallback does not + # avoid the error. + repo.add_fallback_repository(b.repository) + self.assertRaises(errors.BzrCheckError, repo.commit_write_group) + reopened_repo = self.reopen_repo_and_resume_write_group(repo) + self.assertRaises( + errors.BzrCheckError, reopened_repo.commit_write_group) + reopened_repo.abort_write_group() + ok + def test_missing_chk_leaf_for_inventory(self): + """commit_write_group fails with BzrCheckError when the chk root record + for a parent inventory of a new revision is missing. + """ repo = self.make_repository('damaged-repo') + if not repo._format.supports_chks: + raise TestNotApplicable('requires repository with chk_bytes') + builder = self.make_branch_builder('simple-branch') + # add and modify files with very long file-ids, so that the chk map + # will need more than just a root node. + file_adds = [] + file_modifies = [] + for char in 'abc': + name = char * 10000 + file_adds.append( + ('add', ('file-' + name, 'file-%s-id' % name, 'file', + 'content %s\n' % name))) + file_modifies.append( + ('modify', ('file-%s-id' % name, 'new content %s\n' % name))) + builder.build_snapshot('A-id', None, [ + ('add', ('', 'root-id', 'directory', None))] + + file_adds) + builder.build_snapshot('B-id', None, []) + builder.build_snapshot('C-id', None, file_modifies) + b = builder.get_branch() This seems redundant with make_branch_with_multiple_chk_nodes...? I think it is a very good candidate to be lifted out. The other tests look pretty clear though there does seem to be a bit of duplication...