Code review comment for lp:~ian-clatworthy/bzr/faster-log-file

Martin Pool (mbp) wrote:

I think you should test this on some files from MySQL or Launchpad,
which have a longer but still very bushy history.
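
For example, one way to measure it would be to time the same log run
before and after the patch on a frequently-changed file in one of those
trees (the path here is just an illustration):

    time bzr log some/busy/file > /dev/null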

This looks fairly plausible to me, but I think there should be a unit
test for the fairly nontrivial function you've added. Because the new
code falls back to the old algorithm whenever the fastpath fails, bugs
in the new path may not actually be exercised by the test suite.
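
For example, a direct test of the new fastpath might look something like
this (just a sketch, calling the private helpers added by this patch;
the exact expected values would need checking):

    from bzrlib import log
    from bzrlib.tests import TestCaseWithTransport

    class TestCalcViewRevisionsForFile(TestCaseWithTransport):

        def test_linear_history_fastpath(self):
            tree = self.make_branch_and_tree('.')
            self.build_tree(['file'])
            tree.add(['file'], ['file-id'])
            tree.commit('one', rev_id='rev1')
            tree.commit('two', rev_id='rev2')   # file untouched
            self.build_tree_contents([('file', 'new content\n')])
            tree.commit('three', rev_id='rev3')
            b = tree.branch
            b.lock_read()
            self.addCleanup(b.unlock)
            file_graph, tip = log._per_file_graph(b, 'file-id', None)
            result = log._calc_view_revisions_for_file(b, file_graph,
                tip, None, None, 'reverse', False)
            # Only rev1 and rev3 touched the file
            self.assertEqual(['rev3', 'rev1'], [r[0] for r in result])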

=== modified file 'NEWS'
--- NEWS 2009-06-16 09:05:34 +0000
+++ NEWS 2009-06-17 05:48:37 +0000
@@ -29,6 +29,10 @@
 Improvements
 ************

+* ``bzr log FILE`` is now substantially faster on flat-ish histories.
+  On OpenOffice.org for example, logging a typical file now takes
+  a second or so instead of 29 seconds. (Ian Clatworthy)
+
 * Resolving a revno to a revision id on a branch accessed via ``bzr://``
   or ``bzr+ssh://`` is now much faster and involves no VFS operations.
   This speeds up commands like ``bzr pull -r 123``. (Andrew Bennetts)

=== modified file 'bzrlib/builtins.py'
--- bzrlib/builtins.py 2009-06-15 06:47:14 +0000
+++ bzrlib/builtins.py 2009-06-16 12:46:50 +0000
@@ -2237,16 +2237,14 @@
             # the underlying repository format is faster at generating
             # deltas or can provide everything we need from the indices.
             # The default algorithm - match-using-deltas - works for
-            # multiple files and directories and is faster for small
-            # amounts of history (200 revisions say). However, it's too
+            # multiple files and directories. However, it's too
             # slow for logging a single file in a repository with deep
             # history, i.e. > 10K revisions. In the spirit of "do no
             # evil when adding features", we continue to use the
             # original algorithm - per-file-graph - for the "single
             # file that isn't a directory without showing a delta" case.
-            partial_history = revision and b.repository._format.supports_chks
             match_using_deltas = (len(file_ids) != 1 or filter_by_dir
-                or delta_type or partial_history)
+                or delta_type)

             # Build the LogRequest and execute it
             if len(file_ids) == 0:

=== modified file 'bzrlib/log.py'
--- bzrlib/log.py 2009-06-10 03:56:49 +0000
+++ bzrlib/log.py 2009-06-17 05:35:10 +0000
@@ -70,6 +70,7 @@
     diff,
     errors,
     foreign,
+    graph,
     repository as _mod_repository,
     revision as _mod_revision,
     revisionspec,
@@ -460,21 +461,150 @@
             direction=rqst.get('direction'))

     def _log_revision_iterator_using_per_file_graph(self):
-        # Get the base revisions, filtering by the revision range.
-        # Note that we always generate the merge revisions because
-        # filter_revisions_touching_file_id() requires them ...
         rqst = self.rqst
-        view_revisions = _calc_view_revisions(self.branch, self.start_rev_id,
-            self.end_rev_id, rqst.get('direction'), True)
-        if not isinstance(view_revisions, list):
-            view_revisions = list(view_revisions)
-        view_revisions = _filter_revisions_touching_file_id(self.branch,
-            rqst.get('specific_fileids')[0], view_revisions,
-            include_merges=rqst.get('levels') != 1)
+        direction = rqst.get('direction')
+        file_id = rqst.get('specific_fileids')[0]
+        multi_level = rqst.get('levels') != 1
+        try:
+            file_graph, graph_tip = _per_file_graph(self.branch, file_id,
+                self.end_rev_id)
+        except errors.NoSuchId:
+            # File doesn't exist at end of range - fall back to old algorithm
+            view_revisions = None
+        else:
+            # Try iterating over the revisions given by the per-file graph.
+            # This returns None if it fails.
+            view_revisions = _calc_view_revisions_for_file(self.branch,
+                file_graph, graph_tip, self.start_rev_id, self.end_rev_id,
+                direction, multi_level)
+
+        if view_revisions is None:
+            # Get the base revisions, filtering by the revision range.
+            # Note that we always generate the merge revisions because
+            # filter_revisions_touching_file_id() requires them ...
+            view_revisions = _calc_view_revisions(self.branch,
+                self.start_rev_id, self.end_rev_id, direction, True)
+            if not isinstance(view_revisions, list):
+                view_revisions = list(view_revisions)
+            # TODO: pass in the already calculated file graph and re-use it
+            view_revisions = _filter_revisions_touching_file_id(self.branch,
+                file_id, view_revisions, include_merges=multi_level)
         return make_log_rev_iterator(self.branch, view_revisions,
             rqst.get('delta_type'), rqst.get('message_search'))

+def _per_file_graph(branch, file_id, end_rev_id):
+    """Get the per-file graph.
+
+    :param end_rev_id: the last interesting revision-id or None to use
+        the basis tree. If non-None, the file must exist in that revision
+        or NoSuchId will be raised.
+    :return: graph, tip where
+        graph is a Graph with (file_id, rev_id) tuple keys and
+        tip is the graph tip
+    """
+    # Find when the file was last modified
+    if end_rev_id is None:
+        rev_tree = branch.basis_tree()
+    else:
+        rev_tree = branch.repository.revision_tree(end_rev_id)
+    last_modified = rev_tree.inventory[file_id].revision
+
+    # Return the result
+    tip = (file_id, last_modified)
+    return graph.Graph(branch.repository.texts), tip

Should this be a branch method?
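
If it were, the signature might be as simple as this (sketch only; the
method name is hypothetical, and branch.py would need to import graph):

    def per_file_graph(self, file_id, end_rev_id=None):
        """Return (graph, tip) for file_id's per-file graph.

        Wraps repository.texts so callers don't need to know that the
        keys are (file_id, revision_id) tuples.
        """
        if end_rev_id is None:
            rev_tree = self.basis_tree()
        else:
            rev_tree = self.repository.revision_tree(end_rev_id)
        last_modified = rev_tree.inventory[file_id].revision
        return graph.Graph(self.repository.texts), (file_id, last_modified)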

+
+
+def _calc_view_revisions_for_file(branch, file_graph, graph_tip, start_rev_id,
+    end_rev_id, direction, include_merges):
+    """Calculate the revisions to view for a file.
+
+    :param file_graph: the per-file graph
+    :param graph_tip: the tip of the per-file graph
+    :param include_merges: if True, include all revisions, not just the top
+        level
+    :return: A list of (revision_id, dotted_revno, merge_depth) tuples OR
+        None if the algorithm fails (and another one should be used).
+    """
+    br_revno, br_rev_id = branch.last_revision_info()
+    if br_revno == 0:
+        return []
+
+    # Find the revisions where the file was changed and merged
+    file_rev_ids = []
+    file_merges = []
+    for (_, rev_id), parents in file_graph.iter_ancestry([graph_tip]):
+        file_rev_ids.append(rev_id)
+        if len(parents) > 1:
+            file_merges.append(rev_id)
+
+    # Handle the simple cases
+    if len(file_rev_ids) == 1:
+        return _generate_one_revision(branch, file_rev_ids[0], br_rev_id,
+            br_revno)
+    elif len(file_rev_ids) == 0:
+        # Should this ever happen?
+        return []
+    elif file_merges and include_merges:
+        # Fall back to the old algorithm for now
+        return None
+
+    # Find all the revisions we can using a linear search
+    result = []
+    missing = set(file_rev_ids)
+    merges_to_search = 0
+    created_timestamp = None
+    try:
+        candidates = _linear_view_revisions(branch, start_rev_id, end_rev_id)
+        for index, (rev_id, revno, depth) in enumerate(candidates):
+            if rev_id in missing:
+                result.append((rev_id, revno, depth))
+                missing.remove(rev_id)
+                if len(missing) == 0:
+                    break
+            else:
+                if _has_merges(branch, rev_id):
+                    merges_to_search += 1
+                    # If this is a dense tree, this optimisation is unlikely
+                    # to result in a net win - fall back to old algorithm.
+                    if merges_to_search > 100:
+                        return None
+                # Check the timestamp to avoid going back too far on the
+                # mainline for files created in merge revisions. We don't
+                # do this every revision, just regularly, to minimise the
+                # number of revisions that we load at this point.
+                if index and index % 500 == 0:
+                    if created_timestamp is None:
+                        created_rev = branch.repository.get_revision(
+                            file_rev_ids[-1])
+                        created_timestamp = created_rev.timestamp
+                    rev = branch.repository.get_revision(rev_id)
+                    if created_timestamp > rev.timestamp:
+                        return None
+
+    except _StartNotLinearAncestor:
+        raise errors.BzrCommandError('Start revision not found in'
+            ' left-hand history of end revision.')

If this causes a bug report, it'll be easier to debug if you include
some of the relevant variables in the message.
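
For example (just a sketch of one possible phrasing):

    except _StartNotLinearAncestor:
        raise errors.BzrCommandError(
            'Start revision %s not found in left-hand history of end'
            ' revision %s' % (start_rev_id, end_rev_id))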

+
+    # If no merges were found in the revision range, then we can be
+    # certain that we've found all the revisions we care about.
+    if missing and merges_to_search:
+        # TODO: search the deltas of the merges, splicing successful
+        # matches into their rightful spots. That should work well on
+        # chk repositories for typical histories but we need to benchmark
+        # it to confirm. There's most likely a sweet spot above which
+        # the O(history) traditional way - generating the full graph of
+        # history and post-filtering - remains the best performer.
+        trace.mutter("log file fastpath failed to find %d revisions" %
+            len(missing))
+        return None
+
+    # We came, we saw, we walked away victorious ...
+    if direction == 'forward':
+        result = reversed(result)
+    return result
+
+
 def _calc_view_revisions(branch, start_rev_id, end_rev_id, direction,
     generate_merge_revisions, delayed_graph_generation=False):
     """Calculate the revisions to view.

review: Needs Information
