Merge lp:~mbp/bzr/415508-content-filtering into lp:bzr/1.18
- 415508-content-filtering
- Merge into 1.18
Status: | Merged | ||||
---|---|---|---|---|---|
Merged at revision: | not available | ||||
Proposed branch: | lp:~mbp/bzr/415508-content-filtering | ||||
Merge into: | lp:bzr/1.18 | ||||
Diff against target: | 345 lines | ||||
To merge this branch: | bzr merge lp:~mbp/bzr/415508-content-filtering | ||||
Related bugs: |
|
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Ian Clatworthy (community) | Approve | ||
John A Meinel (community) | Approve | ||
Review via email: mp+10640@code.launchpad.net |
Commit message
Description of the change
Martin Pool (mbp) wrote : | # |
John A Meinel (jameinel) wrote : | # |
I think this patch is good.
I don't see a test added that would catch the bug that we discovered. Namely, that we *don't* add a new node for texts that have a content filter, but which have not otherwise changed.
I would like to see something along those lines, but I'm not going to block this patch on it, because I think the bug we have right now is pretty huge and needs fixing.
Martin Pool (mbp) wrote : | # |
I need an additional test change, like this, for the path_content_
=== modified file 'bzrlib/
--- bzrlib/
+++ bzrlib/
@@ -1,4 +1,4 @@
-# Copyright (C) 2007 Canonical Ltd
+# Copyright (C) 2007, 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -34,6 +34,18 @@
return result
+ def check_content_
+ # if the tree supports content filters, then it's allowed to leave out
+ # the size because it might be difficult to compute. otherwise, it
+ # must be present and correct
+ returned_size = summary[1]
+ if returned_size == expected_size or (
+ tree.supports_
+ and returned_size is None):
+ pass
+ else:
+ self.fail("invalid size in summary: %r" % (returned_size,))
+
def test_symlink_
tree = self.make_
@@ -76,8 +88,7 @@
summary = self._convert_
- # size must be known
- self.assertEqua
+ self.check_
# executable
# may have hash,
@@ -91,8 +102,7 @@
summary = self._convert_
- # size must be known
- self.assertEqua
+ self.check_
# not executable
if osutils.
Ian Clatworthy (ian-clatworthy) wrote : | # |
The code and tests look good to me. The docstring and code in _path_content_
The NEWS items needs a trivial tweak: 'of file' -> 'of a file'.
Preview Diff
1 | === modified file 'NEWS' |
2 | --- NEWS 2009-08-20 08:38:09 +0000 |
3 | +++ NEWS 2009-08-26 06:35:38 +0000 |
4 | @@ -7,6 +7,24 @@ |
5 | :depth: 1 |
6 | |
7 | |
8 | +bzr 1.18.1 NOT RELEASED YET |
9 | +########################### |
10 | + |
11 | +Bug Fixes |
12 | +********* |
13 | + |
14 | +* Fixed a problem where using content filtering and especially end-of-line |
15 | + conversion will commit too many copies a file. |
16 | + (Martin Pool, #415508) |
17 | + |
18 | +API Changes |
19 | +*********** |
20 | + |
21 | +* ``Tree.path_content_summary`` may return a size of None, when called on |
22 | + a tree with content filtering where the size of the canonical form |
23 | + cannot be cheaply determined. (Martin Pool) |
24 | + |
25 | + |
26 | bzr 1.18 |
27 | ######## |
28 | |
29 | |
30 | === modified file 'bzrlib/commit.py' |
31 | --- bzrlib/commit.py 2009-07-15 05:54:37 +0000 |
32 | +++ bzrlib/commit.py 2009-08-26 06:35:39 +0000 |
33 | @@ -802,10 +802,11 @@ |
34 | # _update_builder_with_changes. |
35 | continue |
36 | content_summary = self.work_tree.path_content_summary(path) |
37 | + kind = content_summary[0] |
38 | # Note that when a filter of specific files is given, we must only |
39 | # skip/record deleted files matching that filter. |
40 | if not specific_files or is_inside_any(specific_files, path): |
41 | - if content_summary[0] == 'missing': |
42 | + if kind == 'missing': |
43 | if not deleted_paths: |
44 | # path won't have been split yet. |
45 | path_segments = splitpath(path) |
46 | @@ -818,23 +819,20 @@ |
47 | continue |
48 | # TODO: have the builder do the nested commit just-in-time IF and |
49 | # only if needed. |
50 | - if content_summary[0] == 'tree-reference': |
51 | + if kind == 'tree-reference': |
52 | # enforce repository nested tree policy. |
53 | if (not self.work_tree.supports_tree_reference() or |
54 | # repository does not support it either. |
55 | not self.branch.repository._format.supports_tree_reference): |
56 | - content_summary = ('directory',) + content_summary[1:] |
57 | - kind = content_summary[0] |
58 | - # TODO: specific_files filtering before nested tree processing |
59 | - if kind == 'tree-reference': |
60 | - if self.recursive == 'down': |
61 | + kind = 'directory' |
62 | + content_summary = (kind, None, None, None) |
63 | + elif self.recursive == 'down': |
64 | nested_revision_id = self._commit_nested_tree( |
65 | file_id, path) |
66 | - content_summary = content_summary[:3] + ( |
67 | - nested_revision_id,) |
68 | + content_summary = (kind, None, None, nested_revision_id) |
69 | else: |
70 | - content_summary = content_summary[:3] + ( |
71 | - self.work_tree.get_reference_revision(file_id),) |
72 | + nested_revision_id = self.work_tree.get_reference_revision(file_id) |
73 | + content_summary = (kind, None, None, nested_revision_id) |
74 | |
75 | # Record an entry for this item |
76 | # Note: I don't particularly want to have the existing_ie |
77 | |
78 | === modified file 'bzrlib/repository.py' |
79 | --- bzrlib/repository.py 2009-08-06 02:23:37 +0000 |
80 | +++ bzrlib/repository.py 2009-08-26 06:35:39 +0000 |
81 | @@ -464,9 +464,9 @@ |
82 | if content_summary[2] is None: |
83 | raise ValueError("Files must not have executable = None") |
84 | if not store: |
85 | - if (# if the file length changed we have to store: |
86 | - parent_entry.text_size != content_summary[1] or |
87 | - # if the exec bit has changed we have to store: |
88 | + # We can't trust a check of the file length because of content |
89 | + # filtering... |
90 | + if (# if the exec bit has changed we have to store: |
91 | parent_entry.executable != content_summary[2]): |
92 | store = True |
93 | elif parent_entry.text_sha1 == content_summary[3]: |
94 | @@ -539,6 +539,9 @@ |
95 | ie.revision = parent_entry.revision |
96 | return self._get_delta(ie, basis_inv, path), False, None |
97 | ie.reference_revision = content_summary[3] |
98 | + if ie.reference_revision is None: |
99 | + raise AssertionError("invalid content_summary for nested tree: %r" |
100 | + % (content_summary,)) |
101 | self._add_text_to_weave(ie.file_id, '', heads, None) |
102 | else: |
103 | raise NotImplementedError('unknown kind') |
104 | |
105 | === modified file 'bzrlib/tests/per_tree/test_path_content_summary.py' |
106 | --- bzrlib/tests/per_tree/test_path_content_summary.py 2009-07-10 07:14:02 +0000 |
107 | +++ bzrlib/tests/per_tree/test_path_content_summary.py 2009-08-26 06:35:39 +0000 |
108 | @@ -1,4 +1,4 @@ |
109 | -# Copyright (C) 2007 Canonical Ltd |
110 | +# Copyright (C) 2007, 2009 Canonical Ltd |
111 | # |
112 | # This program is free software; you can redistribute it and/or modify |
113 | # it under the terms of the GNU General Public License as published by |
114 | @@ -34,6 +34,18 @@ |
115 | self.addCleanup(result.unlock) |
116 | return result |
117 | |
118 | + def check_content_summary_size(self, tree, summary, expected_size): |
119 | + # if the tree supports content filters, then it's allowed to leave out |
120 | + # the size because it might be difficult to compute. otherwise, it |
121 | + # must be present and correct |
122 | + returned_size = summary[1] |
123 | + if returned_size == expected_size or ( |
124 | + tree.supports_content_filtering() |
125 | + and returned_size is None): |
126 | + pass |
127 | + else: |
128 | + self.fail("invalid size in summary: %r" % (returned_size,)) |
129 | + |
130 | def test_symlink_content_summary(self): |
131 | self.requireFeature(tests.SymlinkFeature) |
132 | tree = self.make_branch_and_tree('tree') |
133 | @@ -76,8 +88,7 @@ |
134 | summary = self._convert_tree(tree).path_content_summary('path') |
135 | self.assertEqual(4, len(summary)) |
136 | self.assertEqual('file', summary[0]) |
137 | - # size must be known |
138 | - self.assertEqual(22, summary[1]) |
139 | + self.check_content_summary_size(tree, summary, 22) |
140 | # executable |
141 | self.assertEqual(True, summary[2]) |
142 | # may have hash, |
143 | @@ -91,8 +102,7 @@ |
144 | summary = self._convert_tree(tree).path_content_summary('path') |
145 | self.assertEqual(4, len(summary)) |
146 | self.assertEqual('file', summary[0]) |
147 | - # size must be known |
148 | - self.assertEqual(22, summary[1]) |
149 | + self.check_content_summary_size(tree, summary, 22) |
150 | # not executable |
151 | if osutils.supports_executable: |
152 | self.assertEqual(False, summary[2]) |
153 | |
154 | === modified file 'bzrlib/tests/per_workingtree/test_content_filters.py' |
155 | --- bzrlib/tests/per_workingtree/test_content_filters.py 2009-07-10 07:14:02 +0000 |
156 | +++ bzrlib/tests/per_workingtree/test_content_filters.py 2009-08-26 06:35:39 +0000 |
157 | @@ -43,6 +43,25 @@ |
158 | return _converter_helper(chunks, 'lower') |
159 | |
160 | |
161 | +_trailer_string = '\nend string\n' |
162 | + |
163 | + |
164 | +def _append_text(chunks, context=None): |
165 | + """A content filter that appends a string to the end of the file. |
166 | + |
167 | + This tests filters that change the length.""" |
168 | + return chunks + [_trailer_string] |
169 | + |
170 | + |
171 | +def _remove_appended_text(chunks, context=None): |
172 | + """Remove the appended text.""" |
173 | + |
174 | + text = ''.join(chunks) |
175 | + if text.endswith(_trailer_string): |
176 | + text = text[:-len(_trailer_string)] |
177 | + return [text] |
178 | + |
179 | + |
180 | class TestWorkingTreeWithContentFilters(TestCaseWithWorkingTree): |
181 | |
182 | def create_cf_tree(self, txt_reader, txt_writer, dir='.'): |
183 | @@ -159,3 +178,34 @@ |
184 | self.assertFileEqual("fOO tXT", 'target/file1.txt') |
185 | changes = target.changes_from(source.basis_tree()) |
186 | self.assertFalse(changes.has_changed()) |
187 | + |
188 | + def test_path_content_summary(self): |
189 | + """path_content_summary should always talk about the canonical form.""" |
190 | + # see https://bugs.edge.launchpad.net/bzr/+bug/415508 |
191 | + # |
192 | + # set up a tree where the canonical form has a string added to the |
193 | + # end |
194 | + source, txt_fileid, bin_fileid = self.create_cf_tree( |
195 | + txt_reader=_append_text, |
196 | + txt_writer=_remove_appended_text, |
197 | + dir='source') |
198 | + if not source.supports_content_filtering(): |
199 | + return |
200 | + source.lock_read() |
201 | + self.addCleanup(source.unlock) |
202 | + |
203 | + expected_canonical_form = 'Foo Txt\nend string\n' |
204 | + self.assertEquals(source.get_file(txt_fileid, filtered=True).read(), |
205 | + expected_canonical_form) |
206 | + self.assertEquals(source.get_file(txt_fileid, filtered=False).read(), |
207 | + 'Foo Txt') |
208 | + |
209 | + # results are: kind, size, executable, sha1_or_link_target |
210 | + result = source.path_content_summary('file1.txt') |
211 | + |
212 | + self.assertEquals(result, |
213 | + ('file', None, False, None)) |
214 | + |
215 | + # we could give back the length of the canonical form, but in general |
216 | + # that will be expensive to compute, so it's acceptable to just return |
217 | + # None. |
218 | |
219 | === modified file 'bzrlib/tree.py' |
220 | --- bzrlib/tree.py 2009-07-17 06:04:35 +0000 |
221 | +++ bzrlib/tree.py 2009-08-26 06:35:39 +0000 |
222 | @@ -218,10 +218,14 @@ |
223 | def path_content_summary(self, path): |
224 | """Get a summary of the information about path. |
225 | |
226 | + All the attributes returned are for the canonical form, not the |
227 | + convenient form (if content filters are in use.) |
228 | + |
229 | :param path: A relative path within the tree. |
230 | :return: A tuple containing kind, size, exec, sha1-or-link. |
231 | Kind is always present (see tree.kind()). |
232 | - size is present if kind is file, None otherwise. |
233 | + size is present if kind is file and the size of the |
234 | + canonical form can be cheaply determined, None otherwise. |
235 | exec is None unless kind is file and the platform supports the 'x' |
236 | bit. |
237 | sha1-or-link is the link target if kind is symlink, or the sha1 if |
238 | |
239 | === modified file 'bzrlib/workingtree.py' |
240 | --- bzrlib/workingtree.py 2009-08-04 04:36:34 +0000 |
241 | +++ bzrlib/workingtree.py 2009-08-26 06:35:39 +0000 |
242 | @@ -613,6 +613,8 @@ |
243 | |
244 | def get_file_size(self, file_id): |
245 | """See Tree.get_file_size""" |
246 | + # XXX: this returns the on-disk size; it should probably return the |
247 | + # canonical size |
248 | try: |
249 | return os.path.getsize(self.id2abspath(file_id)) |
250 | except OSError, e: |
251 | @@ -749,11 +751,7 @@ |
252 | raise |
253 | kind = _mapper(stat_result.st_mode) |
254 | if kind == 'file': |
255 | - size = stat_result.st_size |
256 | - # try for a stat cache lookup |
257 | - executable = self._is_executable_from_path_and_stat(path, stat_result) |
258 | - return (kind, size, executable, self._sha_from_stat( |
259 | - path, stat_result)) |
260 | + return self._file_content_summary(path, stat_result) |
261 | elif kind == 'directory': |
262 | # perhaps it looks like a plain directory, but it's really a |
263 | # reference. |
264 | @@ -766,6 +764,13 @@ |
265 | else: |
266 | return (kind, None, None, None) |
267 | |
268 | + def _file_content_summary(self, path, stat_result): |
269 | + size = stat_result.st_size |
270 | + executable = self._is_executable_from_path_and_stat(path, stat_result) |
271 | + # try for a stat cache lookup |
272 | + return ('file', size, executable, self._sha_from_stat( |
273 | + path, stat_result)) |
274 | + |
275 | def _check_parents_for_ghosts(self, revision_ids, allow_leftmost_as_ghost): |
276 | """Common ghost checking functionality from set_parent_*. |
277 | |
278 | |
279 | === modified file 'bzrlib/workingtree_4.py' |
280 | --- bzrlib/workingtree_4.py 2009-08-04 11:25:23 +0000 |
281 | +++ bzrlib/workingtree_4.py 2009-08-26 06:35:39 +0000 |
282 | @@ -1300,6 +1300,32 @@ |
283 | return statvalue, sha1 |
284 | |
285 | |
286 | +class ContentFilteringDirStateWorkingTree(DirStateWorkingTree): |
287 | + """Dirstate working tree that supports content filtering. |
288 | + |
289 | + The dirstate holds the hash and size of the canonical form of the file, |
290 | + and most methods must return that. |
291 | + """ |
292 | + |
293 | + def _file_content_summary(self, path, stat_result): |
294 | + # This is to support the somewhat obsolete path_content_summary method |
295 | + # with content filtering: see |
296 | + # <https://bugs.edge.launchpad.net/bzr/+bug/415508>. |
297 | + # |
298 | + # If the dirstate cache is up to date and knows the hash and size, |
299 | + # return that. |
300 | + # Otherwise if there are no content filters, return the on-disk size |
301 | + # and leave the hash blank. |
302 | + # Otherwise, read and filter the on-disk file and use its size and |
303 | + # hash. |
304 | + # |
305 | + # The dirstate doesn't store the size of the canonical form so we |
306 | + # can't trust it for content-filtered trees. We just return None. |
307 | + dirstate_sha1 = self._dirstate.sha1_from_stat(path, stat_result) |
308 | + executable = self._is_executable_from_path_and_stat(path, stat_result) |
309 | + return ('file', None, executable, dirstate_sha1) |
310 | + |
311 | + |
312 | class WorkingTree4(DirStateWorkingTree): |
313 | """This is the Format 4 working tree. |
314 | |
315 | @@ -1313,7 +1339,7 @@ |
316 | """ |
317 | |
318 | |
319 | -class WorkingTree5(DirStateWorkingTree): |
320 | +class WorkingTree5(ContentFilteringDirStateWorkingTree): |
321 | """This is the Format 5 working tree. |
322 | |
323 | This differs from WorkingTree4 by: |
324 | @@ -1323,7 +1349,7 @@ |
325 | """ |
326 | |
327 | |
328 | -class WorkingTree6(DirStateWorkingTree): |
329 | +class WorkingTree6(ContentFilteringDirStateWorkingTree): |
330 | """This is the Format 6 working tree. |
331 | |
332 | This differs from WorkingTree5 by: |
333 | @@ -1550,7 +1576,11 @@ |
334 | |
335 | |
336 | class DirStateRevisionTree(Tree): |
337 | - """A revision tree pulling the inventory from a dirstate.""" |
338 | + """A revision tree pulling the inventory from a dirstate. |
339 | + |
340 | + Note that this is one of the historical (ie revision) trees cached in the |
341 | + dirstate for easy access, not the workingtree. |
342 | + """ |
343 | |
344 | def __init__(self, dirstate, revision_id, repository): |
345 | self._dirstate = dirstate |
fix for https:/ /bugs.edge. launchpad. net/bzr/ +bug/415508:
* when getting the contents_summary on a file, don't return the size because it's not needed and it's not cheap to compute for content-filtered trees