-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 Hi Benji, thank you for this nice fix. It is basically good to land after you have considered a few comments of mine. review approve code Cheers, Henning Am 30.08.2010 17:03, schrieb Benji York: > > === modified file 'lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py' > --- lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py 2010-08-20 20:31:18 +0000 > +++ lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py 2010-08-30 15:03:44 +0000 > @@ -102,7 +102,7 @@ > '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET ' > '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 ' > '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"') > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( I am not sure "ignored" is the best choice here. I wondered at first if it means "ignored_bytes" or "ignored_lines". Would it hurt to call it what it is ("parsed_lines") and then simply ignore it? There are multiple uses of "ignored" in here, as you know. ;-) > fd, start_position=0, logger=self.logger, > get_download_key=get_library_file_id) > self.assertEqual( > @@ -121,7 +121,7 @@ > fd = StringIO( > '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" ' > '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"') > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_library_file_id) > self.assertEqual( > > === modified file 'lib/lp/services/apachelogparser/base.py' > --- lib/lp/services/apachelogparser/base.py 2010-08-27 18:41:01 +0000 > +++ lib/lp/services/apachelogparser/base.py 2010-08-30 15:03:44 +0000 Very nice. Thank you. [...] > === modified file 'lib/lp/services/apachelogparser/script.py' > --- lib/lp/services/apachelogparser/script.py 2010-08-27 18:41:01 +0000 > +++ lib/lp/services/apachelogparser/script.py 2010-08-30 15:03:44 +0000 > @@ -8,6 +8,7 @@ > > from zope.component import getUtility > > +from canonical.config import config > from lp.app.errors import NotFoundError > from lp.services.apachelogparser.base import ( > create_or_update_parsedlog_entry, > @@ -65,8 +66,15 @@ > > self.setUpUtilities() > country_set = getUtility(ICountrySet) > + parsed_lines = 0 > + max_parsed_lines = getattr( > + config.launchpad, 'logparser_max_parsed_lines', None) > for fd, position in files_to_parse: > - downloads, parsed_bytes = parse_file( > + # If we've used up our budget of lines to process, stop. > + if (max_parsed_lines is not None > + and parsed_lines >= max_parsed_lines): This line must be indented and the "and" should probably go on the previous line but multi-line "if" conditions are always bad to read because the dangling lines align with the body of the statement. Storing the boolean in a variable first and checking that in the statement often works better. > + break > + downloads, parsed_bytes, parsed_lines = parse_file( > fd, position, self.logger, self.getDownloadKey) > # Use a while loop here because we want to pop items from the dict > # in order to free some memory as we go along. This is a good > > === modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py' > --- lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-08-27 18:41:01 +0000 > +++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-08-30 15:03:44 +0000 > @@ -6,7 +6,6 @@ > import os > from StringIO import StringIO > import tempfile > -import textwrap > from operator import itemgetter > import unittest > > @@ -125,17 +124,18 @@ > > def test_parsing(self): > # The parse_file() function returns a tuple containing a dict (mapping > - # days and library file IDs to number of downloads) and the total > - # number of bytes that have been parsed from this file. In our sample > + # days and library file IDs to number of downloads), the total number > + # of bytes that have been parsed from this file, and the running total > + # of lines parsed (for testing against the maximum). In our sample > # log, the file with ID 8196569 has been downloaded twice (once from > - # Argentina and once from Japan) and the files with ID 12060796 > - # and 9096290 have been downloaded once. The file with ID 15018215 > - # has also been downloaded once (last line of the sample log), but > + # Argentina and once from Japan) and the files with ID 12060796 and > + # 9096290 have been downloaded once. The file with ID 15018215 has > + # also been downloaded once (last line of the sample log), but > # parse_file() always skips the last line as it may be truncated, so > # it doesn't show up in the dict returned. > fd = open(os.path.join( > here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_path_download_key) > self.assertEqual( > @@ -159,7 +159,7 @@ > # line without worrying about whether or not it's been truncated. > fd = open(os.path.join( > here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=self._getLastLineStart(fd), logger=self.logger, > get_download_key=get_path_download_key) > self.assertEqual( > @@ -177,7 +177,7 @@ > # parsed up to the line before the one where the failure occurred. > # Here we force an unexpected error on the first line. > fd = StringIO('Not a log') > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_path_download_key) > self.assertIn('Error', self.logger.buffer.getvalue()) > @@ -188,7 +188,7 @@ > """Assert that responses with the given status are ignored.""" > fd = StringIO( > self.sample_line % dict(status=status, method='GET')) > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_path_download_key) > self.assertEqual( > @@ -213,7 +213,7 @@ > """Assert that requests with the given method are ignored.""" > fd = StringIO( > self.sample_line % dict(status='200', method=method)) > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_path_download_key) > self.assertEqual( > @@ -231,7 +231,7 @@ > def test_normal_request_is_not_ignored(self): > fd = StringIO( > self.sample_line % dict(status=200, method='GET')) > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_path_download_key) > self.assertEqual( > @@ -249,22 +249,20 @@ > # lines. > config.push( > 'log_parser config', > - textwrap.dedent('''\ > - [launchpad] > - logparser_max_parsed_lines: 2 > - ''')) > + '[launchpad]\nlogparser_max_parsed_lines: 2') > self.addCleanup(config.pop, 'log_parser config') > fd = open(os.path.join( > here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) > self.addCleanup(fd.close) > > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, parsed_lines = parse_file( > fd, start_position=0, logger=self.logger, > get_download_key=get_path_download_key) > > # We have initially parsed only the first two lines of data, > # corresponding to one download (the first line is a 404 and > # so ignored). > + self.assertEqual(parsed_lines, 2) > date = datetime(2008, 6, 13) > self.assertContentEqual( > downloads.items(), > @@ -276,7 +274,7 @@ > > # And the subsequent parse will be for the 3rd and 4th lines, > # corresponding to two downloads of the same file. > - downloads, parsed_bytes = parse_file( > + downloads, parsed_bytes, ignored = parse_file( > fd, start_position=parsed_bytes, logger=self.logger, > get_download_key=get_path_download_key) > self.assertContentEqual( > @@ -285,6 +283,33 @@ > ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})]) > self.assertEqual(parsed_bytes, sum(line_lengths[:4])) > > + def test_max_parsed_lines_exceeded(self): > + # The max_parsed_lines config option limits the number of parsed > + # lines. > + config.push( > + 'log_parser config', > + '[launchpad]\nlogparser_max_parsed_lines: 2') > + self.addCleanup(config.pop, 'log_parser config') > + fd = open(os.path.join( > + here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) > + self.addCleanup(fd.close) > + > + # If we have already parsed some lines, then the number of lines > + # parsed will be passed in (parsed_lines argument) and parse_file will > + # take that number into account when determining if the maximum number > + # of lines to parse has been reached. Hm, this is a functional test and not documentation. This description should probably be found in the docstring of parse_file, should it not? If found there, it need not be explained in such detail here. > + parsed_lines = 1 > + downloads, parsed_bytes, parsed_lines = parse_file( > + fd, start_position=0, logger=self.logger, > + get_download_key=get_path_download_key, parsed_lines=parsed_lines) > + > + # Since we told parse_file that we had already parsed 1 line and the > + # limit is 2 lines, it only parsed the first line of the file which is > + # a 404, so there were no downloads recorded. The 404 is fairly irrelevant to the test. If you want to check that the parser stopped at the right place, you will need a downloaded item right before the limit and one right after the limit and check for those. You could construct the two lines in a string right here in the test and use StringIO to feed it to parse_file. That would be more explicit and independent of sample data. > + self.assertEqual(parsed_lines, 2) > + self.assertContentEqual(downloads.items(), []) See comment above. > + fd.seek(0) Is that really needed at the end? > + > > class TestParsedFilesDetection(TestCase): > """Test the detection of already parsed logs.""" -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.10 (GNU/Linux) Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org/ iEYEARECAAYFAkx70WEACgkQBT3oW1L17ihhTgCgxk5M1HxDrcwHIcu0Ti5BZbol nBMAoN8mC+OwGVO2gVNWi/ddcjvLYB+Q =R5BP -----END PGP SIGNATURE-----