1
=== modified file 'lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py'
2
--- lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py	2010-08-20 20:31:18 +0000
3
+++ lib/canonical/launchpad/scripts/tests/test_librarian_apache_log_parser.py	2010-08-30 19:47:43 +0000
4
@@ -102,7 +102,7 @@
5
102
            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
102
            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
6
103
            '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
103
            '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
7
104
            '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
104
            '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
9
105
        downloads, parsed_bytes = parse_file(
105
        downloads, parsed_bytes, ignored = parse_file(
10
106
            fd, start_position=0, logger=self.logger,
106
            fd, start_position=0, logger=self.logger,
11
107
            get_download_key=get_library_file_id)
107
            get_download_key=get_library_file_id)
12
108
        self.assertEqual(
108
        self.assertEqual(
13
@@ -121,7 +121,7 @@
14
121
        fd = StringIO(
121
        fd = StringIO(
15
122
            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
122
            '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
16
123
            '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
123
            '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
18
124
        downloads, parsed_bytes = parse_file(
124
        downloads, parsed_bytes, ignored = parse_file(
19
125
            fd, start_position=0, logger=self.logger,
125
            fd, start_position=0, logger=self.logger,
20
126
            get_download_key=get_library_file_id)
126
            get_download_key=get_library_file_id)
21
127
        self.assertEqual(
127
        self.assertEqual(
22
128
128
23
=== modified file 'lib/lp/services/apachelogparser/base.py'
24
--- lib/lp/services/apachelogparser/base.py	2010-08-27 18:41:01 +0000
25
+++ lib/lp/services/apachelogparser/base.py	2010-08-30 19:47:43 +0000
26
@@ -77,9 +77,14 @@
27
77
    return fd, file_size
77
    return fd, file_size
28
78
78
29
79
79
31
80
def parse_file(fd, start_position, logger, get_download_key):
80
def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
32
81
    """Parse the given file starting on the given position.
81
    """Parse the given file starting on the given position.
33
82
82
34
83
    parsed_lines accepts the number of lines that have been parsed during
35
84
    previous calls to this function so they can be taken into account against
36
85
    max_parsed_lines.  The total number of parsed lines is then returned so it
37
86
    can be passed back to future calls to this function.
38
87
39
83
    Return a dictionary mapping file_ids (from the librarian) to days to
88
    Return a dictionary mapping file_ids (from the librarian) to days to
40
84
    countries to number of downloads.
89
    countries to number of downloads.
41
85
    """
90
    """
42
@@ -91,7 +96,6 @@
43
91
96
44
92
    geoip = getUtility(IGeoIP)
97
    geoip = getUtility(IGeoIP)
45
93
    downloads = {}
98
    downloads = {}
46
94
    parsed_lines = 0
47
95
99
48
96
    # Check for an optional max_parsed_lines config option.
100
    # Check for an optional max_parsed_lines config option.
49
97
    max_parsed_lines = getattr(
101
    max_parsed_lines = getattr(
50
@@ -167,7 +171,7 @@
51
167
        logger.info('Parsed %d lines resulting in %d download stats.' % (
171
        logger.info('Parsed %d lines resulting in %d download stats.' % (
52
168
            parsed_lines, len(downloads)))
172
            parsed_lines, len(downloads)))
53
169
173
55
170
    return downloads, parsed_bytes
174
    return downloads, parsed_bytes, parsed_lines
56
171
175
57
172
176
58
173
def create_or_update_parsedlog_entry(first_line, parsed_bytes):
177
def create_or_update_parsedlog_entry(first_line, parsed_bytes):
59
174
178
60
=== modified file 'lib/lp/services/apachelogparser/script.py'
61
--- lib/lp/services/apachelogparser/script.py	2010-08-27 18:41:01 +0000
62
+++ lib/lp/services/apachelogparser/script.py	2010-08-30 19:47:43 +0000
63
@@ -8,6 +8,7 @@
64
8
8
65
9
from zope.component import getUtility
9
from zope.component import getUtility
66
10
10
67
11
from canonical.config import config
68
11
from lp.app.errors import NotFoundError
12
from lp.app.errors import NotFoundError
69
12
from lp.services.apachelogparser.base import (
13
from lp.services.apachelogparser.base import (
70
13
    create_or_update_parsedlog_entry,
14
    create_or_update_parsedlog_entry,
71
@@ -65,8 +66,15 @@
72
65
66
73
66
        self.setUpUtilities()
67
        self.setUpUtilities()
74
67
        country_set = getUtility(ICountrySet)
68
        country_set = getUtility(ICountrySet)
75
69
        parsed_lines = 0
76
70
        max_parsed_lines = getattr(
77
71
            config.launchpad, 'logparser_max_parsed_lines', None)
78
72
        max_is_set = max_parsed_lines is not None
79
68
        for fd, position in files_to_parse:
73
        for fd, position in files_to_parse:
81
69
            downloads, parsed_bytes = parse_file(
74
            # If we've used up our budget of lines to process, stop.
82
75
            if (max_is_set and parsed_lines >= max_parsed_lines):
83
76
                break
84
77
            downloads, parsed_bytes, parsed_lines = parse_file(
85
70
                fd, position, self.logger, self.getDownloadKey)
78
                fd, position, self.logger, self.getDownloadKey)
86
71
            # Use a while loop here because we want to pop items from the dict
79
            # Use a while loop here because we want to pop items from the dict
87
72
            # in order to free some memory as we go along. This is a good
80
            # in order to free some memory as we go along. This is a good
88
73
81
89
=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
90
--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py	2010-08-27 18:41:01 +0000
91
+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py	2010-08-30 19:47:43 +0000
92
@@ -6,7 +6,6 @@
93
6
import os
6
import os
94
7
from StringIO import StringIO
7
from StringIO import StringIO
95
8
import tempfile
8
import tempfile
96
9
import textwrap
97
10
from operator import itemgetter
9
from operator import itemgetter
98
11
import unittest
10
import unittest
99
12
11
100
@@ -125,17 +124,18 @@
101
125
124
102
126
    def test_parsing(self):
125
    def test_parsing(self):
103
127
        # The parse_file() function returns a tuple containing a dict (mapping
126
        # The parse_file() function returns a tuple containing a dict (mapping
106
128
        # days and library file IDs to number of downloads) and the total
127
        # days and library file IDs to number of downloads), the total number
107
129
        # number of bytes that have been parsed from this file.  In our sample
128
        # of bytes that have been parsed from this file, and the running total
108
129
        # of lines parsed (for testing against the maximum).  In our sample
109
130
        # log, the file with ID 8196569 has been downloaded twice (once from
130
        # log, the file with ID 8196569 has been downloaded twice (once from
113
131
        # Argentina and once from Japan) and the files with ID 12060796
131
        # Argentina and once from Japan) and the files with ID 12060796 and
114
132
        # and 9096290 have been downloaded once.  The file with ID 15018215
132
        # 9096290 have been downloaded once.  The file with ID 15018215 has
115
133
        # has also been downloaded once (last line of the sample log), but
133
        # also been downloaded once (last line of the sample log), but
116
134
        # parse_file() always skips the last line as it may be truncated, so
134
        # parse_file() always skips the last line as it may be truncated, so
117
135
        # it doesn't show up in the dict returned.
135
        # it doesn't show up in the dict returned.
118
136
        fd = open(os.path.join(
136
        fd = open(os.path.join(
119
137
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
137
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
121
138
        downloads, parsed_bytes = parse_file(
138
        downloads, parsed_bytes, parsed_lines = parse_file(
122
139
            fd, start_position=0, logger=self.logger,
139
            fd, start_position=0, logger=self.logger,
123
140
            get_download_key=get_path_download_key)
140
            get_download_key=get_path_download_key)
124
141
        self.assertEqual(
141
        self.assertEqual(
125
@@ -159,7 +159,7 @@
126
159
        # line without worrying about whether or not it's been truncated.
159
        # line without worrying about whether or not it's been truncated.
127
160
        fd = open(os.path.join(
160
        fd = open(os.path.join(
128
161
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
161
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
130
162
        downloads, parsed_bytes = parse_file(
162
        downloads, parsed_bytes, parsed_lines = parse_file(
131
163
            fd, start_position=self._getLastLineStart(fd), logger=self.logger,
163
            fd, start_position=self._getLastLineStart(fd), logger=self.logger,
132
164
            get_download_key=get_path_download_key)
164
            get_download_key=get_path_download_key)
133
165
        self.assertEqual(
165
        self.assertEqual(
134
@@ -177,7 +177,7 @@
135
177
        # parsed up to the line before the one where the failure occurred.
177
        # parsed up to the line before the one where the failure occurred.
136
178
        # Here we force an unexpected error on the first line.
178
        # Here we force an unexpected error on the first line.
137
179
        fd = StringIO('Not a log')
179
        fd = StringIO('Not a log')
139
180
        downloads, parsed_bytes = parse_file(
180
        downloads, parsed_bytes, parsed_lines = parse_file(
140
181
            fd, start_position=0, logger=self.logger,
181
            fd, start_position=0, logger=self.logger,
141
182
            get_download_key=get_path_download_key)
182
            get_download_key=get_path_download_key)
142
183
        self.assertIn('Error', self.logger.buffer.getvalue())
183
        self.assertIn('Error', self.logger.buffer.getvalue())
143
@@ -188,7 +188,7 @@
144
188
        """Assert that responses with the given status are ignored."""
188
        """Assert that responses with the given status are ignored."""
145
189
        fd = StringIO(
189
        fd = StringIO(
146
190
            self.sample_line % dict(status=status, method='GET'))
190
            self.sample_line % dict(status=status, method='GET'))
148
191
        downloads, parsed_bytes = parse_file(
191
        downloads, parsed_bytes, parsed_lines = parse_file(
149
192
            fd, start_position=0, logger=self.logger,
192
            fd, start_position=0, logger=self.logger,
150
193
            get_download_key=get_path_download_key)
193
            get_download_key=get_path_download_key)
151
194
        self.assertEqual(
194
        self.assertEqual(
152
@@ -213,7 +213,7 @@
153
213
        """Assert that requests with the given method are ignored."""
213
        """Assert that requests with the given method are ignored."""
154
214
        fd = StringIO(
214
        fd = StringIO(
155
215
            self.sample_line % dict(status='200', method=method))
215
            self.sample_line % dict(status='200', method=method))
157
216
        downloads, parsed_bytes = parse_file(
216
        downloads, parsed_bytes, parsed_lines = parse_file(
158
217
            fd, start_position=0, logger=self.logger,
217
            fd, start_position=0, logger=self.logger,
159
218
            get_download_key=get_path_download_key)
218
            get_download_key=get_path_download_key)
160
219
        self.assertEqual(
219
        self.assertEqual(
161
@@ -231,7 +231,7 @@
162
231
    def test_normal_request_is_not_ignored(self):
231
    def test_normal_request_is_not_ignored(self):
163
232
        fd = StringIO(
232
        fd = StringIO(
164
233
            self.sample_line % dict(status=200, method='GET'))
233
            self.sample_line % dict(status=200, method='GET'))
166
234
        downloads, parsed_bytes = parse_file(
234
        downloads, parsed_bytes, parsed_lines = parse_file(
167
235
            fd, start_position=0, logger=self.logger,
235
            fd, start_position=0, logger=self.logger,
168
236
            get_download_key=get_path_download_key)
236
            get_download_key=get_path_download_key)
169
237
        self.assertEqual(
237
        self.assertEqual(
170
@@ -249,22 +249,20 @@
171
249
        # lines.
249
        # lines.
172
250
        config.push(
250
        config.push(
173
251
            'log_parser config',
251
            'log_parser config',
178
252
            textwrap.dedent('''\
252
            '[launchpad]\nlogparser_max_parsed_lines: 2')
175
253
                [launchpad]
176
254
                logparser_max_parsed_lines: 2
177
255
                '''))
179
256
        self.addCleanup(config.pop, 'log_parser config')
253
        self.addCleanup(config.pop, 'log_parser config')
180
257
        fd = open(os.path.join(
254
        fd = open(os.path.join(
181
258
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
255
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
182
259
        self.addCleanup(fd.close)
256
        self.addCleanup(fd.close)
183
260
257
185
261
        downloads, parsed_bytes = parse_file(
258
        downloads, parsed_bytes, parsed_lines = parse_file(
186
262
            fd, start_position=0, logger=self.logger,
259
            fd, start_position=0, logger=self.logger,
187
263
            get_download_key=get_path_download_key)
260
            get_download_key=get_path_download_key)
188
264
261
189
265
        # We have initially parsed only the first two lines of data,
262
        # We have initially parsed only the first two lines of data,
190
266
        # corresponding to one download (the first line is a 404 and
263
        # corresponding to one download (the first line is a 404 and
191
267
        # so ignored).
264
        # so ignored).
192
265
        self.assertEqual(parsed_lines, 2)
193
268
        date = datetime(2008, 6, 13)
266
        date = datetime(2008, 6, 13)
194
269
        self.assertContentEqual(
267
        self.assertContentEqual(
195
270
            downloads.items(),
268
            downloads.items(),
196
@@ -276,7 +274,7 @@
197
276
274
198
277
        # And the subsequent parse will be for the 3rd and 4th lines,
275
        # And the subsequent parse will be for the 3rd and 4th lines,
199
278
        # corresponding to two downloads of the same file.
276
        # corresponding to two downloads of the same file.
201
279
        downloads, parsed_bytes = parse_file(
277
        downloads, parsed_bytes, parsed_lines = parse_file(
202
280
            fd, start_position=parsed_bytes, logger=self.logger,
278
            fd, start_position=parsed_bytes, logger=self.logger,
203
281
            get_download_key=get_path_download_key)
279
            get_download_key=get_path_download_key)
204
282
        self.assertContentEqual(
280
        self.assertContentEqual(
205
@@ -285,6 +283,45 @@
206
285
             ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
283
             ('/8196569/mediumubuntulogo.png', {date: {'AR': 1}})])
207
286
        self.assertEqual(parsed_bytes, sum(line_lengths[:4]))
284
        self.assertEqual(parsed_bytes, sum(line_lengths[:4]))
208
287
285
209
286
    def test_max_parsed_lines_exceeded(self):
210
287
        # Show that if a non-zero parsed_lines is passed in, the number of
211
288
        # lines parsed will be less than it would otherwise have been.
212
289
213
290
        # The max_parsed_lines config option limits the number of parsed
214
291
        # lines.
215
292
        config.push(
216
293
            'log_parser config',
217
294
            '[launchpad]\nlogparser_max_parsed_lines: 2')
218
295
        self.addCleanup(config.pop, 'log_parser config')
219
296
        fd = open(os.path.join(
220
297
            here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
221
298
        self.addCleanup(fd.close)
222
299
223
300
        # We want to start parsing on line 2 so we will have a value in
224
301
        # "downloads" to make a positive assertion about.  (The first line is
225
302
        # a 404 so wouldn't generate any output.)
226
303
        start_position = len(fd.readline())
227
304
228
305
        # If we have already parsed some lines, then the number of lines
229
306
        # parsed will be passed in (parsed_lines argument) and parse_file will
230
307
        # take that number into account when determining if the maximum number
231
308
        # of lines to parse has been reached.
232
309
        parsed_lines = 1
233
310
        downloads, parsed_bytes, parsed_lines = parse_file(
234
311
            fd, start_position=start_position, logger=self.logger,
235
312
            get_download_key=get_path_download_key, parsed_lines=parsed_lines)
236
313
237
314
        # The total number of lines parsed during the run (1 line) plus the
238
315
        # number of lines parsed previously (1 line, as passed in via
239
316
        # parsed_lines) is returned.
240
317
        self.assertEqual(parsed_lines, 2)
241
318
        # Since we told parse_file that we had already parsed 1 line and the
242
319
        # limit is 2 lines, it only parsed a single line.
243
320
        date = datetime(2008, 6, 13)
244
321
        self.assertContentEqual(
245
322
            downloads.items(),
246
323
            [('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
247
324
248
288
325
249
289
class TestParsedFilesDetection(TestCase):
326
class TestParsedFilesDetection(TestCase):
250
290
    """Test the detection of already parsed logs."""
327
    """Test the detection of already parsed logs."""
Reviewer	Review Type	Date Requested	Status
Henning Eggers (community)	code	2010-08-30	Approve on 2010-08-30
Review via email: mp+34077@code.launchpad.net