Merge lp:~lifeless/launchpad/foundations into lp:launchpad
- foundations
- Merge into devel
Status: | Merged |
---|---|
Approved by: | Stuart Bishop |
Approved revision: | no longer in the source branch. |
Merged at revision: | 11396 |
Proposed branch: | lp:~lifeless/launchpad/foundations |
Merge into: | lp:launchpad |
Diff against target: |
440 lines (+158/-73) 2 files modified
lib/lp/scripts/utilities/pageperformancereport.py (+154/-68) utilities/page-performance-report-daily.sh (+4/-5) |
To merge this branch: | bzr merge lp:~lifeless/launchpad/foundations |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Stuart Bishop (community) | Approve | ||
Review via email: mp+32299@code.launchpad.net |
Commit message
Page performance report improvements - top N urls and 99th percentile
Description of the change
Generate a page performance report page that will (hopefully) be a useful hit-list for developers to focus on.
Stuart Bishop (stub) wrote : | # |
Robert Collins (lifeless) wrote : | # |
Thanks Stuart; the width of 10 for the candidates-timeout graphs is a
little weird; I think it should take the same upper bound - or even a
higher one of the max time in the dataset.
-Rob
Robert Collins (lifeless) wrote : | # |
Oh, and I was a muppet, setting the timeout -2 *default* to 10, should
have set it to 12, the staging timeout, which would cut off
appropriately at 10. Mea culpa.
Robert Collins (lifeless) wrote : | # |
Stuart, I've updated this with a patch to raise the default timeout to
12 (staging's current timeout) and to separate out the
graph capping (50% above timeout: we regularly go up to this or
higher due to the way the timeouts work) and the standard deviation
calculations: our numeric stats were being distorted by the
calculations to make the histograms nice.
Thanks,
Rob
Stuart Bishop (stub) wrote : | # |
Looks fine and is running happily on devpad
Preview Diff
1 | === modified file 'lib/lp/scripts/utilities/pageperformancereport.py' |
2 | --- lib/lp/scripts/utilities/pageperformancereport.py 2010-07-08 11:44:02 +0000 |
3 | +++ lib/lp/scripts/utilities/pageperformancereport.py 2010-08-12 05:59:44 +0000 |
4 | @@ -6,15 +6,13 @@ |
5 | __metaclass__ = type |
6 | __all__ = ['main'] |
7 | |
8 | -import bz2 |
9 | from cgi import escape as html_quote |
10 | from ConfigParser import RawConfigParser |
11 | from datetime import datetime |
12 | -import gzip |
13 | import re |
14 | import sre_constants |
15 | -from tempfile import TemporaryFile |
16 | import os.path |
17 | +import subprocess |
18 | from textwrap import dedent |
19 | import time |
20 | |
21 | @@ -84,6 +82,7 @@ |
22 | median = 0 # Median time per hit. |
23 | std = 0 # Standard deviation per hit. |
24 | var = 0 # Variance per hit. |
25 | + ninetyninth_percentile_time = 0 |
26 | histogram = None # # Request times histogram. |
27 | |
28 | total_sqltime = 0 # Total time spent waiting for SQL to process. |
29 | @@ -98,27 +97,31 @@ |
30 | std_sqlstatements = 0 |
31 | var_sqlstatements = 0 |
32 | |
33 | +empty_stats = Stats() # Singleton. |
34 | + |
35 | |
36 | class Times: |
37 | """Collection of request times.""" |
38 | def __init__(self, timeout): |
39 | - self.spool = TemporaryFile() |
40 | + self.total_hits = 0 |
41 | + self.total_time = 0 |
42 | self.request_times = [] |
43 | self.sql_statements = [] |
44 | self.sql_times = [] |
45 | self.ticks = [] |
46 | - self.timeout = timeout |
47 | + self.histogram_width = int(1.5*timeout) |
48 | |
49 | def add(self, request): |
50 | - """Add the application time from the request to the collection. |
51 | - |
52 | - The application time is capped to our timeout. |
53 | - """ |
54 | - print >> self.spool, "%s,%s,%s,%s" % ( |
55 | - min(request.app_seconds, self.timeout), |
56 | - request.sql_statements or '', |
57 | - request.sql_seconds or '', |
58 | - request.ticks or '') |
59 | + """Add the application time from the request to the collection.""" |
60 | + self.total_hits += 1 |
61 | + self.total_time += request.app_seconds |
62 | + self.request_times.append(request.app_seconds) |
63 | + if request.sql_statements is not None: |
64 | + self.sql_statements.append(request.sql_statements) |
65 | + if request.sql_seconds is not None: |
66 | + self.sql_times.append(request.sql_seconds) |
67 | + if request.ticks is not None: |
68 | + self.ticks.append(request.ticks) |
69 | |
70 | _stats = None |
71 | |
72 | @@ -133,58 +136,56 @@ |
73 | 1 and 2 seconds etc. histogram is None if there are no requests in |
74 | this Category. |
75 | """ |
76 | + if not self.total_hits: |
77 | + return empty_stats |
78 | + |
79 | if self._stats is not None: |
80 | return self._stats |
81 | |
82 | - def iter_spool(index, cast): |
83 | - """Generator returning one column from our spool file. |
84 | - |
85 | - Skips None values. |
86 | - """ |
87 | - self.spool.flush() |
88 | - self.spool.seek(0) |
89 | - for line in self.spool: |
90 | - value = line.split(',')[index] |
91 | - if value != '': |
92 | - yield cast(value) |
93 | - |
94 | stats = Stats() |
95 | |
96 | + stats.total_hits = self.total_hits |
97 | + |
98 | # Time stats |
99 | - array = numpy.fromiter(iter_spool(0, numpy.float32), numpy.float32) |
100 | + array = numpy.asarray(self.request_times, numpy.float32) |
101 | stats.total_time = numpy.sum(array) |
102 | - stats.total_hits = len(array) |
103 | stats.mean = numpy.mean(array) |
104 | stats.median = numpy.median(array) |
105 | stats.std = numpy.std(array) |
106 | stats.var = numpy.var(array) |
107 | + # This is an approximation which may not be true: we don't know if we |
108 | + # have a std distribution or not. We could just find the 99th |
109 | + # percentile by counting. Shock. Horror; however this appears pretty |
110 | + # good based on eyeballing things so far - once we're down in the 2-3 |
111 | + # second range for everything we may want to revisit. |
112 | + stats.ninetyninth_percentile_time = stats.mean + stats.std*3 |
113 | + capped_times = (min(a_time, self.histogram_width) for a_time in |
114 | + self.request_times) |
115 | + array = numpy.fromiter(capped_times, numpy.float32, |
116 | + len(self.request_times)) |
117 | histogram = numpy.histogram( |
118 | array, normed=True, |
119 | - range=(0, self.timeout), bins=self.timeout) |
120 | + range=(0, self.histogram_width), bins=self.histogram_width) |
121 | stats.histogram = zip(histogram[1], histogram[0]) |
122 | |
123 | - # SQL query count. |
124 | - array = numpy.fromiter(iter_spool(1, numpy.int), numpy.int) |
125 | - stats.total_sqlstatements = numpy.sum(array) |
126 | - stats.mean_sqlstatements = numpy.mean(array) |
127 | - stats.median_sqlstatements = numpy.median(array) |
128 | - stats.std_sqlstatements = numpy.std(array) |
129 | - stats.var_sqlstatements = numpy.var(array) |
130 | - |
131 | # SQL time stats. |
132 | - array = numpy.fromiter(iter_spool(2, numpy.float32), numpy.float32) |
133 | + array = numpy.asarray(self.sql_times, numpy.float32) |
134 | stats.total_sqltime = numpy.sum(array) |
135 | stats.mean_sqltime = numpy.mean(array) |
136 | stats.median_sqltime = numpy.median(array) |
137 | stats.std_sqltime = numpy.std(array) |
138 | stats.var_sqltime = numpy.var(array) |
139 | |
140 | + # SQL query count. |
141 | + array = numpy.asarray(self.sql_statements, numpy.int) |
142 | + stats.total_sqlstatements = int(numpy.sum(array)) |
143 | + stats.mean_sqlstatements = numpy.mean(array) |
144 | + stats.median_sqlstatements = numpy.median(array) |
145 | + stats.std_sqlstatements = numpy.std(array) |
146 | + stats.var_sqlstatements = numpy.var(array) |
147 | + |
148 | # Cache for next invocation. |
149 | self._stats = stats |
150 | - |
151 | - # Clean up the spool file |
152 | - self.spool = None |
153 | - |
154 | return stats |
155 | |
156 | def __str__(self): |
157 | @@ -194,6 +195,9 @@ |
158 | return "%2.2f %2.2f %2.2f %s" % ( |
159 | total, mean, median, std, hstr) |
160 | |
161 | + def __cmp__(self, b): |
162 | + return cmp(self.total_time, b.total_time) |
163 | + |
164 | |
165 | def main(): |
166 | parser = LPOptionParser("%prog [args] tracelog [...]") |
167 | @@ -204,10 +208,6 @@ |
168 | config.root, "utilities", "page-performance-report.ini"), |
169 | metavar="FILE", help="Load configuration from FILE") |
170 | parser.add_option( |
171 | - "--timeout", dest="timeout", type="int", |
172 | - default=20, metavar="SECS", |
173 | - help="Requests taking more than SECS seconds are timeouts") |
174 | - parser.add_option( |
175 | "--from", dest="from_ts", type="datetime", |
176 | default=None, metavar="TIMESTAMP", |
177 | help="Ignore log entries before TIMESTAMP") |
178 | @@ -224,9 +224,17 @@ |
179 | action="store_false", default=True, |
180 | help="Do not produce pageids report") |
181 | parser.add_option( |
182 | + "--top-urls", dest="top_urls", type=int, metavar="N", |
183 | + default=50, help="Generate report for top N urls by hitcount.") |
184 | + parser.add_option( |
185 | "--directory", dest="directory", |
186 | default=os.getcwd(), metavar="DIR", |
187 | help="Output reports in DIR directory") |
188 | + parser.add_option( |
189 | + "--timeout", dest="timeout", |
190 | + # Default to 12: the staging timeout. |
191 | + default=12, type="int", |
192 | + help="The configured timeout value : determines high risk page ids.") |
193 | |
194 | options, args = parser.parse_args() |
195 | |
196 | @@ -268,25 +276,51 @@ |
197 | parser.error("No data in [categories] section of configuration.") |
198 | |
199 | pageid_times = {} |
200 | - |
201 | - parse(args, categories, pageid_times, options) |
202 | + url_times = {} |
203 | + |
204 | + parse(args, categories, pageid_times, url_times, options) |
205 | + |
206 | + # Truncate the URL times to the top N. |
207 | + if options.top_urls: |
208 | + sorted_urls = sorted( |
209 | + ((times, url) for url, times in url_times.items() |
210 | + if times.total_hits > 0), reverse=True) |
211 | + url_times = [(url, times) |
212 | + for times, url in sorted_urls[:options.top_urls]] |
213 | + |
214 | + def _report_filename(filename): |
215 | + return os.path.join(options.directory, filename) |
216 | |
217 | # Category only report. |
218 | if options.categories: |
219 | - report_filename = os.path.join(options.directory,'categories.html') |
220 | + report_filename = _report_filename('categories.html') |
221 | log.info("Generating %s", report_filename) |
222 | - html_report(open(report_filename, 'w'), categories, None) |
223 | + html_report(open(report_filename, 'w'), categories, None, None) |
224 | |
225 | # Pageid only report. |
226 | if options.pageids: |
227 | - report_filename = os.path.join(options.directory,'pageids.html') |
228 | - log.info("Generating %s", report_filename) |
229 | - html_report(open(report_filename, 'w'), None, pageid_times) |
230 | + report_filename = _report_filename('pageids.html') |
231 | + log.info("Generating %s", report_filename) |
232 | + html_report(open(report_filename, 'w'), None, pageid_times, None) |
233 | + |
234 | + # Top URL only report. |
235 | + if options.top_urls: |
236 | + report_filename = _report_filename('top%d.html' % options.top_urls) |
237 | + log.info("Generating %s", report_filename) |
238 | + html_report(open(report_filename, 'w'), None, None, url_times) |
239 | |
240 | # Combined report. |
241 | if options.categories and options.pageids: |
242 | - report_filename = os.path.join(options.directory,'combined.html') |
243 | - html_report(open(report_filename, 'w'), categories, pageid_times) |
244 | + report_filename = _report_filename('combined.html') |
245 | + html_report( |
246 | + open(report_filename, 'w'), categories, pageid_times, url_times) |
247 | + |
248 | + # Report of likely timeout candidates |
249 | + report_filename = _report_filename('timeout-candidates.html') |
250 | + log.info("Generating %s", report_filename) |
251 | + html_report( |
252 | + open(report_filename, 'w'), None, pageid_times, None, |
253 | + options.timeout - 2) |
254 | |
255 | return 0 |
256 | |
257 | @@ -298,9 +332,17 @@ |
258 | """ |
259 | ext = os.path.splitext(filename)[1] |
260 | if ext == '.bz2': |
261 | - return bz2.BZ2File(filename, mode) |
262 | + p = subprocess.Popen( |
263 | + ['bunzip2', '-c', filename], |
264 | + stdout=subprocess.PIPE, stdin=subprocess.PIPE) |
265 | + p.stdin.close() |
266 | + return p.stdout |
267 | elif ext == '.gz': |
268 | - return gzip.open(filename, mode) |
269 | + p = subprocess.Popen( |
270 | + ['gunzip', '-c', filename], |
271 | + stdout=subprocess.PIPE, stdin=subprocess.PIPE) |
272 | + p.stdin.close() |
273 | + return p.stdout |
274 | else: |
275 | return open(filename, mode) |
276 | |
277 | @@ -321,7 +363,7 @@ |
278 | *(int(elem) for elem in match.groups() if elem is not None)) |
279 | |
280 | |
281 | -def parse(tracefiles, categories, pageid_times, options): |
282 | +def parse(tracefiles, categories, pageid_times, url_times, options): |
283 | requests = {} |
284 | total_requests = 0 |
285 | for tracefile in tracefiles: |
286 | @@ -402,12 +444,12 @@ |
287 | log.debug("Parsed %d requests", total_requests) |
288 | |
289 | # Add the request to any matching categories. |
290 | - if categories is not None: |
291 | + if options.categories: |
292 | for category in categories: |
293 | category.add(request) |
294 | |
295 | # Add the request to the times for that pageid. |
296 | - if pageid_times is not None and request.pageid is not None: |
297 | + if options.pageids: |
298 | pageid = request.pageid |
299 | try: |
300 | times = pageid_times[pageid] |
301 | @@ -416,6 +458,21 @@ |
302 | pageid_times[pageid] = times |
303 | times.add(request) |
304 | |
305 | + # Add the request to the times for that URL. |
306 | + if options.top_urls: |
307 | + url = request.url |
308 | + # Hack to remove opstats from top N report. This |
309 | + # should go into a config file if we end up with |
310 | + # more pages that need to be ignored because |
311 | + # they are just noise. |
312 | + if not (url is None or url.endswith('+opstats')): |
313 | + try: |
314 | + times = url_times[url] |
315 | + except KeyError: |
316 | + times = Times(options.timeout) |
317 | + url_times[url] = times |
318 | + times.add(request) |
319 | + |
320 | else: |
321 | raise MalformedLine('Unknown record type %s', record_type) |
322 | except MalformedLine, x: |
323 | @@ -442,7 +499,19 @@ |
324 | "Unknown extension prefix %s" % prefix) |
325 | |
326 | |
327 | -def html_report(outf, categories, pageid_times): |
328 | +def html_report( |
329 | + outf, categories, pageid_times, url_times, |
330 | + ninetyninth_percentile_threshold=None): |
331 | + """Write an html report to outf. |
332 | + |
333 | + :param outf: A file object to write the report to. |
334 | + :param categories: Categories to report. |
335 | + :param pageid_times: The time statistics for pageids. |
336 | + :param url_times: The time statistics for the top XXX urls. |
337 | + :param ninetyninth_percentile_threshold: Lower threshold for inclusion of |
338 | + pages in the pageid section; pages where 99 percent of the requests are |
339 | + served under this threshold will not be included. |
340 | + """ |
341 | |
342 | print >> outf, dedent('''\ |
343 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" |
344 | @@ -502,6 +571,8 @@ |
345 | |
346 | <th class="clickable">Total Time (secs)</th> |
347 | |
348 | + <th class="clickable">99% Under Time (secs)</th> |
349 | + |
350 | <th class="clickable">Mean Time (secs)</th> |
351 | <th class="clickable">Time Standard Deviation</th> |
352 | <th class="clickable">Time Variance</th> |
353 | @@ -537,6 +608,7 @@ |
354 | <th class="category-title">%s</th> |
355 | <td class="numeric total_hits">%d</td> |
356 | <td class="numeric total_time">%.2f</td> |
357 | + <td class="numeric 99pc_under">%.2f</td> |
358 | <td class="numeric mean_time">%.2f</td> |
359 | <td class="numeric std_time">%.2f</td> |
360 | <td class="numeric var_time">%.2f</td> |
361 | @@ -559,6 +631,7 @@ |
362 | """ % ( |
363 | html_title, |
364 | stats.total_hits, stats.total_time, |
365 | + stats.ninetyninth_percentile_time, |
366 | stats.mean, stats.std, stats.var, stats.median, |
367 | len(histograms) - 1, |
368 | stats.total_sqltime, stats.mean_sqltime, |
369 | @@ -568,13 +641,14 @@ |
370 | stats.median_sqlstatements)) |
371 | |
372 | # Table of contents |
373 | - if categories and pageid_times: |
374 | - print >> outf, dedent('''\ |
375 | - <ol> |
376 | - <li><a href="#catrep">Category Report</a></li> |
377 | - <li><a href="#pageidrep">Pageid Report</a></li> |
378 | - </ol> |
379 | - ''') |
380 | + print >> outf, '<ol>' |
381 | + if categories: |
382 | + print >> outf, '<li><a href="#catrep">Category Report</a></li>' |
383 | + if pageid_times: |
384 | + print >> outf, '<li><a href="#pageidrep">Pageid Report</a></li>' |
385 | + if url_times: |
386 | + print >> outf, '<li><a href="#topurlrep">Top URL Report</a></li>' |
387 | + print >> outf, '</ol>' |
388 | |
389 | if categories: |
390 | print >> outf, '<h2 id="catrep">Category Report</h2>' |
391 | @@ -589,9 +663,21 @@ |
392 | print >> outf, '<h2 id="pageidrep">Pageid Report</h2>' |
393 | print >> outf, table_header |
394 | for pageid, times in sorted(pageid_times.items()): |
395 | + pageid = pageid or 'None' |
396 | + if (ninetyninth_percentile_threshold is not None and |
397 | + (times.stats().ninetyninth_percentile_time < |
398 | + ninetyninth_percentile_threshold)): |
399 | + continue |
400 | handle_times(html_quote(pageid), times) |
401 | print >> outf, table_footer |
402 | |
403 | + if url_times: |
404 | + print >> outf, '<h2 id="topurlrep">Top URL Report</h2>' |
405 | + print >> outf, table_header |
406 | + for url, times in url_times: |
407 | + handle_times(html_quote(url), times) |
408 | + print >> outf, table_footer |
409 | + |
410 | # Ourput the javascript to render our histograms nicely, replacing |
411 | # the placeholder <div> tags output earlier. |
412 | print >> outf, dedent("""\ |
413 | |
414 | === modified file 'utilities/page-performance-report-daily.sh' |
415 | --- utilities/page-performance-report-daily.sh 2010-07-08 09:11:54 +0000 |
416 | +++ utilities/page-performance-report-daily.sh 2010-08-12 05:59:44 +0000 |
417 | @@ -26,11 +26,14 @@ |
418 | echo Generating report from $from until $until into $dir `date` |
419 | |
420 | ./page-performance-report.py -v --from=$from --until=$until \ |
421 | - --directory=${dir} $logs |
422 | + --top-urls=200 --directory=${dir} $logs |
423 | |
424 | ln -sf ${dir}/categories.html ${root}/latest-${type}-categories.html |
425 | ln -sf ${dir}/pageids.html ${root}/latest-${type}-pageids.html |
426 | ln -sf ${dir}/combined.html ${root}/latest-${type}-combined.html |
427 | + ln -sf ${dir}/top200.html ${root}/latest-${type}-top200.html |
428 | + ln -sf ${dir}/timeout-candidates.html \ |
429 | + ${root}/latest-${type}-timeout-candidates.html |
430 | |
431 | return 0 |
432 | } |
433 | @@ -57,8 +60,4 @@ |
434 | report 32 monthly `date -d 'last month' $fmt` $now |
435 | fi |
436 | |
437 | -# One off reports to populate history. |
438 | -## report 40 monthly `date -d '1 june 2010' $fmt` `date -d '1 july 2010' $fmt` |
439 | -## report 23 weekly `date -d '19 june 2010' $fmt` `date -d '26 june 2010' $fmt` |
440 | -## report 16 weekly `date -d '26 june 2010' $fmt` `date -d '3 july 2010' $fmt` |
441 |
I've fixed some bugs in this - it doesn't look like it had been tested.
I've merged it in with changes I had neglected to push in the days before the Epic but have been running live. This work is in lp:~stub/launchpad/page-performance-report. It seems to be working locally just fine with a single log downloaded from devpad. I've pushed this to the tree on devpad so we can see the result with the scheduled run.