Merge lp:~lifeless/launchpad/foundations into lp:launchpad

Proposed by Robert Collins
Status: Merged
Approved by: Stuart Bishop
Approved revision: no longer in the source branch.
Merged at revision: 11396
Proposed branch: lp:~lifeless/launchpad/foundations
Merge into: lp:launchpad
Diff against target: 440 lines (+158/-73)
2 files modified
lib/lp/scripts/utilities/pageperformancereport.py (+154/-68)
utilities/page-performance-report-daily.sh (+4/-5)
To merge this branch: bzr merge lp:~lifeless/launchpad/foundations
Reviewer Review Type Date Requested Status
Stuart Bishop (community) Approve
Review via email: mp+32299@code.launchpad.net

Commit message

Page performance report improvements - top N URLs and 99th percentile

Description of the change

Generate a page performance report that will (hopefully) be a useful hit-list for developers to focus on.
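
For the record, the selection works by accumulating per-URL request times and keeping only the busiest N URLs (the branch sorts by accumulated time). A minimal sketch of that idea; the function name and figures below are illustrative only, not from the branch, which uses a richer Times class:

    # Illustrative data: accumulated seconds per URL (made-up figures).
    url_total_time = {
        '/ubuntu/+bugs': 812.4,
        '/+opstats': 3.1,  # the branch excludes +opstats pages as noise
        '/firefox/+bug/1': 455.0,
    }

    def top_urls(totals, n=50):
        """Return the n URLs with the largest accumulated time."""
        ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
        return ranked[:n]

    print(top_urls(url_total_time, n=2))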

Revision history for this message
Stuart Bishop (stub) wrote :

I've fixed some bugs in this - it doesn't look like it had been tested.

I've merged it in with changes that I had neglected to push in the days before the Epic but that have been running live. This work is in lp:~stub/launchpad/page-performance-report. It seems to be working just fine locally with a single log downloaded from devpad. I've pushed this to the tree on devpad so we can see the result with the scheduled run.

Revision history for this message
Robert Collins (lifeless) wrote :

Thanks Stuart; the width of 10 for the candidates-timeout graphs is a
little weird. I think it should use the same upper bound as the other
graphs - or even a higher one, the max time in the dataset.

-Rob
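
Something like this hypothetical helper (not in the branch) would capture that suggestion, widening the graph to whichever is larger, the capped width or the slowest observed request:

    import math

    def histogram_upper_bound(request_times, timeout):
        # Hypothetical sketch: take the larger of the capped width used for
        # the other graphs and the slowest request actually observed, so the
        # timeout-candidates graph is not squashed into a narrow range.
        return max(int(1.5 * timeout), int(math.ceil(max(request_times))))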

Revision history for this message
Robert Collins (lifeless) wrote :

Oh, and I was a muppet: I set the *default* timeout to 10 when I should
have set it to 12, the staging timeout, which would make the timeout - 2
threshold cut off appropriately at 10. Mea culpa.

Revision history for this message
Robert Collins (lifeless) wrote :

Stuart, I've updated this with a patch to raise the default timeout to
12 (staging's current timeout) and to separate the graph capping (50%
above the timeout: we regularly go up to this or higher due to the way
the timeouts work) from the standard deviation calculations: our
numeric stats were being distorted by the capping done to make the
histograms nice.

Thanks,
Rob
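
In other words, the capping now only affects the histogram input, while the summary statistics come from the raw request times. A rough sketch of that split (assumed function shape, not the branch's exact code):

    import numpy

    def summarise(request_times, timeout=12):
        """Stats from the raw times; capping applied only to the histogram."""
        times = numpy.asarray(request_times, numpy.float32)
        mean = numpy.mean(times)
        std = numpy.std(times)
        # Approximation used in the branch: mean + 3*std as the time under
        # which ~99% of requests complete (assumes a roughly normal spread).
        ninetyninth_percentile = mean + 3 * std
        cap = int(1.5 * timeout)  # graph capping: 50% above the timeout
        capped = numpy.minimum(times, cap)
        counts, edges = numpy.histogram(capped, bins=cap, range=(0, cap))
        return mean, std, ninetyninth_percentile, (counts, edges)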

Revision history for this message
Stuart Bishop (stub) wrote :

Looks fine and is running happily on devpad.

review: Approve

Preview Diff

1=== modified file 'lib/lp/scripts/utilities/pageperformancereport.py'
2--- lib/lp/scripts/utilities/pageperformancereport.py 2010-07-08 11:44:02 +0000
3+++ lib/lp/scripts/utilities/pageperformancereport.py 2010-08-12 05:59:44 +0000
4@@ -6,15 +6,13 @@
5 __metaclass__ = type
6 __all__ = ['main']
7
8-import bz2
9 from cgi import escape as html_quote
10 from ConfigParser import RawConfigParser
11 from datetime import datetime
12-import gzip
13 import re
14 import sre_constants
15-from tempfile import TemporaryFile
16 import os.path
17+import subprocess
18 from textwrap import dedent
19 import time
20
21@@ -84,6 +82,7 @@
22 median = 0 # Median time per hit.
23 std = 0 # Standard deviation per hit.
24 var = 0 # Variance per hit.
25+ ninetyninth_percentile_time = 0
26 histogram = None # # Request times histogram.
27
28 total_sqltime = 0 # Total time spent waiting for SQL to process.
29@@ -98,27 +97,31 @@
30 std_sqlstatements = 0
31 var_sqlstatements = 0
32
33+empty_stats = Stats() # Singleton.
34+
35
36 class Times:
37 """Collection of request times."""
38 def __init__(self, timeout):
39- self.spool = TemporaryFile()
40+ self.total_hits = 0
41+ self.total_time = 0
42 self.request_times = []
43 self.sql_statements = []
44 self.sql_times = []
45 self.ticks = []
46- self.timeout = timeout
47+ self.histogram_width = int(1.5*timeout)
48
49 def add(self, request):
50- """Add the application time from the request to the collection.
51-
52- The application time is capped to our timeout.
53- """
54- print >> self.spool, "%s,%s,%s,%s" % (
55- min(request.app_seconds, self.timeout),
56- request.sql_statements or '',
57- request.sql_seconds or '',
58- request.ticks or '')
59+ """Add the application time from the request to the collection."""
60+ self.total_hits += 1
61+ self.total_time += request.app_seconds
62+ self.request_times.append(request.app_seconds)
63+ if request.sql_statements is not None:
64+ self.sql_statements.append(request.sql_statements)
65+ if request.sql_seconds is not None:
66+ self.sql_times.append(request.sql_seconds)
67+ if request.ticks is not None:
68+ self.ticks.append(request.ticks)
69
70 _stats = None
71
72@@ -133,58 +136,56 @@
73 1 and 2 seconds etc. histogram is None if there are no requests in
74 this Category.
75 """
76+ if not self.total_hits:
77+ return empty_stats
78+
79 if self._stats is not None:
80 return self._stats
81
82- def iter_spool(index, cast):
83- """Generator returning one column from our spool file.
84-
85- Skips None values.
86- """
87- self.spool.flush()
88- self.spool.seek(0)
89- for line in self.spool:
90- value = line.split(',')[index]
91- if value != '':
92- yield cast(value)
93-
94 stats = Stats()
95
96+ stats.total_hits = self.total_hits
97+
98 # Time stats
99- array = numpy.fromiter(iter_spool(0, numpy.float32), numpy.float32)
100+ array = numpy.asarray(self.request_times, numpy.float32)
101 stats.total_time = numpy.sum(array)
102- stats.total_hits = len(array)
103 stats.mean = numpy.mean(array)
104 stats.median = numpy.median(array)
105 stats.std = numpy.std(array)
106 stats.var = numpy.var(array)
107+ # This is an approximation which may not be true: we don't know if we
108+ # have a std distribution or not. We could just find the 99th
109+ # percentile by counting. Shock. Horror; however this appears pretty
110+ # good based on eyeballing things so far - once we're down in the 2-3
111+ # second range for everything we may want to revisit.
112+ stats.ninetyninth_percentile_time = stats.mean + stats.std*3
113+ capped_times = (min(a_time, self.histogram_width) for a_time in
114+ self.request_times)
115+ array = numpy.fromiter(capped_times, numpy.float32,
116+ len(self.request_times))
117 histogram = numpy.histogram(
118 array, normed=True,
119- range=(0, self.timeout), bins=self.timeout)
120+ range=(0, self.histogram_width), bins=self.histogram_width)
121 stats.histogram = zip(histogram[1], histogram[0])
122
123- # SQL query count.
124- array = numpy.fromiter(iter_spool(1, numpy.int), numpy.int)
125- stats.total_sqlstatements = numpy.sum(array)
126- stats.mean_sqlstatements = numpy.mean(array)
127- stats.median_sqlstatements = numpy.median(array)
128- stats.std_sqlstatements = numpy.std(array)
129- stats.var_sqlstatements = numpy.var(array)
130-
131 # SQL time stats.
132- array = numpy.fromiter(iter_spool(2, numpy.float32), numpy.float32)
133+ array = numpy.asarray(self.sql_times, numpy.float32)
134 stats.total_sqltime = numpy.sum(array)
135 stats.mean_sqltime = numpy.mean(array)
136 stats.median_sqltime = numpy.median(array)
137 stats.std_sqltime = numpy.std(array)
138 stats.var_sqltime = numpy.var(array)
139
140+ # SQL query count.
141+ array = numpy.asarray(self.sql_statements, numpy.int)
142+ stats.total_sqlstatements = int(numpy.sum(array))
143+ stats.mean_sqlstatements = numpy.mean(array)
144+ stats.median_sqlstatements = numpy.median(array)
145+ stats.std_sqlstatements = numpy.std(array)
146+ stats.var_sqlstatements = numpy.var(array)
147+
148 # Cache for next invocation.
149 self._stats = stats
150-
151- # Clean up the spool file
152- self.spool = None
153-
154 return stats
155
156 def __str__(self):
157@@ -194,6 +195,9 @@
158 return "%2.2f %2.2f %2.2f %s" % (
159 total, mean, median, std, hstr)
160
161+ def __cmp__(self, b):
162+ return cmp(self.total_time, b.total_time)
163+
164
165 def main():
166 parser = LPOptionParser("%prog [args] tracelog [...]")
167@@ -204,10 +208,6 @@
168 config.root, "utilities", "page-performance-report.ini"),
169 metavar="FILE", help="Load configuration from FILE")
170 parser.add_option(
171- "--timeout", dest="timeout", type="int",
172- default=20, metavar="SECS",
173- help="Requests taking more than SECS seconds are timeouts")
174- parser.add_option(
175 "--from", dest="from_ts", type="datetime",
176 default=None, metavar="TIMESTAMP",
177 help="Ignore log entries before TIMESTAMP")
178@@ -224,9 +224,17 @@
179 action="store_false", default=True,
180 help="Do not produce pageids report")
181 parser.add_option(
182+ "--top-urls", dest="top_urls", type=int, metavar="N",
183+ default=50, help="Generate report for top N urls by hitcount.")
184+ parser.add_option(
185 "--directory", dest="directory",
186 default=os.getcwd(), metavar="DIR",
187 help="Output reports in DIR directory")
188+ parser.add_option(
189+ "--timeout", dest="timeout",
190+ # Default to 12: the staging timeout.
191+ default=12, type="int",
192+ help="The configured timeout value : determines high risk page ids.")
193
194 options, args = parser.parse_args()
195
196@@ -268,25 +276,51 @@
197 parser.error("No data in [categories] section of configuration.")
198
199 pageid_times = {}
200-
201- parse(args, categories, pageid_times, options)
202+ url_times = {}
203+
204+ parse(args, categories, pageid_times, url_times, options)
205+
206+ # Truncate the URL times to the top N.
207+ if options.top_urls:
208+ sorted_urls = sorted(
209+ ((times, url) for url, times in url_times.items()
210+ if times.total_hits > 0), reverse=True)
211+ url_times = [(url, times)
212+ for times, url in sorted_urls[:options.top_urls]]
213+
214+ def _report_filename(filename):
215+ return os.path.join(options.directory, filename)
216
217 # Category only report.
218 if options.categories:
219- report_filename = os.path.join(options.directory,'categories.html')
220+ report_filename = _report_filename('categories.html')
221 log.info("Generating %s", report_filename)
222- html_report(open(report_filename, 'w'), categories, None)
223+ html_report(open(report_filename, 'w'), categories, None, None)
224
225 # Pageid only report.
226 if options.pageids:
227- report_filename = os.path.join(options.directory,'pageids.html')
228- log.info("Generating %s", report_filename)
229- html_report(open(report_filename, 'w'), None, pageid_times)
230+ report_filename = _report_filename('pageids.html')
231+ log.info("Generating %s", report_filename)
232+ html_report(open(report_filename, 'w'), None, pageid_times, None)
233+
234+ # Top URL only report.
235+ if options.top_urls:
236+ report_filename = _report_filename('top%d.html' % options.top_urls)
237+ log.info("Generating %s", report_filename)
238+ html_report(open(report_filename, 'w'), None, None, url_times)
239
240 # Combined report.
241 if options.categories and options.pageids:
242- report_filename = os.path.join(options.directory,'combined.html')
243- html_report(open(report_filename, 'w'), categories, pageid_times)
244+ report_filename = _report_filename('combined.html')
245+ html_report(
246+ open(report_filename, 'w'), categories, pageid_times, url_times)
247+
248+ # Report of likely timeout candidates
249+ report_filename = _report_filename('timeout-candidates.html')
250+ log.info("Generating %s", report_filename)
251+ html_report(
252+ open(report_filename, 'w'), None, pageid_times, None,
253+ options.timeout - 2)
254
255 return 0
256
257@@ -298,9 +332,17 @@
258 """
259 ext = os.path.splitext(filename)[1]
260 if ext == '.bz2':
261- return bz2.BZ2File(filename, mode)
262+ p = subprocess.Popen(
263+ ['bunzip2', '-c', filename],
264+ stdout=subprocess.PIPE, stdin=subprocess.PIPE)
265+ p.stdin.close()
266+ return p.stdout
267 elif ext == '.gz':
268- return gzip.open(filename, mode)
269+ p = subprocess.Popen(
270+ ['gunzip', '-c', filename],
271+ stdout=subprocess.PIPE, stdin=subprocess.PIPE)
272+ p.stdin.close()
273+ return p.stdout
274 else:
275 return open(filename, mode)
276
277@@ -321,7 +363,7 @@
278 *(int(elem) for elem in match.groups() if elem is not None))
279
280
281-def parse(tracefiles, categories, pageid_times, options):
282+def parse(tracefiles, categories, pageid_times, url_times, options):
283 requests = {}
284 total_requests = 0
285 for tracefile in tracefiles:
286@@ -402,12 +444,12 @@
287 log.debug("Parsed %d requests", total_requests)
288
289 # Add the request to any matching categories.
290- if categories is not None:
291+ if options.categories:
292 for category in categories:
293 category.add(request)
294
295 # Add the request to the times for that pageid.
296- if pageid_times is not None and request.pageid is not None:
297+ if options.pageids:
298 pageid = request.pageid
299 try:
300 times = pageid_times[pageid]
301@@ -416,6 +458,21 @@
302 pageid_times[pageid] = times
303 times.add(request)
304
305+ # Add the request to the times for that URL.
306+ if options.top_urls:
307+ url = request.url
308+ # Hack to remove opstats from top N report. This
309+ # should go into a config file if we end up with
310+ # more pages that need to be ignored because
311+ # they are just noise.
312+ if not (url is None or url.endswith('+opstats')):
313+ try:
314+ times = url_times[url]
315+ except KeyError:
316+ times = Times(options.timeout)
317+ url_times[url] = times
318+ times.add(request)
319+
320 else:
321 raise MalformedLine('Unknown record type %s', record_type)
322 except MalformedLine, x:
323@@ -442,7 +499,19 @@
324 "Unknown extension prefix %s" % prefix)
325
326
327-def html_report(outf, categories, pageid_times):
328+def html_report(
329+ outf, categories, pageid_times, url_times,
330+ ninetyninth_percentile_threshold=None):
331+ """Write an html report to outf.
332+
333+ :param outf: A file object to write the report to.
334+ :param categories: Categories to report.
335+ :param pageid_times: The time statistics for pageids.
336+ :param url_times: The time statistics for the top XXX urls.
337+ :param ninetyninth_percentile_threshold: Lower threshold for inclusion of
338+ pages in the pageid section; pages where 99 percent of the requests are
339+ served under this threshold will not be included.
340+ """
341
342 print >> outf, dedent('''\
343 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
344@@ -502,6 +571,8 @@
345
346 <th class="clickable">Total Time (secs)</th>
347
348+ <th class="clickable">99% Under Time (secs)</th>
349+
350 <th class="clickable">Mean Time (secs)</th>
351 <th class="clickable">Time Standard Deviation</th>
352 <th class="clickable">Time Variance</th>
353@@ -537,6 +608,7 @@
354 <th class="category-title">%s</th>
355 <td class="numeric total_hits">%d</td>
356 <td class="numeric total_time">%.2f</td>
357+ <td class="numeric 99pc_under">%.2f</td>
358 <td class="numeric mean_time">%.2f</td>
359 <td class="numeric std_time">%.2f</td>
360 <td class="numeric var_time">%.2f</td>
361@@ -559,6 +631,7 @@
362 """ % (
363 html_title,
364 stats.total_hits, stats.total_time,
365+ stats.ninetyninth_percentile_time,
366 stats.mean, stats.std, stats.var, stats.median,
367 len(histograms) - 1,
368 stats.total_sqltime, stats.mean_sqltime,
369@@ -568,13 +641,14 @@
370 stats.median_sqlstatements))
371
372 # Table of contents
373- if categories and pageid_times:
374- print >> outf, dedent('''\
375- <ol>
376- <li><a href="#catrep">Category Report</a></li>
377- <li><a href="#pageidrep">Pageid Report</a></li>
378- </ol>
379- ''')
380+ print >> outf, '<ol>'
381+ if categories:
382+ print >> outf, '<li><a href="#catrep">Category Report</a></li>'
383+ if pageid_times:
384+ print >> outf, '<li><a href="#pageidrep">Pageid Report</a></li>'
385+ if url_times:
386+ print >> outf, '<li><a href="#topurlrep">Top URL Report</a></li>'
387+ print >> outf, '</ol>'
388
389 if categories:
390 print >> outf, '<h2 id="catrep">Category Report</h2>'
391@@ -589,9 +663,21 @@
392 print >> outf, '<h2 id="pageidrep">Pageid Report</h2>'
393 print >> outf, table_header
394 for pageid, times in sorted(pageid_times.items()):
395+ pageid = pageid or 'None'
396+ if (ninetyninth_percentile_threshold is not None and
397+ (times.stats().ninetyninth_percentile_time <
398+ ninetyninth_percentile_threshold)):
399+ continue
400 handle_times(html_quote(pageid), times)
401 print >> outf, table_footer
402
403+ if url_times:
404+ print >> outf, '<h2 id="topurlrep">Top URL Report</h2>'
405+ print >> outf, table_header
406+ for url, times in url_times:
407+ handle_times(html_quote(url), times)
408+ print >> outf, table_footer
409+
410 # Ourput the javascript to render our histograms nicely, replacing
411 # the placeholder <div> tags output earlier.
412 print >> outf, dedent("""\
413
414=== modified file 'utilities/page-performance-report-daily.sh'
415--- utilities/page-performance-report-daily.sh 2010-07-08 09:11:54 +0000
416+++ utilities/page-performance-report-daily.sh 2010-08-12 05:59:44 +0000
417@@ -26,11 +26,14 @@
418 echo Generating report from $from until $until into $dir `date`
419
420 ./page-performance-report.py -v --from=$from --until=$until \
421- --directory=${dir} $logs
422+ --top-urls=200 --directory=${dir} $logs
423
424 ln -sf ${dir}/categories.html ${root}/latest-${type}-categories.html
425 ln -sf ${dir}/pageids.html ${root}/latest-${type}-pageids.html
426 ln -sf ${dir}/combined.html ${root}/latest-${type}-combined.html
427+ ln -sf ${dir}/top200.html ${root}/latest-${type}-top200.html
428+ ln -sf ${dir}/timeout-candidates.html \
429+ ${root}/latest-${type}-timeout-candidates.html
430
431 return 0
432 }
433@@ -57,8 +60,4 @@
434 report 32 monthly `date -d 'last month' $fmt` $now
435 fi
436
437-# One off reports to populate history.
438-## report 40 monthly `date -d '1 june 2010' $fmt` `date -d '1 july 2010' $fmt`
439-## report 23 weekly `date -d '19 june 2010' $fmt` `date -d '26 june 2010' $fmt`
440-## report 16 weekly `date -d '26 june 2010' $fmt` `date -d '3 july 2010' $fmt`
441