Merge lp:~cjwatson/lp-dev-utils/ppr-talisker into lp:lp-dev-utils

Proposed by Colin Watson
Status: Merged
Merged at revision: 131
Proposed branch: lp:~cjwatson/lp-dev-utils/ppr-talisker
Merge into: lp:lp-dev-utils
Diff against target: 196 lines (+129/-9)
2 files modified
page-performance-report.ini (+2/-2)
pageperformancereport.py (+127/-7)
To merge this branch: bzr merge lp:~cjwatson/lp-dev-utils/ppr-talisker
Reviewer Review Type Date Requested Status
Cristian Gonzalez (community) Approve
Review via email: mp+405633@code.launchpad.net

Commit message

Add Talisker log format support to the PPR.

To post a comment you must log in.
Revision history for this message
Cristian Gonzalez (cristiangsp) wrote :

Looks good!

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'page-performance-report.ini'
2--- page-performance-report.ini 2021-07-09 13:19:44 +0000
3+++ page-performance-report.ini 2021-07-13 13:44:38 +0000
4@@ -51,8 +51,8 @@
5 Translations - Front page=^https?://translations\.[^/]+/translations/\+index$
6 Translations - Overview=^https?://translations\..*/\+lang/\w+(/\+index)?$
7
8-Public XML-RPC=^https://(launchpad|xmlrpc)[^/]+/bazaar/\w+$
9-Private XML-RPC=^https://(launchpad|xmlrpc)[^/]+/
10+Public XML-RPC=^https?://(launchpad|xmlrpc)[^/]+/bazaar/\w+$
11+Private XML-RPC=^https?://(launchpad|xmlrpc)[^/]+/
12 (archive|authserver|bugs|codehosting|
13 codeimportscheduler|mailinglists|
14 featureflags|git)/\w+$
15
16=== modified file 'pageperformancereport.py'
17--- pageperformancereport.py 2021-07-09 13:15:54 +0000
18+++ pageperformancereport.py 2021-07-13 13:44:38 +0000
19@@ -23,6 +23,7 @@
20 import sre_constants
21 from textwrap import dedent
22 import time
23+from urlparse import urlunsplit
24
25
26 logging.basicConfig()
27@@ -136,6 +137,44 @@
28 return interval.seconds + interval.microseconds / 1000000.0
29
30
31+class TaliskerRequest:
32+
33+ def __init__(self, dt, logfmt):
34+ self.dt = dt
35+ host = logfmt['host']
36+ if logfmt['scheme'] == 'https' and host.endswith(':443'):
37+ host = host[:-len(':443')]
38+ self.url = urlunsplit(
39+ (logfmt['scheme'], host, logfmt['path'], logfmt.get('qs', ''), ''))
40+ self.interval = timedelta(milliseconds=float(logfmt['duration_ms']))
41+ self.traversal_interval = (
42+ timedelta(milliseconds=float(logfmt['traversal_duration_ms']))
43+ if 'traversal_duration_ms' in logfmt else None)
44+ self.publication_interval = (
45+ timedelta(milliseconds=float(logfmt['publication_duration_ms']))
46+ if 'publication_duration_ms' in logfmt else None)
47+ self.pageid = logfmt.get('pageid')
48+ self.sql_statements = (
49+ int(logfmt['sql_statements'])
50+ if 'sql_statements' in logfmt else None)
51+ self.sql_seconds = (
52+ float(logfmt['sql_ms']) / 1000 if 'sql_ms' in logfmt else None)
53+
54+ @property
55+ def app_seconds(self):
56+ if self.traversal_interval is not None:
57+ interval = self.traversal_interval
58+ if self.publication_interval is not None:
59+ interval += self.publication_interval
60+ else:
61+ interval = self.interval
62+ return interval.total_seconds()
63+
64+ @property
65+ def total_seconds(self):
66+ return self.interval.total_seconds()
67+
68+
69 class Category:
70 """A Category in our report.
71
72@@ -872,6 +911,8 @@
73 parse_launchpad(args, times, options)
74 elif log_format == 'access':
75 parse_access(args, times, options)
76+ elif log_format == 'talisker':
77+ parse_talisker(args, times, options)
78 else:
79 log.fatal("No parser available for log_format: %r" % log_format)
80 return 1
81@@ -998,12 +1039,15 @@
82
83
84 _zserver_ts_re = re.compile(
85- '^(\d{4})-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)(?:.(\d{6}))?$')
86+ r'^(\d{4})-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)(?:.(\d{6}))?$')
87
88 _access_ts_re = re.compile(
89- '^\[(\d{2})/([A-Za-z]+)/(\d{4}):(\d{2}):(\d{2}):(\d{2})[+-]\d{4}\]$')
90-
91-
92+ r'^\[(\d{2})/([A-Za-z]+)/(\d{4}):(\d{2}):(\d{2}):(\d{2})[+-]\d{4}\]$')
93+
94+_talisker_ts_re = re.compile(
95+ r'^(\d{4})-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)\.(\d{3})Z$')
96+
97+
98 def parse_timestamp_zserverlog(ts_string):
99 match = _zserver_ts_re.search(ts_string)
100 if match is None:
101@@ -1020,7 +1064,16 @@
102 month = time.strptime(alpha_month,'%b').tm_mon # get numeric month
103 return datetime(
104 *(int(elem) for elem in (year, month, day, hour, min, sec)))
105-
106+
107+
108+def parse_timestamp_taliskerlog(ts_string):
109+ match = _talisker_ts_re.search(ts_string)
110+ if match is None:
111+ raise ValueError("Invalid timestamp")
112+ year, month, day, hour, min, sec, millisec = match.groups()
113+ return datetime(
114+ int(year), int(month), int(day), int(hour), int(min), int(sec),
115+ int(millisec) * 1000)
116
117
118 def parse_launchpad(tracefiles, times, options):
119@@ -1180,8 +1233,75 @@
120 except MalformedLine as x:
121 log.error(
122 "Malformed line %s (%s)" % (repr(line), x))
123-
124-
125+
126+
127+def unquote_logfmt_value(value):
128+ if value.startswith('"') and value.endswith('"'):
129+ value = value[1:-1]
130+ return value.replace(r'\"', '"')
131+
132+
133+def parse_talisker(tracefiles, times, options):
134+ talisker_re = re.compile(
135+ r'^(?P<timestamp>\d+-\d+-\d+ \d+:\d+:\d+\.\d+Z)'
136+ r' (?P<levelname>\S+) (?P<module>[0-9a-zA-Z_]+(?:\.[0-9a-zA-Z_]+)*)'
137+ r' "(?P<message>(?:[^"\\]|\\.)*(?:"|\.\.\.))'
138+ r' ?(?P<logfmt>.*)')
139+ logfmt_re = re.compile(
140+ r'(?P<key>.+?)=(?P<value>"(?:[^"\\]|\\.)*?"|\S+)\s*')
141+ requests = {}
142+ total_requests = 0
143+ for tracefile in tracefiles:
144+ log.info('Processing %s', tracefile)
145+ for line in smart_open(tracefile):
146+ line = line.rstrip()
147+ if not line or not line[0].isdigit():
148+ # Log entries start with a digit. Other lines are things
149+ # like tracebacks or warnings, so just ignore those.
150+ continue
151+
152+ try:
153+ match = talisker_re.match(line)
154+ if not match:
155+ raise MalformedLine()
156+
157+ try:
158+ dt = parse_timestamp_taliskerlog(
159+ match.group('timestamp'))
160+ except ValueError:
161+ raise MalformedLine(
162+ 'Invalid timestamp %r' % match.group('timestamp'))
163+
164+ # Filter entries by command line date range.
165+ if options.from_ts is not None and dt < options.from_ts:
166+ continue # Skip to next line.
167+ if options.until_ts is not None and dt > options.until_ts:
168+ break # Skip to next log file.
169+
170+ # Parse logfmt.
171+ logfmt = {
172+ m.group('key'): unquote_logfmt_value(m.group('value'))
173+ for m in logfmt_re.finditer(match.group('logfmt'))}
174+ if not all(
175+ key in logfmt for key in
176+ ('request_id', 'method', 'scheme', 'host', 'path')):
177+ # Some other log entry, not an HTTP request.
178+ continue
179+
180+ url = urlunsplit(
181+ (logfmt['scheme'], logfmt['host'], logfmt['path'],
182+ logfmt.get('qs', ''), ''))
183+ request = TaliskerRequest(dt, logfmt)
184+ requests[logfmt['request_id']] = request
185+ total_requests += 1
186+ if total_requests % 10000 == 0:
187+ log.debug("Parsed %d requests", total_requests)
188+ # Add the request to any matching categories.
189+ times.add_request(request)
190+ except MalformedLine as x:
191+ log.error("Malformed line %r (%s)" % (line, x))
192+
193+
194 def html_report(
195 outf, category_times, pageid_times, url_times,
196 ninetyninth_percentile_threshold=None, histogram_resolution=0.5,

Subscribers

People subscribed via source and target branches