1
=== modified file 'lib/canonical/encoding.py'
2
--- lib/canonical/encoding.py	2009-06-25 05:30:52 +0000
3
+++ lib/canonical/encoding.py	2010-08-24 16:07:48 +0000
4
@@ -4,14 +4,17 @@
5
4
"""Character encoding utilities"""
4
"""Character encoding utilities"""
6
5
5
7
6
__metaclass__ = type
6
__metaclass__ = type
8
7
__all__ = [
9
8
    'ascii_smash',
10
9
    'escape_nonascii_uniquely',
11
10
    'guess',
12
11
    ]
13
12
14
7
import re
13
import re
15
8
import codecs
14
import codecs
16
9
import unicodedata
15
import unicodedata
17
10
from htmlentitydefs import codepoint2name
18
11
from cStringIO import StringIO
16
from cStringIO import StringIO
19
12
17
20
13
__all__ = ['guess', 'ascii_smash']
21
14
22
15
_boms = [
18
_boms = [
23
16
    (codecs.BOM_UTF16_BE, 'utf_16_be'),
19
    (codecs.BOM_UTF16_BE, 'utf_16_be'),
24
17
    (codecs.BOM_UTF16_LE, 'utf_16_le'),
20
    (codecs.BOM_UTF16_LE, 'utf_16_le'),
25
@@ -151,33 +154,6 @@
26
151
    return unicode(s, 'ISO-8859-1', 'replace')
154
    return unicode(s, 'ISO-8859-1', 'replace')
27
152
155
28
153
156
29
154
# def unicode_to_unaccented_str(text):
30
155
#     """Converts a unicode string into an ascii-only str, converting accented
31
156
#     characters to their plain equivalents.
32
157
#
33
158
#     >>> unicode_to_unaccented_str(u'')
34
159
#     ''
35
160
#     >>> unicode_to_unaccented_str(u'foo bar 123')
36
161
#     'foo bar 123'
37
162
#     >>> unicode_to_unaccented_str(u'viva S\xe3o Carlos!')
38
163
#     'viva Sao Carlos!'
39
164
#     """
40
165
#     assert isinstance(text, unicode)
41
166
#     L = []
42
167
#     for char in text:
43
168
#         charnum = ord(char)
44
169
#         codepoint = codepoint2name.get(charnum)
45
170
#         if codepoint is not None:
46
171
#             strchar = codepoint[0]
47
172
#         else:
48
173
#             try:
49
174
#                 strchar = char.encode('ascii')
50
175
#             except UnicodeEncodeError:
51
176
#                 strchar = ''
52
177
#         L.append(strchar)
53
178
#     return ''.join(L)
54
179
55
180
56
181
def ascii_smash(unicode_string):
157
def ascii_smash(unicode_string):
57
182
    """Attempt to convert the Unicode string, possibly containing accents,
158
    """Attempt to convert the Unicode string, possibly containing accents,
58
183
    to an ASCII string.
159
    to an ASCII string.
59
@@ -370,6 +346,44 @@
60
370
    if match is not None:
346
    if match is not None:
61
371
        return match.group(1)
347
        return match.group(1)
62
372
348
64
373
    # Something we can"t represent. Return empty string.
349
    # Something we can't represent. Return empty string.
65
374
    return ""
350
    return ""
66
375
351
67
352
68
353
def escape_nonascii_uniquely(bogus_string):
    """Replace non-ascii characters with a hex representation.

    This is mainly for preventing emails with invalid characters from causing
    oopses. The nonascii characters could have been removed or just converted
    to "?", but this provides some insight into what the bogus data was, and
    it prevents the message-id from two unrelated emails matching because
    all the nonascii characters have been replaced with the same ascii
    character.

    :param bogus_string: The string that may contain non-ascii characters.
    :return: A copy of the string in which every character outside the
        7-bit ascii range is replaced by a backslash escape: backslash-x
        plus the hex value for values up to 0xff, backslash-u plus four hex
        digits for values up to 0xffff, and backslash-U plus eight hex
        digits for anything larger. Ascii characters, including existing
        backslashes, are left untouched.

    Unfortunately, all the strings below are actually part of this
    function's docstring, so python processes the backslash once before
    doctest, and then python processes it again when doctest runs the
    test. This makes it confusing, since four backslashes will get
    converted into a single ascii character.

    >>> print len('\xa9'), len('\\xa9'), len('\\\\xa9')
    1 1 4
    >>> print escape_nonascii_uniquely('hello \xa9')
    hello \\xa9
    >>> print escape_nonascii_uniquely('hello \\xa9')
    hello \\xa9

    The next string only has ascii characters, so
    escape_nonascii_uniquely() actually has no effect.

    >>> print escape_nonascii_uniquely('hello \\\\xa9')
    hello \\xa9

    Unicode strings with characters above 0xff are escaped as well, so
    the result is always pure ascii.

    >>> print escape_nonascii_uniquely(u'euro \\u20ac')
    euro \\u20ac
    """
    # Match everything outside 7-bit ascii rather than only the 8-bit
    # range, so that unicode input cannot slip through unescaped.
    nonascii_regex = re.compile(r'[^\x00-\x7f]')

    # By encoding the invalid ascii with a backslash, x, and then the
    # hex value, it makes it easy to decode it by pasting into a python
    # interpreter. quopri() is not used, since that could cause the
    # decoding of an email to fail.
    def quote(match):
        codepoint = ord(match.group(0))
        if codepoint <= 0xff:
            # Same output as before for 8-bit characters.
            return '\\x%x' % codepoint
        if codepoint <= 0xffff:
            return '\\u%04x' % codepoint
        return '\\U%08x' % codepoint

    return nonascii_regex.sub(quote, bogus_string)
105
376
390
106
=== modified file 'lib/canonical/launchpad/xmlrpc/mailinglist.py'
107
--- lib/canonical/launchpad/xmlrpc/mailinglist.py	2010-08-20 20:31:18 +0000
108
+++ lib/canonical/launchpad/xmlrpc/mailinglist.py	2010-08-24 16:07:48 +0000
109
@@ -8,7 +8,7 @@
110
8
    'MailingListAPIView',
8
    'MailingListAPIView',
111
9
    ]
9
    ]
112
10
10
114
11
11
import re
115
12
import xmlrpclib
12
import xmlrpclib
116
13
13
117
14
from zope.component import getUtility
14
from zope.component import getUtility
118
@@ -16,6 +16,7 @@
119
16
from zope.security.proxy import removeSecurityProxy
16
from zope.security.proxy import removeSecurityProxy
120
17
17
121
18
from canonical.config import config
18
from canonical.config import config
122
19
from canonical.encoding import escape_nonascii_uniquely
123
19
from canonical.launchpad.interfaces import (
20
from canonical.launchpad.interfaces import (
124
20
    EmailAddressStatus,
21
    EmailAddressStatus,
125
21
    IEmailAddressSet,
22
    IEmailAddressSet,
126
@@ -240,6 +241,15 @@
127
240
        # though it's much more convenient to just pass 8-bit strings.
241
        # though it's much more convenient to just pass 8-bit strings.
128
241
        if isinstance(bytes, xmlrpclib.Binary):
242
        if isinstance(bytes, xmlrpclib.Binary):
129
242
            bytes = bytes.data
243
            bytes = bytes.data
130
244
        # Although it is illegal for an email header to have unencoded
131
245
        # non-ascii characters, it is better to let the list owner
132
246
        # process the message than to cause an oops.
133
247
        header_body_separator = re.compile('\r\n\r\n|\r\r|\n\n')
134
248
        match = header_body_separator.search(bytes)
135
249
        header = bytes[:match.start()]
136
250
        header = escape_nonascii_uniquely(header)
137
251
        bytes = header + bytes[match.start():]
138
252
139
243
        mailing_list = getUtility(IMailingListSet).get(team_name)
253
        mailing_list = getUtility(IMailingListSet).get(team_name)
140
244
        message = getUtility(IMessageSet).fromEmail(bytes)
254
        message = getUtility(IMessageSet).fromEmail(bytes)
141
245
        mailing_list.holdMessage(message)
255
        mailing_list.holdMessage(message)
142
246
256
143
=== modified file 'lib/lp/registry/doc/message-holds-xmlrpc.txt'
144
--- lib/lp/registry/doc/message-holds-xmlrpc.txt	2010-07-13 20:15:26 +0000
145
+++ lib/lp/registry/doc/message-holds-xmlrpc.txt	2010-08-24 16:07:48 +0000
146
@@ -226,18 +226,20 @@
147
226
Non-ascii messages
226
Non-ascii messages
148
227
==================
227
==================
149
228
228
153
229
Messages with non-ascii in their headers or bodies are not exactly legal (they
229
Messages with non-ascii in their headers are not exactly legal
154
230
should be encoded) but do occur especially in spam.  These messages can be
230
(they should be encoded) but do occur especially in spam.  These
155
231
held for moderator approval too.
231
messages can be held for moderator approval too. To avoid blowing up
156
232
later if the string is converted to a unicode object, the non-ascii
157
233
characters in the headers are replaced with a hex escape such as \xa9.
158
232
234
159
233
    >>> spam_message = message_from_string("""\
235
    >>> spam_message = message_from_string("""\
160
234
    ... From: Anne \xa9 Person <anne.person@example.com>
236
    ... From: Anne \xa9 Person <anne.person@example.com>
161
235
    ... To: team-one@lists.launchpad.dev
237
    ... To: team-one@lists.launchpad.dev
162
236
    ... Subject: \xa9 Badgers!
238
    ... Subject: \xa9 Badgers!
164
237
    ... Message-ID: <fifth-post>
239
    ... Message-ID: <fifth-post\xa9>
165
238
    ... Date: Fri, 01 Aug 2000 01:08:59 -0000
240
    ... Date: Fri, 01 Aug 2000 01:08:59 -0000
166
239
    ...
241
    ...
168
240
    ... Watch out for badgers! \xa9
242
    ... Don't escape non-ascii characters in the body! \xa9
169
241
    ... """)
243
    ... """)
170
242
244
171
243
    >>> import xmlrpclib
245
    >>> import xmlrpclib
172
@@ -247,9 +249,10 @@
173
247
    True
249
    True
174
248
    >>> commit()
250
    >>> commit()
175
249
251
177
250
    >>> held_message_spam = message_set.getMessageByMessageID('<fifth-post>')
252
    >>> held_message_spam = message_set.getMessageByMessageID(
178
253
    ...     '<fifth-post\\xa9>')
179
251
    >>> print held_message_spam.message_id
254
    >>> print held_message_spam.message_id
181
252
    <fifth-post>
255
    <fifth-post\xa9>
182
253
    >>> print held_message_spam.posted_by.displayname
256
    >>> print held_message_spam.posted_by.displayname
183
254
    Anne Person
257
    Anne Person
184
255
258
185
@@ -258,14 +261,14 @@
186
258
    ...     message_content = held_message_spam.posted_message.read()
261
    ...     message_content = held_message_spam.posted_message.read()
187
259
    ... finally:
262
    ... finally:
188
260
    ...     held_message_spam.posted_message.close()
263
    ...     held_message_spam.posted_message.close()
191
261
    >>> message_content.splitlines()
264
    >>> print pretty(message_content.splitlines())
192
262
    ['From: Anne \xa9 Person <anne.person@example.com>',
265
    ['From: Anne \\xa9 Person <anne.person@example.com>',
193
263
     'To: team-one@lists.launchpad.dev',
266
     'To: team-one@lists.launchpad.dev',
196
264
     'Subject: \xa9 Badgers!',
267
     'Subject: \\xa9 Badgers!',
197
265
     'Message-ID: <fifth-post>',
268
     'Message-ID: <fifth-post\\xa9>',
198
266
     'Date: Fri, 01 Aug 2000 01:08:59 -0000',
269
     'Date: Fri, 01 Aug 2000 01:08:59 -0000',
199
267
     '',
270
     '',
201
268
     'Watch out for badgers! \xa9']
271
     "Don't escape non-ascii characters in the body! \xa9"]
202
269
272
203
270
    >>> held_message_spam.status
273
    >>> held_message_spam.status
204
271
     <DBItem PostedMessageStatus.NEW, (0) New status>
274
     <DBItem PostedMessageStatus.NEW, (0) New status>
Reviewer	Review Type	Date Requested	Status
Aaron Bentley (community)		2010-08-23	Approve on 2010-08-24
Review via email: mp+33428@code.launchpad.net