Launchpad itself

Code review comment for lp:~edwin-grubbs/launchpad/bug-615655-unicode-oops

bug-615655-unicode-oops
Merge into devel

Revision history for this message

Edwin Grubbs (edwin-grubbs) wrote on 2010-08-23:

Hi Aaron,

I've fixed it to only escape the headers. I tried using message_from_string(), but the Message object requires that each header be deleted and then re-added, otherwise it would just create a duplicate header, and I started to worry about corner cases such as duplicate headers, which the method I was using didn't seem to take into account.

-Edwin

+import re
import xmlrpclib

from zope.component import getUtility
@@ -233,10 +234,15 @@
         # though it's much more convenient to just pass 8-bit strings.
         if isinstance(bytes, xmlrpclib.Binary):
             bytes = bytes.data
- # Although it is illegal for an email to have unencoded non-ascii
- # characters, it is better to let the list owner process the
- # message than to cause an oops.
- bytes = escape_nonascii_uniquely(bytes)
+ # Although it is illegal for an email header to have unencoded
+ # non-ascii characters, it is better to let the list owner
+ # process the message than to cause an oops.
+ header_body_separator = re.compile('\r\n\r\n|\r\r|\n\n')
+ match = header_body_separator.search(bytes)
+ header = bytes[:match.start()]
+ header = escape_nonascii_uniquely(header)
+ bytes = header + bytes[match.start():]
+
         mailing_list = getUtility(IMailingListSet).get(team_name)
         message = getUtility(IMessageSet).fromEmail(bytes)
         mailing_list.holdMessage(message)

-Messages with non-ascii in their headers or bodies are not exactly legal
+Messages with non-ascii in their headers are not exactly legal
(they should be encoded) but do occur especially in spam. These
messages can be held for moderator approval too. To avoid blowing up
later if the string is converted to a unicode object, the non-ascii
@@ -239,8 +239,7 @@
     ... Message-ID: <fifth-post\xa9>
     ... Date: Fri, 01 Aug 2000 01:08:59 -0000
     ...
- ... Watch out for badgers! \xa9
- ... Don't double quote characters: =E3=F6=FC
+ ... Don't escape non-ascii characters in the body! \xa9
     ... """)

     >>> import xmlrpclib
@@ -269,8 +268,7 @@
      'Message-ID: <fifth-post\\xa9>',
      'Date: Fri, 01 Aug 2000 01:08:59 -0000',
      '',
- 'Watch out for badgers! \\xa9',
- "Don't double quote characters: =E3=F6=FC"]
+ "Don't escape non-ascii characters in the body! \xa9"]

>>> held_message_spam.status
<DBItem PostedMessageStatus.NEW, (0) New status>

Hi Aaron,

-Edwin

=== modified file 'lib/canonical/launchpad/xmlrpc/mailinglist.py'
--- lib/canonical/launchpad/xmlrpc/mailinglist.py	2010-08-23 17:43:51 +0000
+++ lib/canonical/launchpad/xmlrpc/mailinglist.py	2010-08-23 22:03:50 +0000
@@ -8,6 +8,7 @@
     'MailingListAPIView',
     ]
 
+import re
 import xmlrpclib
 
 from zope.component import getUtility
@@ -233,10 +234,15 @@
         # though it's much more convenient to just pass 8-bit strings.
         if isinstance(bytes, xmlrpclib.Binary):
             bytes = bytes.data
-        # Although it is illegal for an email to have unencoded non-ascii
-        # characters, it is better to let the list owner process the
-        # message than to cause an oops.
-        bytes = escape_nonascii_uniquely(bytes)
+        # Although it is illegal for an email header to have unencoded
+        # non-ascii characters, it is better to let the list owner
+        # process the message than to cause an oops.
+        header_body_separator = re.compile('\r\n\r\n|\r\r|\n\n')
+        match = header_body_separator.search(bytes)
+        header = bytes[:match.start()]
+        header = escape_nonascii_uniquely(header)
+        bytes = header + bytes[match.start():]
+
         mailing_list = getUtility(IMailingListSet).get(team_name)
         message = getUtility(IMessageSet).fromEmail(bytes)
         mailing_list.holdMessage(message)

=== modified file 'lib/lp/registry/doc/message-holds-xmlrpc.txt'
--- lib/lp/registry/doc/message-holds-xmlrpc.txt	2010-08-23 17:50:11 +0000
+++ lib/lp/registry/doc/message-holds-xmlrpc.txt	2010-08-23 22:06:19 +0000
@@ -226,7 +226,7 @@
 Non-ascii messages
 ==================
 
-Messages with non-ascii in their headers or bodies are not exactly legal
+Messages with non-ascii in their headers are not exactly legal
 (they should be encoded) but do occur especially in spam.  These
 messages can be held for moderator approval too. To avoid blowing up
 later if the string is converted to a unicode object, the non-ascii
@@ -239,8 +239,7 @@
     ... Message-ID: <fifth-post\xa9>
     ... Date: Fri, 01 Aug 2000 01:08:59 -0000
     ...
-    ... Watch out for badgers! \xa9
-    ... Don't double quote characters: =E3=F6=FC
+    ... Don't escape non-ascii characters in the body! \xa9
     ... """)
 
     >>> import xmlrpclib
@@ -269,8 +268,7 @@
      'Message-ID: <fifth-post\\xa9>',
      'Date: Fri, 01 Aug 2000 01:08:59 -0000',
      '',
-     'Watch out for badgers! \\xa9',
-     "Don't double quote characters: =E3=F6=FC"]
+     "Don't escape non-ascii characters in the body! \xa9"]
 
     >>> held_message_spam.status
      <DBItem PostedMessageStatus.NEW, (0) New status>

« Back to merge proposal