Merge lp:~stefanor/ibid/feedcache-330880 into lp:~ibid-core/ibid/old-trunk-pack-0.92

Proposed by Stefano Rivera
Status: Merged
Approved by: Michael Gorven
Approved revision: 566
Merged at revision: 567
Proposed branch: lp:~stefanor/ibid/feedcache-330880
Merge into: lp:~ibid-core/ibid/old-trunk-pack-0.92
To merge this branch: bzr merge lp:~stefanor/ibid/feedcache-330880
Reviewers: Michael Gorven (Approve), Jonathan Hitchcock (Approve)
Review via email: mp+4271@code.launchpad.net
Stefano Rivera (stefanor) wrote:

My Squid proxy started biting me: it serves responses with a Content-Encoding we weren't handling. We may come across similar content-encoding issues elsewhere, too.
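
To illustrate the issue, here is a minimal sketch of the decompression logic this branch adds to cacheable_download() and get_soup() in ibid/utils.py. The helper name decompress_response() is hypothetical; the gzip/deflate handling follows the diff below, including the zlib fallback for servers that send a raw deflate stream without the zlib header.

    from gzip import GzipFile
    from StringIO import StringIO
    import urllib2
    import zlib

    def decompress_response(connection):
        """Read a urllib2 response and undo any Content-Encoding applied
        by the server or an intermediate proxy (e.g. Squid)."""
        data = connection.read()
        encoding = (connection.headers.get('content-encoding') or '').lower()
        if encoding == 'deflate':
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Some servers send a raw deflate stream without the zlib header
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif encoding == 'gzip':
            data = GzipFile(fileobj=StringIO(data)).read()
        return data

    # Example (hypothetical URL):
    # print decompress_response(urllib2.urlopen('http://example.com/feed.xml'))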

Jonathan Hitchcock (vhata):
review: Approve
Michael Gorven (mgorven) wrote:

Looks fine. Needs to wait for lp:~stefanor/ibid/exchange-336443 though.

lp:~stefanor/ibid/feedcache-330880 updated
564. By Stefano Rivera

Found a bug in compression header handling

565. By Stefano Rivera

Merge from trunk

566. By Stefano Rivera

Typo

Michael Gorven (mgorven):
review: Approve

Preview Diff

1=== modified file 'ibid/plugins/feeds.py'
2--- ibid/plugins/feeds.py 2009-03-01 23:01:30 +0000
3+++ ibid/plugins/feeds.py 2009-03-07 18:41:10 +0000
4@@ -1,6 +1,9 @@
5 import re
6 from datetime import datetime
7 import logging
8+from urllib2 import urlopen, URLError
9+from urlparse import urljoin
10+from html5lib import HTMLParser, treebuilders
11
12 from sqlalchemy import Column, Integer, Unicode, DateTime, UnicodeText, ForeignKey, Table
13 from sqlalchemy.sql import func
14@@ -10,6 +13,7 @@
15 import ibid
16 from ibid.plugins import Processor, match, authorise
17 from ibid.models import Base
18+from ibid.utils import cacheable_download, get_soup
19
20 help = {'feeds': u'Displays articles from RSS and Atom feeds'}
21
22@@ -32,15 +36,11 @@
23 self.url = url
24 self.identity_id = identity_id
25 self.time = datetime.now()
26-
27- def is_valid(self):
28 self.update()
29- if self.feed['version']:
30- return True
31- return False
32
33 def update(self):
34- self.feed = feedparser.parse(self.url)
35+ feedfile = cacheable_download(self.url, "feeds/%s-%i.xml" % (re.sub(r'\W+', '_', self.name), self.identity_id))
36+ self.feed = feedparser.parse(feedfile)
37 self.entries = self.feed['entries']
38
39 class Manage(Processor):
40@@ -59,18 +59,31 @@
41
42 if feed:
43 event.addresponse(u"I already have the %s feed" % name)
44- else:
45- feed = Feed(unicode(name), unicode(url), event.identity)
46-
47- if feed.is_valid():
48- session.save(feed)
49- session.flush()
50- event.addresponse(True)
51- log.info(u"Added feed '%s' by %s/%s (%s): %s (Found %s entries)", name, event.account, event.identity, event.sender['connection'], url, len(feed.entries))
52- else:
53+ return
54+
55+ valid = bool(feedparser.parse(url)["version"])
56+
57+ if not valid:
58+ soup = get_soup(url)
59+ for alternate in soup.findAll('link', {'rel': 'alternate',
60+ 'type': re.compile(r'^application/(atom|rss)\+xml$'),
61+ 'href': re.compile(r'.+')}):
62+ newurl = urljoin(url, alternate["href"])
63+ valid = bool(feedparser.parse(newurl)["version"])
64+
65+ if valid:
66+ url = newurl
67+ break
68+
69+ if not valid:
70 event.addresponse(u"Sorry, I could not add the %s feed. %s is not a valid feed" % (name,url))
71+ return
72
73- session.close()
74+ feed = Feed(unicode(name), unicode(url), event.identity)
75+ session.save(feed)
76+ session.flush()
77+ event.addresponse(True)
78+ log.info(u"Added feed '%s' by %s/%s (%s): %s (Found %s entries)", name, event.account, event.identity, event.sender['connection'], url, len(feed.entries))
79
80 @match(r'^(?:list\s+)?feeds$')
81 def list(self, event):
82
83=== modified file 'ibid/utils.py'
84--- ibid/utils.py 2009-03-05 16:33:12 +0000
85+++ ibid/utils.py 2009-03-07 18:41:10 +0000
86@@ -1,10 +1,16 @@
87+import cgi
88+from gzip import GzipFile
89 from htmlentitydefs import name2codepoint
90 import os
91 import os.path
92 from pkg_resources import resource_exists, resource_string
93 import re
94+from StringIO import StringIO
95 import time
96 import urllib2
97+import zlib
98+
99+from html5lib import HTMLParser, treebuilders
100
101 import ibid
102
103@@ -80,10 +86,21 @@
104 # Download into a temporary file, in case something goes wrong
105 downloadfile = os.path.join(plugindir, ".download." + os.path.basename(cachefile))
106 outfile = file(downloadfile, "wb")
107- buf = "x"
108- while len(buf) > 0:
109- buf = connection.read(1024)
110- outfile.write(buf)
111+ data = connection.read()
112+
113+ compression = connection.headers.get('content-encoding')
114+ if compression:
115+ if compression.lower() == "deflate":
116+ try:
117+ data = zlib.decompress(data)
118+ except zlib.error:
119+ data = zlib.decompress(data, -zlib.MAX_WBITS)
120+ elif compression.lower() == "gzip":
121+ compressedstream = StringIO(data)
122+ gzipper = GzipFile(fileobj=compressedstream)
123+ data = gzipper.read()
124+
125+ outfile.write(data)
126
127 outfile.close()
128
129@@ -112,4 +129,34 @@
130 def ibid_version():
131 return resource_exists(__name__, '.version') and resource_string(__name__, '.version').strip() or None
132
133+def get_soup(url, data=None, headers={}):
134+ "Request a URL and create a BeautifulSoup parse tree from it"
135+
136+ req = urllib2.Request(url, data, headers)
137+ f = urllib2.urlopen(req)
138+ data = f.read()
139+ f.close()
140+
141+ encoding = None
142+ contentType = f.headers.get('content-type')
143+ if contentType:
144+ (mediaType, params) = cgi.parse_header(contentType)
145+ encoding = params.get('charset')
146+
147+ compression = f.headers.get('content-encoding')
148+ if compression.lower() == "deflate":
149+ try:
150+ data = zlib.decompress(data)
151+ except zlib.error:
152+ data = zlib.decompress(data, -zlib.MAX_WBITS)
153+ elif compression.lower() == "gzip":
154+ compressedstream = StringIO(data)
155+ gzipper = GzipFile(fileobj=compressedstream)
156+ data = gzipper.read()
157+
158+ treebuilder = treebuilders.getTreeBuilder("beautifulsoup")
159+ parser = HTMLParser(tree=treebuilder)
160+
161+ return parser.parse(data, encoding=encoding)
162+
163 # vi: set et sta sw=4 ts=4:
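
For the feeds.py half of the change, here is a rough standalone sketch of the new add-feed validation path: try the URL with feedparser first, and if it is not a feed, fetch the page with get_soup() and follow any <link rel="alternate"> autodiscovery tags. The discover_feed() helper and the example URL are illustrative, not part of the branch.

    import re
    from urlparse import urljoin

    import feedparser

    from ibid.utils import get_soup

    def discover_feed(url):
        """Return a feed URL for 'url', following HTML autodiscovery links
        when the URL itself is not a valid feed, or None if nothing is found."""
        if feedparser.parse(url)['version']:
            return url
        soup = get_soup(url)
        for alternate in soup.findAll('link', {
                'rel': 'alternate',
                'type': re.compile(r'^application/(atom|rss)\+xml$'),
                'href': re.compile(r'.+')}):
            candidate = urljoin(url, alternate['href'])
            if feedparser.parse(candidate)['version']:
                return candidate
        return None

    # Example (hypothetical URL):
    # print discover_feed('http://www.example.org/blog/')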
