root / atomisator / sources / feedparser.py

Revision 89:dd1daf2a2d4c, 107.5 kB (checked in by Tarek Ziadé <tarek@…>, 17 months ago)

added a first source reader

Line 
1# $Id: feedparser.py 216 2006-11-12 11:36:49Z tarek $
2"""Universal feed parser
3
4Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds
5
6Visit http://feedparser.org/ for the latest version
7Visit http://feedparser.org/docs/ for the latest documentation
8
9Required: Python 2.1 or later
10Recommended: Python 2.3 or later
11Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
12"""
13
14#__version__ = "pre-3.3-" + "$Revision: 1.51 $"[11:15] + "-cvs"
15__version__ = "3.3"
16__license__ = "Python"
17__copyright__ = "Copyright 2002-4, Mark Pilgrim"
18__author__ = "Mark Pilgrim <http://diveintomark.org/>"
19__contributors__ = ["Jason Diamond <http://injektilo.org/>",
20                    "John Beimler <http://john.beimler.org/>",
21                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
22                    "Aaron Swartz <http://aaronsw.com>"]
23_debug = 0
24
25# HTTP "User-Agent" header to send to servers when downloading feeds.
26# If you are embedding feedparser in a larger application, you should
27# change this to your application name and URL.
28USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
29
30# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
31# want to send an Accept header, set this to None.
32ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
33
34# List of preferred XML parsers, by SAX driver name.  These will be tried first,
35# but if they're not installed, Python will keep searching through its own list
36# of pre-installed parsers until it finds one that supports everything we need.
37PREFERRED_XML_PARSERS = ["drv_libxml2"]
38
39# If you want feedparser to automatically run HTML markup through HTML Tidy, set
40# this to 1.  This is off by default because of reports of crashing on some
41# platforms.  If it crashes for you, please submit a bug report with your OS
42# platform, Python version, and the URL of the feed you were attempting to parse.
43# Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
44TIDY_MARKUP = 0
45
46# ---------- required modules (should come with any Python distribution) ----------
47import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi
48try:
49    from cStringIO import StringIO as _StringIO
50except:
51    from StringIO import StringIO as _StringIO
52
53# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
54
55# gzip is included with most Python distributions, but may not be available if you compiled your own
56try:
57    import gzip
58except:
59    gzip = None
60try:
61    import zlib
62except:
63    zlib = None
64
65# timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers.
66# Python 2.3 now has this functionality available in the standard socket library, so under
67# 2.3 you don't need to install anything.  But you probably should anyway, because the socket
68# module is buggy and timeoutsocket is better.
69try:
70    import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py
71    timeoutsocket.setDefaultSocketTimeout(20)
72except ImportError:
73    import socket
74    if hasattr(socket, 'setdefaulttimeout'):
75        socket.setdefaulttimeout(20)
76import urllib, urllib2
77
78_mxtidy = None
79if TIDY_MARKUP:
80    try:
81        from mx.Tidy import Tidy as _mxtidy
82    except:
83        pass
84
85# If a real XML parser is available, feedparser will attempt to use it.  feedparser has
86# been tested with the built-in SAX parser, PyXML, and libxml2.  On platforms where the
87# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
88# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
89try:
90    import xml.sax
91    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
92    from xml.sax.saxutils import escape as _xmlescape
93    _XML_AVAILABLE = 1
94except:
95    _XML_AVAILABLE = 0
96    def _xmlescape(data):
97        data = data.replace("&", "&amp;")
98        data = data.replace(">", "&gt;")
99        data = data.replace("<", "&lt;")
100        return data
101
102# base64 support for Atom feeds that contain embedded binary data
103try:
104    import base64, binascii
105except:
106    base64 = binascii = None
107
108# cjkcodecs and iconv_codec provide support for more character encodings.
109# Both are available from http://cjkpython.i18n.org/
110try:
111    import cjkcodecs.aliases
112except:
113    pass
114try:
115    import iconv_codec
116except:
117    pass
118
119# ---------- don't touch these ----------
class CharacterEncodingOverride(Exception):
    """Exception type for a character-encoding override.

    NOTE(review): the exact trigger is defined by the raise sites elsewhere
    in this module -- confirm there.
    """


class CharacterEncodingUnknown(Exception):
    """Exception type for an unknown character encoding.

    NOTE(review): the exact trigger is defined by the raise sites elsewhere
    in this module -- confirm there.
    """


class NonXMLContentType(Exception):
    """Exception type for a non-XML content type.

    NOTE(review): the exact trigger is defined by the raise sites elsewhere
    in this module -- confirm there.
    """
123
# Override three of sgmllib's module-level regular expressions.
# NOTE(review): presumably this makes the SGML-based parser accept
# namespaced element names (note the ':' in tagfind) -- confirm against
# how sgmllib consumes these patterns.

# tagfind: a letter followed by any run of letters, digits, '-', '_', '.'
# or ':'.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
# special: the two-character sequence '<!'.
sgmllib.special = re.compile('<!')
# charref: '&#', then an optional 'x' and one or more hex digits (captured
# as group 1), then a single character that is not a hex digit.
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
127
# Feed type/version codes and the display name for each one.
# The empty code '' stands for a feed whose type is unknown.
SUPPORTED_VERSIONS = {
    '': 'unknown',
    'rss090': 'RSS 0.90',
    'rss091n': 'RSS 0.91 (Netscape)',
    'rss091u': 'RSS 0.91 (Userland)',
    'rss092': 'RSS 0.92',
    'rss093': 'RSS 0.93',
    'rss094': 'RSS 0.94',
    'rss20': 'RSS 2.0',
    'rss10': 'RSS 1.0',
    'rss': 'RSS (unknown version)',
    'atom01': 'Atom 0.1',
    'atom02': 'Atom 0.2',
    'atom03': 'Atom 0.3',
    'atom': 'Atom (unknown version)',
    'cdf': 'CDF',
    'hotrss': 'Hot RSS',
}
145
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc

class FeedParserDict(UserDict):
    """Dictionary with legacy key aliases and attribute-style read access.

    Lookups by an old key name are redirected to the current one
    ('channel' -> 'feed', 'items' -> 'entries', ...).  'description' is
    resolved to the first of 'tagline' / 'summary' that is present, falling
    back to a literal 'description' key.  Keys can also be read as
    attributes (d.feed), as long as the name is not already a real
    attribute or method of the dictionary.

    Membership tests (has_key / in) answer exactly the question "would
    d[key] succeed?", so they honour the same aliases and never report
    dictionary method names such as 'keys' or 'items' as present.
    """

    # Legacy key -> current key.  A list value means "try these in order".
    # Built once here instead of on every lookup.
    _keymap = {'channel': 'feed',
               'items': 'entries',
               'guid': 'id',
               'date': 'modified',
               'date_parsed': 'modified_parsed',
               'description': ['tagline', 'summary']}

    def __getitem__(self, key):
        """Return the value for key, following legacy aliases.

        Raises KeyError when neither the alias target(s) nor, for list
        aliases, the literal key is present.
        """
        realkey = self._keymap.get(key, key)
        # type([]) rather than the name `list`: on Python 2.1 `list` is a
        # builtin function, not a type, and isinstance() would reject it.
        if isinstance(realkey, type([])):
            for k in realkey:
                try:
                    return UserDict.__getitem__(self, k)
                except KeyError:
                    pass
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def has_key(self, key):
        """Return true if self[key] would succeed (aliases included)."""
        try:
            self.__getitem__(key)
        except KeyError:
            return False
        return True

    def __getattr__(self, key):
        """Expose dictionary keys as attributes.

        Only called when normal attribute lookup fails.  Raises
        AttributeError (not KeyError) for a missing name so hasattr() and
        getattr() with a default behave normally.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError("object has no attribute '%s'" % key)

    def __contains__(self, key):
        """Support `key in d` with the same semantics as has_key()."""
        return self.has_key(key)
188
def zopeCompatibilityHack():
    """Replace the module-level FeedParserDict with a plain-dict factory.

    After this call the global name FeedParserDict is a function that
    returns an ordinary dictionary, filled from aDict when aDict is given
    and non-empty.  The original class is no longer reachable through the
    module global.
    """
    global FeedParserDict
    del FeedParserDict

    def FeedParserDict(aDict=None):
        plain = {}
        if not aDict:
            return plain
        plain.update(aDict)
        return plain
197
198_ebcdic_to_ascii_map = None
199def _ebcdic_to_ascii(s):
200    global _ebcdic_to_ascii_map
201    if not _ebcdic_to_ascii_map:
202        emap = (
203            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
204            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
205            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
206            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
207            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
208            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
209            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
210            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
211            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
212            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
213            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
214            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
215            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
216            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
217            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
218            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
219            )
220        import string
221        _ebcdic_to_ascii_map = string.maketrans( \
222            "".join(map(chr, range(256))), "".join(map(chr, emap)))
223    return s.translate(_ebcdic_to_ascii_map)
224
225class _FeedParserMixin:
226    namespaces = {"": "",
227                  "http://backend.userland.com/rss": "",
228                  "http://blogs.law.harvard.edu/tech/rss": "",
229                  "http://purl.org/rss/1.0/": "",
230                  "http://my.netscape.com/rdf/simple/0.9/": "",
231                  "http://example.com/newformat#": "",
232                  "http://example.com/necho": "",
233                  "http://purl.org/echo/": "",
234                  "uri/of/echo/namespace#": "",
235                  "http://purl.org/pie/": "",
236                  "http://purl.org/atom/ns#": "",
237                  "http://purl.org/rss/1.0/modules/rss091#": "",
238
239                  "http://webns.net/mvcb/":                               "admin",
240                  "http://purl.org/rss/1.0/modules/aggregation/":         "ag",
241                  "http://purl.org/rss/1.0/modules/annotate/":            "annotate",
242                  "http://media.tangent.org/rss/1.0/":                    "audio",
243                  "http://backend.userland.com/blogChannelModule":        "blogChannel",
244                  "http://web.resource.org/cc/":                          "cc",
245                  "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons",
246                  "http://purl.org/rss/1.0/modules/company":              "co",
247                  "http://purl.org/rss/1.0/modules/content/":             "content",
248                  "http://my.theinfo.org/changed/1.0/rss/":               "cp",
249                  "http://purl.org/dc/elements/1.1/":                     "dc",
250                  "http://purl.org/dc/terms/":                            "dcterms",
251                  "http://purl.org/rss/1.0/modules/email/":               "email",
252                  "http://purl.org/rss/1.0/modules/event/":               "ev",
253                  "http://postneo.com/icbm/":                             "icbm",
254                  "http://purl.org/rss/1.0/modules/image/":               "image",
255                  "http://xmlns.com/foaf/0.1/":                           "foaf",
256                  "http://freshmeat.net/rss/fm/":                         "fm",
257                  "http://purl.org/rss/1.0/modules/link/":                "l",
258                  "http://madskills.com/public/xml/rss/module/pingback/": "pingback",
259                  "http://prismstandard.org/namespaces/1.2/basic/":       "prism",
260                  "http://www.w3.org/1999/02/22-rdf-syntax-ns#":          "rdf",
261                  "http://www.w3.org/2000/01/rdf-schema#":                "rdfs",
262                  "http://purl.org/rss/1.0/modules/reference/":           "ref",
263                  "http://purl.org/rss/1.0/modules/richequiv/":           "reqv",
264                  "http://purl.org/rss/1.0/modules/search/":              "search",
265                  "http://purl.org/rss/1.0/modules/slash/":               "slash",
266                  "http://purl.org/rss/1.0/modules/servicestatus/":       "ss",
267                  "http://hacks.benhammersley.com/rss/streaming/":        "str",
268                  "http://purl.org/rss/1.0/modules/subscription/":        "sub",
269                  "http://purl.org/rss/1.0/modules/syndication/":         "sy",
270                  "http://purl.org/rss/1.0/modules/taxonomy/":            "taxo",
271                  "http://purl.org/rss/1.0/modules/threading/":           "thr",
272                  "http://purl.org/rss/1.0/modules/textinput/":           "ti",
273                  "http://madskills.com/public/xml/rss/module/trackback/":"trackback",
274                  "http://wellformedweb.org/CommentAPI/":                 "wfw",
275                  "http://purl.org/rss/1.0/modules/wiki/":                "wiki",
276                  "http://schemas.xmlsoap.org/soap/envelope/":            "soap",
277                  "http://www.w3.org/1999/xhtml":                         "xhtml",
278                  "http://www.w3.org/XML/1998/namespace":                 "xml"
279}
280
281    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments', 'license']
282    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
283    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description']
284    html_types = ['text/html', 'application/xhtml+xml']
285
286    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
287        if _debug: sys.stderr.write("initializing FeedParser\n")
288        self.feeddata = FeedParserDict() # feed-level data
289        self.encoding = encoding # character encoding
290        self.entries = [] # list of entry-level data
291        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
292
293        # the following are used internally to track state;
294        # some of this is kind of out of control and should
295        # probably be refactored into a finite state machine
296        self.infeed = 0
297        self.inentry = 0
298        self.incontent = 0
299        self.intextinput = 0
300        self.inimage = 0
301        self.inauthor = 0
302        self.incontributor = 0
303        self.contentparams = FeedParserDict()
304        self.namespacemap = {}
305        self.elementstack = []
306        self.basestack = []
307        self.langstack = []
308        self.baseuri = baseuri or ''
309        self.lang = baselang or None
310        if baselang:
311            self.feeddata['language'] = baselang
312
313    def unknown_starttag(self, tag, attrs):
314        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
315        # normalize attrs
316        attrs = [(k.lower(), v) for k, v in attrs]
317        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
318
319        # track xml:base and xml:lang
320        attrsD = dict(attrs)
321        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
322        self.baseuri = baseuri
323        lang = attrsD.get('xml:lang', attrsD.get('lang'))
324        if lang == '':
325            # xml:lang could be explicitly set to '', we need to capture that
326            lang = None
327        elif lang is None:
328            # if no xml:lang is specified, use parent lang
329            lang = self.lang
330        if lang:
331            if tag in ('feed', 'rss', 'rdf:RDF'):
332                self.feeddata['language'] = lang
333        self.lang = lang
334        self.basestack.append(baseuri)
335        self.langstack.append(lang)
336
337        # track namespaces
338        for prefix, uri in attrs:
339            if prefix.startswith('xmlns:'):
340                self.trackNamespace(prefix[6:], uri)
341            elif prefix == 'xmlns':
342                self.trackNamespace(None, uri)
343
344        # track inline content
345        if self.incontent and self.contentparams.get('mode') == 'escaped':
346            # element declared itself as escaped markup, but it isn't really
347            self.contentparams['mode'] = 'xml'
348        if self.incontent and self.contentparams.get('mode') == 'xml':
349            # Note: probably shouldn't simply recreate localname here, but
350            # our namespace handling isn't actually 100% correct in cases where
351            # the feed redefines the default namespace (which is actually
352            # the usual case for inline content, thanks Sam), so here we
353            # cheat and just reconstruct the element based on localname
354            # because that compensates for the bugs in our namespace handling.
355            # This will horribly munge inline content with non-empty qnames,
356            # but nobody actually does that, so I'm not fixing it.
357            tag = tag.split(':')[-1]
358            return self.handle_data("<%s%s>" % (tag, "".join([' %s="