| 1 | # $Id: feedparser.py 216 2006-11-12 11:36:49Z tarek $ |
|---|
| 2 | """Universal feed parser |
|---|
| 3 | |
|---|
| 4 | Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom feeds |
|---|
| 5 | |
|---|
| 6 | Visit http://feedparser.org/ for the latest version |
|---|
| 7 | Visit http://feedparser.org/docs/ for the latest documentation |
|---|
| 8 | |
|---|
| 9 | Required: Python 2.1 or later |
|---|
| 10 | Recommended: Python 2.3 or later |
|---|
| 11 | Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> |
|---|
| 12 | """ |
|---|
| 13 | |
|---|
# ---------- package metadata ----------
__version__ = "3.3"
__license__ = "Python"
__copyright__ = "Copyright 2002-4, Mark Pilgrim"
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = [
    "Jason Diamond <http://injektilo.org/>",
    "John Beimler <http://john.beimler.org/>",
    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
    "Aaron Swartz <http://aaronsw.com>",
]

# When true, diagnostic messages are written to sys.stderr.
_debug = 0

# Value of the HTTP "User-Agent" header sent when downloading feeds.
# Applications that embed feedparser should replace this with their own
# application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# Value of the HTTP "Accept" header sent when downloading feeds.
# Set this to None if no Accept header should be sent.
ACCEPT_HEADER = (
    "application/atom+xml,"
    "application/rdf+xml,"
    "application/rss+xml,"
    "application/x-netcdf,"
    "application/xml;q=0.9,"
    "text/xml;q=0.2,"
    "*/*;q=0.1"
)

# SAX driver names to try first when looking for an XML parser. If none of
# them is installed, Python keeps searching its own list of pre-installed
# parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# Set to 1 to have HTML markup run through HTML Tidy automatically.
# Disabled by default because crashes have been reported on some platforms;
# if it crashes for you, please file a bug report with your OS platform,
# Python version, and the URL of the feed you were trying to parse.
# Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
TIDY_MARKUP = 0
|---|
| 45 | |
|---|
| 46 | # ---------- required modules (should come with any Python distribution) ---------- |
|---|
| 47 | import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi |
|---|
| 48 | try: |
|---|
| 49 | from cStringIO import StringIO as _StringIO |
|---|
| 50 | except: |
|---|
| 51 | from StringIO import StringIO as _StringIO |
|---|
| 52 | |
|---|
| 53 | # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- |
|---|
| 54 | |
|---|
| 55 | # gzip is included with most Python distributions, but may not be available if you compiled your own |
|---|
| 56 | try: |
|---|
| 57 | import gzip |
|---|
| 58 | except: |
|---|
| 59 | gzip = None |
|---|
| 60 | try: |
|---|
| 61 | import zlib |
|---|
| 62 | except: |
|---|
| 63 | zlib = None |
|---|
| 64 | |
|---|
| 65 | # timeoutsocket allows feedparser to time out rather than hang forever on ultra-slow servers. |
|---|
| 66 | # Python 2.3 now has this functionality available in the standard socket library, so under |
|---|
| 67 | # 2.3 you don't need to install anything. But you probably should anyway, because the socket |
|---|
| 68 | # module is buggy and timeoutsocket is better. |
|---|
| 69 | try: |
|---|
| 70 | import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py |
|---|
| 71 | timeoutsocket.setDefaultSocketTimeout(20) |
|---|
| 72 | except ImportError: |
|---|
| 73 | import socket |
|---|
| 74 | if hasattr(socket, 'setdefaulttimeout'): |
|---|
| 75 | socket.setdefaulttimeout(20) |
|---|
| 76 | import urllib, urllib2 |
|---|
| 77 | |
|---|
| 78 | _mxtidy = None |
|---|
| 79 | if TIDY_MARKUP: |
|---|
| 80 | try: |
|---|
| 81 | from mx.Tidy import Tidy as _mxtidy |
|---|
| 82 | except: |
|---|
| 83 | pass |
|---|
| 84 | |
|---|
| 85 | # If a real XML parser is available, feedparser will attempt to use it. feedparser has |
|---|
| 86 | # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the |
|---|
| 87 | # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some |
|---|
| 88 | # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. |
|---|
# Probe for a usable SAX parser. On success the real saxutils.escape is bound
# to _xmlescape and _XML_AVAILABLE is 1; on any failure _XML_AVAILABLE is 0
# and a minimal pure-Python replacement for escape() is defined instead.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    # Deliberately broad: a missing xml package, a missing driver, or a
    # driver that fails while loading all mean the same thing here.
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        """Return data with '&', '>' and '<' replaced by XML entities.

        The ampersand must be replaced first; otherwise the '&' introduced
        by the '&gt;' and '&lt;' entities would itself be escaped again.
        """
        data = data.replace("&", "&amp;")
        data = data.replace(">", "&gt;")
        data = data.replace("<", "&lt;")
        return data
|---|
| 101 | |
|---|
| 102 | # base64 support for Atom feeds that contain embedded binary data |
|---|
| 103 | try: |
|---|
| 104 | import base64, binascii |
|---|
| 105 | except: |
|---|
| 106 | base64 = binascii = None |
|---|
| 107 | |
|---|
| 108 | # cjkcodecs and iconv_codec provide support for more character encodings. |
|---|
| 109 | # Both are available from http://cjkpython.i18n.org/ |
|---|
| 110 | try: |
|---|
| 111 | import cjkcodecs.aliases |
|---|
| 112 | except: |
|---|
| 113 | pass |
|---|
| 114 | try: |
|---|
| 115 | import iconv_codec |
|---|
| 116 | except: |
|---|
| 117 | pass |
|---|
| 118 | |
|---|
| 119 | # ---------- don't touch these ---------- |
|---|
# Marker exception types. They add nothing to Exception; the code that
# raises and handles them lies outside this part of the file.
class CharacterEncodingOverride(Exception):
    pass


class CharacterEncodingUnknown(Exception):
    pass


class NonXMLContentType(Exception):
    pass
|---|
| 123 | |
|---|
# Replace three of sgmllib's module-level regular expressions.
# tagfind: a letter followed by any run of letters, digits, '-', '_', '.'
# or ':'.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
# special: the two characters '<!'.
sgmllib.special = re.compile('<!')
# charref: '&#', then an optional 'x' and one or more hex digits (captured
# as group 1), then exactly one character that is not a hex digit.
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
|---|
| 127 | |
|---|
# Maps each feed type/version code to a human-readable name.
# The empty-string code stands for an unidentified feed.
SUPPORTED_VERSIONS = {
    '': 'unknown',
    # RSS family
    'rss090': 'RSS 0.90',
    'rss091n': 'RSS 0.91 (Netscape)',
    'rss091u': 'RSS 0.91 (Userland)',
    'rss092': 'RSS 0.92',
    'rss093': 'RSS 0.93',
    'rss094': 'RSS 0.94',
    'rss20': 'RSS 2.0',
    'rss10': 'RSS 1.0',
    'rss': 'RSS (unknown version)',
    # Atom family
    'atom01': 'Atom 0.1',
    'atom02': 'Atom 0.2',
    'atom03': 'Atom 0.3',
    'atom': 'Atom (unknown version)',
    # other formats
    'cdf': 'CDF',
    'hotrss': 'Hot RSS',
}
|---|
| 145 | |
|---|
# Choose the base class for FeedParserDict: the built-in dict type when the
# interpreter has one, otherwise the UserDict class.
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        """Build a plain dictionary from a sequence of (key, value) pairs."""
        rc = {}
        for k, v in aList:
            rc[k] = v
        return rc


class FeedParserDict(UserDict):
    """Dictionary with alias keys and attribute-style read access.

    Item lookup translates a few alternative key names to the names the
    data is stored under (for example 'channel' -> 'feed'). Keys can also
    be read as attributes: d.feed is the same as d['feed'].
    """

    # Alias -> stored key. A list value means "first of these that is
    # present". Kept on the class so it is built once, not on every lookup.
    _keymap = {'channel': 'feed',
               'items': 'entries',
               'guid': 'id',
               'date': 'modified',
               'date_parsed': 'modified_parsed',
               'description': ['tagline', 'summary']}

    def __getitem__(self, key):
        """Return the value stored under key, resolving aliases first.

        Raises KeyError when neither the key nor any of its aliases is set.
        """
        realkey = self._keymap.get(key, key)
        # type([]) rather than the name `list`, which is not a type on the
        # oldest interpreters this module supports.
        if isinstance(realkey, type([])):
            for candidate in realkey:
                try:
                    return UserDict.__getitem__(self, candidate)
                except KeyError:
                    pass
            # None of the candidates is present; fall back to the literal
            # key so that a failure reports the name the caller asked for.
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def has_key(self, key):
        """Return 1 if key (or the key it is an alias for) is set, else 0.

        Membership is decided by item lookup only. Attribute lookup must
        not be used here: it would report every method name of the base
        class ('keys', 'items', 'get', ...) as a key.
        """
        try:
            self.__getitem__(key)
        except KeyError:
            return 0
        return 1

    def __getattr__(self, key):
        """Give read access to items as attributes.

        Raises AttributeError when key is neither an instance attribute nor
        a key that item lookup can resolve.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            return self.__getitem__(key)
        except Exception:
            # Whatever went wrong in the lookup, attribute access must fail
            # with AttributeError so that getattr()/hasattr() keep working.
            raise AttributeError("object has no attribute '%s'" % key)

    def __contains__(self, key):
        """Support `key in d` with the same rules as has_key()."""
        return self.has_key(key)
|---|
| 188 | |
|---|
def zopeCompatibilityHack():
    """Rebind the module-level name FeedParserDict to a plain-dict factory.

    After this call FeedParserDict(aDict) returns an ordinary dictionary
    holding a copy of aDict's items (an empty one when aDict is omitted or
    empty) instead of an instance of the FeedParserDict class.
    """
    global FeedParserDict
    del FeedParserDict

    def FeedParserDict(aDict=None):
        if not aDict:
            return {}
        plain = {}
        plain.update(aDict)
        return plain
|---|
| 197 | |
|---|
# 256-character translation table; built on the first call to
# _ebcdic_to_ascii() and reused afterwards.
_ebcdic_to_ascii_map = None


def _ebcdic_to_ascii(s):
    """Return s with every character replaced through a fixed 256-entry table.

    Entry i of the table is the output character code for input character
    code i. As the function name indicates, the table is meant to convert
    EBCDIC-encoded text.
    """
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        output_codes = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        # The table is simply the output characters in input-code order,
        # which is what a maketrans() call over the identity string yields.
        _ebcdic_to_ascii_map = "".join([chr(code) for code in output_codes])
    return s.translate(_ebcdic_to_ascii_map)
|---|
| 224 | |
|---|
| 225 | class _FeedParserMixin: |
|---|
| 226 | namespaces = {"": "", |
|---|
| 227 | "http://backend.userland.com/rss": "", |
|---|
| 228 | "http://blogs.law.harvard.edu/tech/rss": "", |
|---|
| 229 | "http://purl.org/rss/1.0/": "", |
|---|
| 230 | "http://my.netscape.com/rdf/simple/0.9/": "", |
|---|
| 231 | "http://example.com/newformat#": "", |
|---|
| 232 | "http://example.com/necho": "", |
|---|
| 233 | "http://purl.org/echo/": "", |
|---|
| 234 | "uri/of/echo/namespace#": "", |
|---|
| 235 | "http://purl.org/pie/": "", |
|---|
| 236 | "http://purl.org/atom/ns#": "", |
|---|
| 237 | "http://purl.org/rss/1.0/modules/rss091#": "", |
|---|
| 238 | |
|---|
| 239 | "http://webns.net/mvcb/": "admin", |
|---|
| 240 | "http://purl.org/rss/1.0/modules/aggregation/": "ag", |
|---|
| 241 | "http://purl.org/rss/1.0/modules/annotate/": "annotate", |
|---|
| 242 | "http://media.tangent.org/rss/1.0/": "audio", |
|---|
| 243 | "http://backend.userland.com/blogChannelModule": "blogChannel", |
|---|
| 244 | "http://web.resource.org/cc/": "cc", |
|---|
| 245 | "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons", |
|---|
| 246 | "http://purl.org/rss/1.0/modules/company": "co", |
|---|
| 247 | "http://purl.org/rss/1.0/modules/content/": "content", |
|---|
| 248 | "http://my.theinfo.org/changed/1.0/rss/": "cp", |
|---|
| 249 | "http://purl.org/dc/elements/1.1/": "dc", |
|---|
| 250 | "http://purl.org/dc/terms/": "dcterms", |
|---|
| 251 | "http://purl.org/rss/1.0/modules/email/": "email", |
|---|
| 252 | "http://purl.org/rss/1.0/modules/event/": "ev", |
|---|
| 253 | "http://postneo.com/icbm/": "icbm", |
|---|
| 254 | "http://purl.org/rss/1.0/modules/image/": "image", |
|---|
| 255 | "http://xmlns.com/foaf/0.1/": "foaf", |
|---|
| 256 | "http://freshmeat.net/rss/fm/": "fm", |
|---|
| 257 | "http://purl.org/rss/1.0/modules/link/": "l", |
|---|
| 258 | "http://madskills.com/public/xml/rss/module/pingback/": "pingback", |
|---|
| 259 | "http://prismstandard.org/namespaces/1.2/basic/": "prism", |
|---|
| 260 | "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", |
|---|
| 261 | "http://www.w3.org/2000/01/rdf-schema#": "rdfs", |
|---|
| 262 | "http://purl.org/rss/1.0/modules/reference/": "ref", |
|---|
| 263 | "http://purl.org/rss/1.0/modules/richequiv/": "reqv", |
|---|
| 264 | "http://purl.org/rss/1.0/modules/search/": "search", |
|---|
| 265 | "http://purl.org/rss/1.0/modules/slash/": "slash", |
|---|
| 266 | "http://purl.org/rss/1.0/modules/servicestatus/": "ss", |
|---|
| 267 | "http://hacks.benhammersley.com/rss/streaming/": "str", |
|---|
| 268 | "http://purl.org/rss/1.0/modules/subscription/": "sub", |
|---|
| 269 | "http://purl.org/rss/1.0/modules/syndication/": "sy", |
|---|
| 270 | "http://purl.org/rss/1.0/modules/taxonomy/": "taxo", |
|---|
| 271 | "http://purl.org/rss/1.0/modules/threading/": "thr", |
|---|
| 272 | "http://purl.org/rss/1.0/modules/textinput/": "ti", |
|---|
| 273 | "http://madskills.com/public/xml/rss/module/trackback/":"trackback", |
|---|
| 274 | "http://wellformedweb.org/CommentAPI/": "wfw", |
|---|
| 275 | "http://purl.org/rss/1.0/modules/wiki/": "wiki", |
|---|
| 276 | "http://schemas.xmlsoap.org/soap/envelope/": "soap", |
|---|
| 277 | "http://www.w3.org/1999/xhtml": "xhtml", |
|---|
| 278 | "http://www.w3.org/XML/1998/namespace": "xml" |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'comments', 'license'] |
|---|
| 282 | can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description'] |
|---|
| 283 | can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'copyright', 'description'] |
|---|
| 284 | html_types = ['text/html', 'application/xhtml+xml'] |
|---|
| 285 | |
|---|
def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
    """Create empty result containers and reset all state-tracking fields.

    baseuri  -- initial base URI; stored as '' when not given
    baselang -- initial language; when given it is also recorded as the
                feed-level 'language' value
    encoding -- character encoding, stored unchanged on the instance
    """
    if _debug:
        sys.stderr.write("initializing FeedParser\n")

    # Parse results.
    self.feeddata = FeedParserDict()    # feed-level data
    self.entries = []                   # list of entry-level data
    self.version = ''                   # feed type/version, see SUPPORTED_VERSIONS
    self.encoding = encoding            # character encoding

    # Internal state tracking. The original author notes that this has
    # grown unwieldy and should probably become a finite state machine.
    for flag in ('infeed', 'inentry', 'incontent', 'intextinput',
                 'inimage', 'inauthor', 'incontributor'):
        setattr(self, flag, 0)
    self.contentparams = FeedParserDict()
    self.namespacemap = {}
    self.elementstack = []
    self.basestack = []
    self.langstack = []

    # Starting base URI and language.
    self.baseuri = baseuri or ''
    self.lang = baselang or None
    if baselang:
        self.feeddata['language'] = baselang
|---|
| 312 | |
|---|
| 313 | def unknown_starttag(self, tag, attrs): |
|---|
| 314 | if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs)) |
|---|
| 315 | # normalize attrs |
|---|
| 316 | attrs = [(k.lower(), v) for k, v in attrs] |
|---|
| 317 | attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] |
|---|
| 318 | |
|---|
| 319 | # track xml:base and xml:lang |
|---|
| 320 | attrsD = dict(attrs) |
|---|
| 321 | baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri |
|---|
| 322 | self.baseuri = baseuri |
|---|
| 323 | lang = attrsD.get('xml:lang', attrsD.get('lang')) |
|---|
| 324 | if lang == '': |
|---|
| 325 | # xml:lang could be explicitly set to '', we need to capture that |
|---|
| 326 | lang = None |
|---|
| 327 | elif lang is None: |
|---|
| 328 | # if no xml:lang is specified, use parent lang |
|---|
| 329 | lang = self.lang |
|---|
| 330 | if lang: |
|---|
| 331 | if tag in ('feed', 'rss', 'rdf:RDF'): |
|---|
| 332 | self.feeddata['language'] = lang |
|---|
| 333 | self.lang = lang |
|---|
| 334 | self.basestack.append(baseuri) |
|---|
| 335 | self.langstack.append(lang) |
|---|
| 336 | |
|---|
| 337 | # track namespaces |
|---|
| 338 | for prefix, uri in attrs: |
|---|
| 339 | if prefix.startswith('xmlns:'): |
|---|
| 340 | self.trackNamespace(prefix[6:], uri) |
|---|
| 341 | elif prefix == 'xmlns': |
|---|
| 342 | self.trackNamespace(None, uri) |
|---|
| 343 | |
|---|
| 344 | # track inline content |
|---|
| 345 | if self.incontent and self.contentparams.get('mode') == 'escaped': |
|---|
| 346 | # element declared itself as escaped markup, but it isn't really |
|---|
| 347 | self.contentparams['mode'] = 'xml' |
|---|
| 348 | if self.incontent and self.contentparams.get('mode') == 'xml': |
|---|
| 349 | # Note: probably shouldn't simply recreate localname here, but |
|---|
| 350 | # our namespace handling isn't actually 100% correct in cases where |
|---|
| 351 | # the feed redefines the default namespace (which is actually |
|---|
| 352 | # the usual case for inline content, thanks Sam), so here we |
|---|
| 353 | # cheat and just reconstruct the element based on localname |
|---|
| 354 | # because that compensates for the bugs in our namespace handling. |
|---|
| 355 | # This will horribly munge inline content with non-empty qnames, |
|---|
| 356 | # but nobody actually does that, so I'm not fixing it. |
|---|
| 357 | tag = tag.split(':')[-1] |
|---|
| 358 | return self.handle_data("<%s%s>" % (tag, "".join([' %s=" |
|---|