flexget.plugins.input_rss
Covered: 253 lines
Missed: 47 lines
Skipped 104 lines
Percent: 84 %
  1
import logging
  2
import urlparse
  3
import xml.sax
  4
import posixpath
  5
import feedparser
  6
import urllib2
  7
import httplib
  8
import socket
  9
from flexget.feed import Entry
 10
from flexget.plugin import register_plugin, internet, PluginError
 11
from flexget.utils.cached_input import cached
 12
from flexget.utils.tools import urlopener
 14
# Module-level logger named 'rss'; used for all log output in this module.
log = logging.getLogger('rss')
 17
class InputRSS(object):
 18
    """
 19
        Parses RSS feed.
 21
        Hassle-free configuration for public RSS feeds:
 23
        rss: <url>
 25
        Configuration with basic http authentication:
 27
        rss:
 28
          url: <url>
 29
          username: <name>
 30
          password: <password>
 32
        Advanced usages:
 34
        You may wish to clean up the entry by stripping out all non-ascii characters.
 35
        This can be done by setting ascii value to yes.
 37
        Example:
 39
        rss:
 40
          url: <url>
 41
          ascii: yes
 43
        In case the RSS feed uses some nonstandard field for urls and automatic detection fails
 44
        you can configure plugin to use url from any feedparser entry attribute.
 46
        Example:
 48
        rss:
 49
          url: <url>
 50
          link: guid
 52
        If you want to keep information in another rss field attached to the flexget entry, you can use the other_fields option.
 54
        Example:
 56
        rss:
 57
          url: <url>
 58
          other_fields: [date]
 60
        You can disable a few possibly annoying warnings by setting the silent value to
 61
        yes on feeds where there are frequently invalid items.
 63
        Example:
 65
        rss:
 66
          url: <url>
 67
          silent: yes
 69
        You can group all the links of an item, to make the download plugin tolerant
 70
        to broken urls: it will try to download each url until one works.
 71
        Links are the item fields given by the link value followed by the enclosures, in that order.
 72
        The value to set is "group_links".
 74
        Example:
 76
        rss:
 77
          url: <url>
 78
          group_links: yes
 79
    """
 81
    def validator(self):
        """Build the validator describing every accepted shape of the rss config.

        The config is either a bare url / file value, or a dict that must
        contain ``url`` and may contain the optional keys registered below.
        """
        from flexget import validator

        root = validator.factory()

        # Short form: the whole value is just the feed location.
        for simple_type in ('url', 'file'):
            root.accept(simple_type)

        # Long form: a dict whose only mandatory key is ``url``.
        options = root.accept('dict')
        options.accept('url', key='url', required=True)
        options.accept('file', key='url')
        for text_key in ('username', 'password', 'link'):
            options.accept('text', key=text_key)
        # ``link`` may alternatively be a list of field names.
        for list_key in ('link', 'other_fields'):
            options.accept('list', key=list_key).accept('text')
        for flag_key in ('silent', 'ascii', 'filename', 'group_links'):
            options.accept('boolean', key=flag_key)

        return root
100
    def build_config(self, config):
101
        """Set default values to config"""
102
        if isinstance(config, basestring):
103
            config = {'url': config}
105
        config.setdefault('link', 'auto')
107
        if config.get('other_fields'):
108
            config['other_fields'] = [field.replace(':', '_').lower() for field in config['other_fields']]
110
        config.setdefault('group_links', False)
112
        if 'username' in config and 'password' in config:
113
            config['url'] = self.passwordize(config['url'], config['username'], config['password'])
114
        return config
116
    def passwordize(self, url, user, password):
        """Return ``url`` with ``user:password@`` prepended to its network location.

        The credentials are inserted verbatim (no quoting); every other url
        component is passed through unchanged.
        """
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        credentials = user + ':' + password
        return urlparse.urlunsplit((scheme, credentials + '@' + netloc, path, query, fragment))
123
    def process_invalid_content(self, feed, url):
        """If feedparser reports error, save the received data and log error.

        Downloads ``url`` again, logs a few hints about what the content looks
        like (HTML page, login page, error page) and stores it as
        ``<config_base>/received/<feed name>.<xml|html>`` so the user can
        inspect it. Returns silently when ``url`` is rejected as invalid by
        ``urlopener`` (which is expected when the source is a local file).
        """
        import os

        log.critical('Invalid XML received from feed %s' % feed.name)
        try:
            req = urlopener(url, log)
        except ValueError:
            log.debug('invalid url %s (ok for a file)' % url)
            return
        # Always release the connection, even when reading fails.
        try:
            data = req.read()
        finally:
            req.close()

        # Lowercase once instead of once per check.
        lowered = data.lower()
        ext = 'xml'
        # Match the tag prefix so `<html lang="en">` and similar are detected too.
        if '<html' in lowered:
            log.critical('Received content is HTML page, not an RSS feed')
            ext = 'html'
        if 'login' in lowered or 'username' in lowered:
            log.critical('Received content looks a bit like login page')
        if 'error' in lowered:
            log.critical('Received content looks a bit like error page')

        received = os.path.join(feed.manager.config_base, 'received')
        if not os.path.isdir(received):
            os.mkdir(received)
        filename = os.path.join(received, '%s.%s' % (feed.name, ext))
        f = open(filename, 'w')
        # try/finally rather than `with`: the file uses no with-statements elsewhere.
        try:
            f.write(data)
        finally:
            f.close()
        log.critical('I have saved the invalid content to %s for you to view' % filename)
151
    def add_enclosure_info(self, entry, enclosure, filename=True, multiple=False):
        """Stores information from an rss enclosure into an Entry.

        Sets ``entry['url']`` from the enclosure ``href`` (which must be
        present), ``entry['size']`` from ``length`` (0 when it is not a valid
        integer) and ``entry['type']`` from ``type`` when those keys exist.

        When ``filename`` is true, ``entry['filename']`` is set to the last
        path component of the url, provided that component is non-empty and
        the enclosure has a non-zero size or ``multiple`` is true.
        """
        entry['url'] = enclosure['href']

        if 'length' in enclosure:
            try:
                entry['size'] = int(enclosure['length'])
            except (ValueError, TypeError):
                # Missing or non-numeric length; treat the size as unknown.
                entry['size'] = 0
        if 'type' in enclosure:
            entry['type'] = enclosure['type']

        # Parse filename from the path part of the enclosure url.
        basename = posixpath.basename(urlparse.urlsplit(entry['url']).path)

        # Never store an empty filename (url path ending in '/').
        if filename and basename and (entry.get('size') or multiple):
            entry['filename'] = basename
            log.debugall('filename `%s` from enclosure' % entry['filename'])
170
    @cached('rss', 'url')
    @internet(log)
    def on_feed_input(self, feed, config):
        """Fetch the configured RSS feed and convert its items into entries.

        ``config`` is the raw plugin config (string or dict); it is normalized
        with ``build_config`` first.

        Returns a list of ``Entry`` objects. Returns ``None`` when the server
        answers 304 (not modified) or when feedparser flags the feed as bozo
        and the error is not one of the ignorable kinds.

        Raises ``PluginError`` on HTTP status 401, 404 and 500, and when the
        content is invalid and feedparser produced no entries at all.
        ``httplib.BadStatusLine`` / ``IOError`` reported by feedparser are
        re-raised for the ``@internet`` decorator.
        """
        from flexget.utils.tools import decode_html

        config = self.build_config(config)

        log.debug('Checking feed %s (%s)' % (feed.name, config['url']))

        # NOTE: conditional GET (etag / last-modified) is currently disabled,
        # so no validators are sent and the feed is always fetched.
        etag = None
        modified = None

        # Apply a 60 second timeout through the process-wide socket default and
        # always restore the previous value, even if parsing raises.
        orig_timeout = socket.getdefaulttimeout()
        socket.setdefaulttimeout(60)
        try:
            # Reuse the handlers of the globally installed urllib2 opener, if any.
            if urllib2._opener:
                rss = feedparser.parse(config['url'], etag=etag, modified=modified,
                                       handlers=urllib2._opener.handlers)
            else:
                rss = feedparser.parse(config['url'], etag=etag, modified=modified)
        finally:
            socket.setdefaulttimeout(orig_timeout)

        # Status checks
        status = rss.get('status', False)
        if not status:
            log.debug('RSS does not have status (normal if processing a file)')
        elif status == 304:
            log.debug('Feed %s hasn\'t changed, skipping' % feed.name)
            return
        elif status == 401:
            # The header may be absent; do not mask the real problem with a KeyError.
            challenge = rss.get('headers', {}).get('www-authenticate')
            raise PluginError('Authentication needed for feed %s: %s' % \
                (feed.name, challenge), log)
        elif status == 404:
            raise PluginError('RSS Feed %s not found' % feed.name, log)
        elif status == 500:
            raise PluginError('Internal server exception on feed %s' % feed.name, log)

        # Decide whether a parse problem reported by feedparser can be ignored.
        ex = rss.get('bozo_exception', False)
        ignore = False
        if ex:
            if isinstance(ex, feedparser.NonXMLContentType):
                log.debug('ignoring feedparser.NonXMLContentType')
                ignore = True
            elif isinstance(ex, feedparser.CharacterEncodingOverride):
                log.debug('ignoring feedparser.CharacterEncodingOverride')
                ignore = True
            elif isinstance(ex, UnicodeEncodeError):
                if rss.entries:
                    log.info('Feed has UnicodeEncodeError but seems to produce entries, ignoring the error ...')
                    ignore = True
            elif isinstance(ex, xml.sax._exceptions.SAXParseException):
                if not rss.entries:
                    # Save the invalid data for review; users are often confused
                    # when e.g. a login page is received instead of a feed.
                    self.process_invalid_content(feed, config['url'])
                    if feed.manager.options.debug:
                        log.exception(ex)
                    raise PluginError('Received invalid RSS content')
                else:
                    msg = 'Invalid XML received. However feedparser still produced entries. Ignoring the error ...'
                    if not config.get('silent', False):
                        log.info(msg)
                    else:
                        log.debug(msg)
                    ignore = True
            elif isinstance(ex, (httplib.BadStatusLine, IOError)):
                raise ex # let the @internet decorator handle
            else:
                # All other bozo errors
                if not rss.entries:
                    self.process_invalid_content(feed, config['url'])
                    raise PluginError('Unhandled bozo_exception. Type: %s (feed: %s)' % \
                        (ex.__class__.__name__, feed.name), log)
                else:
                    msg = 'Invalid RSS received. However feedparser still produced entries. Ignoring the error ...'
                    if not config.get('silent', False):
                        log.info(msg)
                    else:
                        log.debug(msg)
                    # Actually ignore the error as the message promises; without
                    # this the entries were discarded by the bozo check below.
                    ignore = True

        if 'bozo' in rss:
            if rss.bozo and not ignore:
                log.error(rss)
                log.error('Bozo exception %s on feed %s' % (type(ex), feed.name))
                return
        else:
            log.warn('feedparser bozo bit missing, feedparser bug? (FlexGet ticket #721)')

        log.debug('encoding %s' % rss.get('encoding'))

        entries = []
        default_fields = ['guid', 'author', 'description']

        def add_entry(ea, item):
            """Copy title and text fields from feedparser `item` into `ea` and collect it."""
            ea['title'] = item.title

            # Built fresh on every call so fields dropped below stay dropped.
            for field in default_fields + list(config.get('other_fields', [])):
                if field not in item:
                    continue
                if not isinstance(getattr(item, field), basestring):
                    log.error('Cannot grab non text field `%s` from rss.' % field)
                    # Stop requesting this field for the remaining entries. Only
                    # user-configured fields can be dropped; the default fields
                    # are not part of other_fields.
                    if field in config.get('other_fields', []):
                        config['other_fields'].remove(field)
                    continue
                try:
                    ea[field] = decode_html(item[field])
                    if field in config.get('other_fields', []):
                        # Print a debug message for custom added fields
                        log.debug('Field `%s` set to `%s` for `%s`' % (field, ea[field], ea['title']))
                except UnicodeDecodeError:
                    log.warning('Failed to decode entry `%s` field `%s`' % (ea['title'], field))

            # Store basic auth info so later requests for this entry can use it.
            if 'username' in config and 'password' in config:
                ea['basic_auth_username'] = config['username']
                ea['basic_auth_password'] = config['password']
            entries.append(ea)

        # Count of items skipped for lacking a title or a usable url.
        ignored = 0
        for entry in rss.entries:

            # Ignore entries without title
            if not getattr(entry, 'title', None):
                log.debug('skipping entry without title')
                ignored += 1
                continue

            # Convert title to ascii (cleanup)
            if config.get('ascii', False):
                entry.title = entry.title.encode('ascii', 'ignore')

            # Remove zero-width space characters from the title
            entry.title = entry.title.replace(u'\u200B', u'')

            enclosures = entry.get('enclosures', [])

            # Several enclosures and no grouping: one entry per enclosure.
            if len(enclosures) > 1 and not config.get('group_links'):
                log.debug('adding %i entries from enclosures' % len(enclosures))
                for enclosure in enclosures:
                    if 'href' not in enclosure:
                        log.debug('RSS-entry `%s` enclosure does not have URL' % entry.title)
                        continue
                    ee = Entry()
                    self.add_enclosure_info(ee, enclosure, config.get('filename', True), True)
                    add_entry(ee, entry)
                # Enclosures handled, move on to the next item
                continue

            # Create the flexget entry
            e = Entry()

            link = config.get('link')
            if isinstance(link, list):
                # Several candidate fields: first hit becomes the url, all hits go to urls.
                for field in link:
                    if entry.get(field):
                        e.setdefault('url', entry[field])
                        if entry[field] not in e.setdefault('urls', []):
                            e['urls'].append(entry[field])
            elif link == 'auto':
                # Auto mode: prefer a lone enclosure, else fall back to link / guid.
                if len(enclosures) == 1 and enclosures[0].get('href'):
                    self.add_enclosure_info(e, enclosures[0], config.get('filename', True))
                else:
                    for field in ['link', 'guid']:
                        if entry.get(field):
                            e['url'] = entry[field]
                            break
            elif entry.get(link):
                e['url'] = entry[link]

            if config.get('group_links'):
                # Link urls first, then enclosure urls, without duplicates.
                grouped = list(e.get('urls') or [])
                if e.get('url') and e['url'] not in grouped:
                    grouped.insert(0, e['url'])
                for enclosure in enclosures:
                    href = enclosure.get('href')
                    if href and href not in grouped:
                        grouped.append(href)
                if grouped:
                    # An item that only has enclosures still gets a primary url.
                    if not e.get('url'):
                        e['url'] = grouped[0]
                    e['urls'] = grouped

            if not e.get('url'):
                log.debug('%s does not have link (%s) or enclosure' % (entry.title, config['link']))
                ignored += 1
                continue

            add_entry(e, entry)

        if ignored:
            if not config.get('silent'):
                log.warning('Skipped %s RSS-entries without required information (title, link or enclosures)' % ignored)

        return entries
403
# Register InputRSS under the plugin name 'rss', declaring plugin API version 2.
register_plugin(InputRSS, 'rss', api_ver=2)