9
from flexget.feed import Entry
10
from flexget.plugin import register_plugin, internet, PluginError
11
from flexget.utils.cached_input import cached
12
from flexget.utils.tools import urlopener
14
log = logging.getLogger('rss')
17
class InputRSS(object):
21
Hassle-free configuration for public rss feeds:
25
Configuration with basic http authentication:
34
You may wish to clean up the entry by stripping out all non-ascii characters.
35
This can be done by setting ascii value to yes.
43
In case the RSS feed uses some nonstandard field for urls and automatic detection fails,
44
you can configure plugin to use url from any feedparser entry attribute.
52
If you want to keep information in another rss field attached to the flexget entry, you can use the other_fields option.
60
You can disable a few possibly annoying warnings by setting silent value to
61
yes on feeds where there are frequently invalid items.
69
You can group all the links of an item, to make the download plugin tolerant
70
to broken urls: it will try to download each url until one works.
71
Links are enclosures plus item fields given by the link value, in that order.
72
The value to set is "group_links".
82
from flexget import validator
83
root = validator.factory()
86
advanced = root.accept('dict')
87
advanced.accept('url', key='url', required=True)
88
advanced.accept('file', key='url')
89
advanced.accept('text', key='username')
90
advanced.accept('text', key='password')
91
advanced.accept('text', key='link')
92
advanced.accept('list', key='link').accept('text')
93
advanced.accept('list', key='other_fields').accept('text')
94
advanced.accept('boolean', key='silent')
95
advanced.accept('boolean', key='ascii')
96
advanced.accept('boolean', key='filename')
97
advanced.accept('boolean', key='group_links')
100
def build_config(self, config):
101
"""Set default values to config"""
102
if isinstance(config, basestring):
103
config = {'url': config}
104
# set the default link value to 'auto'
105
config.setdefault('link', 'auto')
106
# Replace : with _ and lower case other fields so they can be found in rss
107
if config.get('other_fields'):
108
config['other_fields'] = [field.replace(':', '_').lower() for field in config['other_fields']]
109
# set default value for group_links as deactivated
110
config.setdefault('group_links', False)
111
# use basic auth when needed
112
if 'username' in config and 'password' in config:
113
config['url'] = self.passwordize(config['url'], config['username'], config['password'])
116
def passwordize(self, url, user, password):
117
"""Add username and password to url"""
118
parts = list(urlparse.urlsplit(url))
119
parts[1] = user + ':' + password + '@' + parts[1]
120
url = urlparse.urlunsplit(parts)
123
def process_invalid_content(self, feed, url):
124
"""If feedparser reports error, save the received data and log error."""
125
log.critical('Invalid XML received from feed %s' % feed.name)
127
req = urlopener(url, log)
129
log.debug('invalid url %s (ok for a file)' % url)
134
if '<html>' in data.lower():
135
log.critical('Received content is HTML page, not an RSS feed')
137
if 'login' in data.lower() or 'username' in data.lower():
138
log.critical('Received content looks a bit like login page')
139
if 'error' in data.lower():
140
log.critical('Received content looks a bit like error page')
142
received = os.path.join(feed.manager.config_base, 'received')
143
if not os.path.isdir(received):
145
filename = os.path.join(received, '%s.%s' % (feed.name, ext))
146
f = open(filename, 'w')
149
log.critical('I have saved the invalid content to %s for you to view' % filename)
151
def add_enclosure_info(self, entry, enclosure, filename=True, multiple=False):
152
"""Stores information from an rss enclosure into an Entry."""
153
entry['url'] = enclosure['href']
154
# get optional meta-data
155
if 'length' in enclosure:
157
entry['size'] = int(enclosure['length'])
160
if 'type' in enclosure:
161
entry['type'] = enclosure['type']
162
# TODO: better and perhaps join/in download plugin?
163
# Parse filename from enclosure url
164
basename = posixpath.basename(urlparse.urlsplit(entry['url']).path)
165
# If enclosure has size OR there are multiple enclosures use filename from url
166
if (entry.get('size') or multiple and basename) and filename:
167
entry['filename'] = basename
168
log.debugall('filename `%s` from enclosure' % entry['filename'])
170
@cached('rss', 'url')
172
def on_feed_input(self, feed, config):
173
config = self.build_config(config)
175
log.debug('Checking feed %s (%s)' % (feed.name, config['url']))
177
# check etags and last modified -headers
178
# let's not, flexget works better when feed contains all entries all the time ?
182
etag = feed.cache.get('etag', None)
184
log.debug('Sending etag %s for feed %s' % (etag, feed.name))
185
modified = feed.cache.get('modified', None)
187
log.debug('Sending last-modified %s for feed %s' % (modified, feed.name))  # log the cached Last-Modified value, not the etag
190
# set timeout to one minute
191
orig_timout = socket.getdefaulttimeout()
192
socket.setdefaulttimeout(60)
194
# get the feed & parse
196
rss = feedparser.parse(config['url'], etag=etag, modified=modified, handlers=urllib2._opener.handlers)
198
rss = feedparser.parse(config['url'], etag=etag, modified=modified)
200
# restore original timeout
201
socket.setdefaulttimeout(orig_timout)
204
status = rss.get('status', False)
206
log.debug('RSS does not have status (normal if processing a file)')
208
log.debug('Feed %s hasn\'t changed, skipping' % feed.name)
211
raise PluginError('Authentication needed for feed %s: %s' % \
212
(feed.name, rss.headers['www-authenticate']), log)
214
raise PluginError('RSS Feed %s not found' % feed.name, log)
216
raise PluginError('Internal server exception on feed %s' % feed.name, log)
219
ex = rss.get('bozo_exception', False)
222
if isinstance(ex, feedparser.NonXMLContentType):
223
# see: http://www.feedparser.org/docs/character-encoding.html#advanced.encoding.nonxml
224
log.debug('ignoring feedparser.NonXMLContentType')
226
elif isinstance(ex, feedparser.CharacterEncodingOverride):
228
log.debug('ignoring feedparser.CharacterEncodingOverride')
230
elif isinstance(ex, UnicodeEncodeError):
232
log.info('Feed has UnicodeEncodeError but seems to produce entries, ignoring the error ...')
234
elif isinstance(ex, xml.sax._exceptions.SAXParseException):
236
# save invalid data for review, this is a bit ugly but users seem to be really confused when
237
# html pages (login pages) are received
238
self.process_invalid_content(feed, config['url'])
239
if feed.manager.options.debug:
241
raise PluginError('Received invalid RSS content')
243
msg = 'Invalid XML received. However feedparser still produced entries. Ignoring the error ...'
244
if not config.get('silent', False):
249
elif isinstance(ex, httplib.BadStatusLine) or \
250
isinstance(ex, IOError):
251
raise ex # let the @internet decorator handle
253
# all other bozo errors
255
self.process_invalid_content(feed, config['url'])
256
raise PluginError('Unhandled bozo_exception. Type: %s (feed: %s)' % \
257
(ex.__class__.__name__, feed.name), log)
259
msg = 'Invalid RSS received. However feedparser still produced entries. Ignoring the error ...'
260
if not config.get('silent', False):
266
if rss.bozo and not ignore:
268
log.error('Bozo exception %s on feed %s' % (type(ex), feed.name))
271
log.warn('feedparser bozo bit missing, feedparser bug? (FlexGet ticket #721)')
273
log.debug('encoding %s' % rss.encoding)
275
# update etag, use last modified if no etag exists
277
if 'etag' in rss and type(rss['etag']) != feedparser.types.NoneType:
278
etag = rss.etag.replace("'", '').replace('"', '')
279
feed.cache.store('etag', etag, 90)
280
log.debug('etag %s saved for feed %s' % (etag, feed.name))
281
elif hasattr(rss, 'headers'):
282
if 'last-modified' in rss.headers:
283
feed.cache.store('modified', rss.modified, 90)
284
log.debug('last modified saved for feed %s', feed.name)
287
# new entries to be created
290
# field name for url can be configured by setting link.
291
# default value is auto but for example guid is used in some feeds
293
for entry in rss.entries:
295
# ignore entries without title
296
if not getattr(entry, 'title', None):
297
log.debug('skipping entry without title')
301
# convert title to ascii (cleanup)
302
if config.get('ascii', False):
303
entry.title = entry.title.encode('ascii', 'ignore')
305
# remove annoying zero width spaces
306
entry.title = entry.title.replace(u'\u200B', u'')
309
# TODO: confusing? refactor into class member ...
312
from flexget.utils.tools import decode_html
313
ea['title'] = entry.title
316
fields = ['guid', 'author', 'description']
317
# extend the list of fields to grab from other_fields list in config
318
fields.extend(config.get('other_fields', []))
321
if not isinstance(getattr(entry, field), basestring):
322
# Error if this field is not a string
323
log.error('Cannot grab non text field `%s` from rss.' % field)
324
# Remove field from list of fields to avoid repeated error
325
config['other_fields'].remove(field)
328
ea[field] = decode_html(entry[field])
329
if field in config.get('other_fields', []):
330
# Print a debug message for custom added fields
331
log.debug('Field `%s` set to `%s` for `%s`' % (field, ea[field], ea['title']))
332
except UnicodeDecodeError:
333
log.warning('Failed to decode entry `%s` field `%s`' % (ea['title'], field))
335
# store basic auth info
336
if 'username' in config and 'password' in config:
337
ea['basic_auth_username'] = config['username']
338
ea['basic_auth_password'] = config['password']
341
# create from enclosures if present
342
enclosures = entry.get('enclosures', [])
344
if len(enclosures) > 1 and not config.get('group_links'):
345
# There is more than 1 enclosure, create an Entry for each of them
346
log.debug('adding %i entries from enclosures' % len(enclosures))
347
for enclosure in enclosures:
348
if not 'href' in enclosure:
349
log.debug('RSS-entry `%s` enclosure does not have URL' % entry.title)
351
# There is a valid url for this enclosure, create an Entry for it
353
self.add_enclosure_info(ee, enclosure, config.get('filename', True), True)
355
# If we created entries for enclosures, we should not create an Entry for the main rss item
358
# create flexget entry
362
if not isinstance(config.get('link'), list):
363
# If the link field is not a list, search for first valid url
364
if config['link'] == 'auto':
365
# Auto mode, check for a single enclosure url first
366
if len(entry.get('enclosures', [])) == 1 and entry['enclosures'][0].get('href'):
367
self.add_enclosure_info(e, entry['enclosures'][0], config.get('filename', True))
369
# If there is no enclosure url, check link, then guid field for urls
370
for field in ['link', 'guid']:
372
e['url'] = entry[field]
375
if entry.get(config['link']):
376
e['url'] = entry[config['link']]
378
# If link was passed as a list, we create a list of urls
379
for field in config['link']:
381
e.setdefault('url', entry[field])
382
if entry[field] not in e.setdefault('urls', []):
383
e['urls'].append(entry[field])
385
if config.get('group_links'):
386
# Append a list of urls from enclosures to the urls field if group_links is enabled
387
e.setdefault('urls', [e['url']]).extend(
388
[enc.href for enc in entry.get('enclosures', []) if enc.get('href') not in e['urls']])
391
log.debug('%s does not have link (%s) or enclosure' % (entry.title, config['link']))
398
if not config.get('silent'):
399
log.warning('Skipped %s RSS-entries without required information (title, link or enclosures)' % ignored)
403
register_plugin(InputRSS, 'rss', api_ver=2)