10
from cgi import parse_header
11
from httplib import BadStatusLine
12
from requests import RequestException
13
from flexget.plugin import register_plugin, register_parser_option, get_plugin_by_name, PluginWarning, PluginError
14
from flexget.utils.tools import decode_html
15
from flexget.utils.template import RenderError
17
# Module-level logger named 'download'; the plugin's methods report through it.
log = logging.getLogger('download')
20
class PluginDownload(object):
    """
    Downloads content from entry url and writes it into a file.

    Example::

      download: ~/torrents/

    By default download plugin reports failure if received content
    is a html. Usually this is some sort of custom error page without
    proper http code and thus entry is assumed to be downloaded
    incorrectly.

    In the rare case you actually need to retrieve html-pages you must
    disable this check with ``fail_html``.

    Example::

      download:
        path: ~/something/
        fail_html: no

    An existing destination file that differs from the download fails the
    entry unless ``overwrite`` is enabled.

    You may use commandline parameter --dl-path to temporarily override
    all paths to another location.
    """

    def validator(self):
        """Return config validator"""
        from flexget import validator
        root = validator.factory()
        # simple forms: a bare path or a boolean
        root.accept('path', allow_replacement=True)
        root.accept('boolean')
        # advanced form: a dict with the keys below
        advanced = root.accept('dict')
        advanced.accept('path', key='path', allow_replacement=True)
        advanced.accept('boolean', key='fail_html')
        advanced.accept('boolean', key='overwrite')
        return root

    def process_config(self, config):
        """Return plugin configuration in advanced form

        A string is taken as the download path, any other non-dict value
        becomes an empty dict. ``fail_html`` defaults to True and
        ``require_path`` is set when the configuration carries no path.
        A dict passed in is updated in place.
        """
        if isinstance(config, basestring):
            config = {'path': config}
        if not isinstance(config, dict):
            config = {}
        config.setdefault('fail_html', True)
        if not config.get('path'):
            # no path in the config, so every entry has to bring its own
            config['require_path'] = True
        return config

    def on_process_start(self, feed, config):
        """Register the usable set keywords."""
        set_plugin = get_plugin_by_name('set')
        set_plugin.instance.register_keys({'path': 'text'})

    def on_feed_download(self, feed, config):
        """Download the content of all accepted entries into temp files."""
        config = self.process_config(config)
        self.get_temp_files(feed, require_path=config.get('require_path', False), fail_html=config['fail_html'])

    def get_temp_file(self, feed, entry, require_path=False, handle_magnets=False, fail_html=True):
        """Download entry content and store in temporary folder.

        Every url of the entry is tried in order until one succeeds; the
        entry is failed when none of them does.

        :param bool require_path:
          whether or not entries without 'path' field are ignored
        :param bool handle_magnets:
          when used any of urls containing magnet link will replace url,
          otherwise warning is printed.
        :param bool fail_html:
          fail entries which url respond with html content
        """
        urls = entry.get('urls') or [entry['url']]
        html_mimes = ('html', 'text/html')
        errors = []
        for url in urls:
            if url.startswith('magnet:'):
                if handle_magnets:
                    # Set magnet link as main url, so a torrent client plugin can grab it
                    log.debug('Accepting magnet url for %s' % entry['title'])
                    entry['url'] = url
                    break
                log.warning('Can\'t download magnet url')
                errors.append('Magnet URL')
                continue
            if require_path and 'path' not in entry:
                # Don't fail here, there might be a magnet later in the list of urls
                log.debug('Skipping url %s because there is no path for download' % url)
                continue
            error = self.process_entry(feed, entry, url)

            # disallow html content
            if fail_html and entry.get('mime-type') in html_mimes:
                # report the url that was actually fetched, not the entry's main url
                error = 'Unexpected html content received from `%s` - maybe a login page?' % url
                self.cleanup_temp_file(entry)

            if not error:
                # Set the main url, so we know where this file actually came from
                log.debug('Successfully retrieved %s from %s' % (entry['title'], url))
                entry['url'] = url
                break
            errors.append(error)
        else:
            # the loop ended without a break, so no url could be used
            # check if entry must have a path (download: yes)
            if require_path and 'path' not in entry:
                log.error('%s can\'t be downloaded, no path specified for entry' % entry['title'])
                feed.fail(entry, 'no path specified for entry')
            else:
                feed.fail(entry, ", ".join(errors))

    def save_error_page(self, entry, feed, page):
        """Write :page: to ``<config_base>/received/<feed name>/<title>.error``.

        :param page: raw body of the error response
        """
        received = os.path.join(feed.manager.config_base, 'received', feed.name)
        if not os.path.isdir(received):
            os.makedirs(received)
        filename = os.path.join(received, '%s.error' % entry['title'].encode(sys.getfilesystemencoding(), 'replace'))
        log.error('Error retrieving %s, the error page has been saved to %s' % (entry['title'], filename))
        # binary mode: the page is written as received, without newline translation
        with open(filename, 'wb') as outfile:
            outfile.write(page)

    def get_temp_files(self, feed, require_path=False, handle_magnets=False, fail_html=True):
        """Download all feed content and store in temporary folder.

        :param bool require_path:
          whether or not entries without 'path' field are ignored
        :param bool handle_magnets:
          when used any of urls containing magnet link will replace url,
          otherwise warning is printed.
        :param bool fail_html:
          fail entries which url respond with html content
        """
        for entry in feed.accepted:
            self.get_temp_file(feed, entry, require_path, handle_magnets, fail_html)

    def process_entry(self, feed, entry, url):
        """Processes :entry: by using :url: from it.

        Does not fail the :entry: if there is a network issue, instead just log and return a string error.

        :return: error description, or None when nothing went wrong
        """
        try:
            if feed.manager.options.test:
                log.info('Would download: %s' % entry['title'])
            else:
                if not feed.manager.unit_test:
                    log.info('Downloading: %s' % entry['title'])
                self.download_entry(feed, entry, url)
        except RequestException as e:
            # TODO: Improve this error message?
            log.warning('RequestException %s' % e)
            return 'Request Exception'
        # TODO: I think these exceptions will not be thrown by requests library.
        except urllib2.HTTPError as e:
            log.warning('HTTPError %s' % e.code)
            return 'HTTP error'
        except urllib2.URLError as e:
            log.warning('URLError %s' % e.reason)
            return 'URL Error'
        except BadStatusLine as e:
            log.warning('Failed to reach server. Reason: %s' % getattr(e, 'message', 'N/A'))
            return 'BadStatusLine'
        except IOError as e:
            if hasattr(e, 'reason'):
                log.warning('Failed to reach server. Reason: %s' % e.reason)
            elif hasattr(e, 'code'):
                log.warning('The server couldn\'t fulfill the request. Error code: %s' % e.code)
            log.debug('IOError', exc_info=True)
            return 'IOError'
        except ValueError as e:
            # Probably unknown url type
            msg = 'ValueError %s' % e
            log.warning(msg)
            log.debug(msg, exc_info=True)
            return msg

    def download_entry(self, feed, entry, url):
        """Downloads :entry: by using :url:

        On success the entry gets the fields ``file`` (temp file path) and,
        when the response provides them, ``mime-type``, ``content-length``
        and ``filename``.

        :raises: Several types of exceptions ...
        """
        # see http://bugs.python.org/issue1712522
        # note, url is already unicode ...
        try:
            url = url.encode('latin1')
        except UnicodeEncodeError:
            log.debug('URL for `%s` could not be encoded in latin1' % entry['title'])
            try:
                url = url.encode('utf-8')
            except UnicodeError:
                log.warning('Unable to URL-encode URL for `%s`' % entry['title'])
        if not isinstance(url, unicode):
            url = urllib.quote(url, safe=':/~?=&%')
        log.debug('Downloading url \'%s\'' % url)

        # get content
        auth = None
        if 'basic_auth_password' in entry and 'basic_auth_username' in entry:
            # the password is deliberately kept out of the log
            log.debug('Basic auth enabled. User: %s' % entry['basic_auth_username'])
            auth = (entry['basic_auth_username'], entry['basic_auth_password'])

        response = feed.requests.get(url, auth=auth, raise_status=False)
        if response.status_code != 200:
            # Save the error page
            response.encoding = None
            if response.content:
                self.save_error_page(entry, feed, response.content)
            # Raise the error
            response.raise_for_status()
            return

        # generate temp file, with random md5 sum ..
        # url alone is not random enough, it has happened that there are two entries with same url
        md5_hash = hashlib.md5('%s%s' % (url, time.time())).hexdigest()
        tmp_path = os.path.join(feed.manager.config_base, 'temp')
        if not os.path.isdir(tmp_path):
            log.debug('creating tmp_path %s' % tmp_path)
            os.mkdir(tmp_path)
        datafile = os.path.join(tmp_path, md5_hash)

        # download and write data into a temp file
        try:
            with open(datafile, 'wb') as outfile:
                # explicit chunk size, the default yields the body one byte at a time
                for chunk in response.iter_content(chunk_size=150 * 1024, decode_unicode=False):
                    outfile.write(chunk)
        except:
            # bare except on purpose: clean up on any interruption, then re-raise
            # don't leave futile files behind
            # outfile has to be closed before we can delete it on Windows
            log.debug('Download interrupted, removing datafile')
            if os.path.exists(datafile):
                os.remove(datafile)
            raise

        # Do a sanity check on downloaded file
        if os.path.getsize(datafile) == 0:
            feed.fail(entry, 'File %s is 0 bytes in size' % datafile)
            os.remove(datafile)
            return

        # store temp filename into entry so other plugins may read and modify content
        # temp file is moved into final destination at self.output
        entry['file'] = datafile
        log.debug('%s field file set to: %s' % (entry['title'], entry['file']))

        # store mime-type without parameters such as `; charset=utf-8`, the html
        # check and the extension guess both compare against the bare type
        content_type = response.headers.get('content-type')
        if content_type:
            entry['mime-type'] = parse_header(content_type)[0]

        content_encoding = response.headers.get('content-encoding', '')
        decompress = 'gzip' in content_encoding or 'deflate' in content_encoding
        # content-length of a compressed response is not the size of the stored file
        if 'content-length' in response.headers and not decompress:
            entry['content-length'] = int(response.headers['content-length'])

        # prefer content-disposition naming, note: content-disposition can be disabled completely
        # by setting entry field `content-disposition` to False
        if entry.get('content-disposition', True):
            self.filename_from_headers(entry, response)
        else:
            log.info('Content-disposition disabled for %s' % entry['title'])
        self.filename_ext_from_mime(entry)
        # TODO: LAST resort, try to scrap url for filename?

    def filename_from_headers(self, entry, response):
        """Checks entry filename if it's found from content-disposition"""
        disposition = response.headers.get('content-disposition')
        if not disposition:
            # No content disposition header, nothing we can do
            return
        filename = parse_header(disposition)[1].get('filename')
        if not filename:
            return

        # try to decode to unicode, specs allow latin1, some may do utf-8 anyway
        try:
            filename = filename.decode('latin1')
            log.debug('filename header latin1 decoded')
        except UnicodeError:
            try:
                filename = filename.decode('utf-8')
                log.debug('filename header UTF-8 decoded')
            except UnicodeError:
                # keep the value as it was received
                pass
        filename = decode_html(filename)
        log.debug('Found filename from headers: %s' % filename)
        if 'filename' in entry:
            log.debug('Overriding filename %s with %s from content-disposition' % (entry['filename'], filename))
        entry['filename'] = filename

    def filename_ext_from_mime(self, entry):
        """Tries to set filename extension from mime-type"""
        mime_type = entry.get('mime-type')
        if not mime_type:
            # nothing to guess from
            return
        extension = mimetypes.guess_extension(mime_type)
        if not extension:
            log.debug('Python doesn\'t know extension for mime-type: %s' % mime_type)
            return
        log.debug('Mimetype guess for %s is %s ' % (mime_type, extension))
        if entry.get('filename'):
            if entry['filename'].endswith(extension):
                log.debug('Filename %s extension matches to mime-type' % entry['filename'])
            else:
                log.debug('Adding mime-type extension %s to %s' % (extension, entry['filename']))
                entry['filename'] = entry['filename'] + extension

    def on_feed_output(self, feed, config):
        """Move downloaded content from temp folder to final destination"""
        config = self.process_config(config)
        for entry in feed.accepted:
            try:
                self.output(feed, entry, config)
            except PluginWarning as e:
                feed.fail(entry)
                log.error('Plugin error while writing: %s' % e)
            except Exception as e:
                feed.fail(entry)
                log.exception('Exception while writing: %s' % e)

    def output(self, feed, entry, config):
        """Moves temp-file into final destination

        The temp file of the entry is always cleaned up before returning.

        Raises:
            PluginError if operation fails
            PluginWarning if the temp file has disappeared
        """
        if 'file' not in entry and not feed.manager.options.test:
            log.debug('file missing, entry: %s' % entry)
            raise PluginError('Entry `%s` has no temp file associated with' % entry['title'])

        try:
            # use path from entry if has one, otherwise use from download definition parameter
            path = entry.get('path', config.get('path'))
            if not isinstance(path, basestring):
                raise PluginError('Invalid `path` in entry `%s`' % entry['title'])

            # override path from command line parameter
            if feed.manager.options.dl_path:
                path = feed.manager.options.dl_path

            # expand variables in path
            try:
                path = os.path.expanduser(entry.render(path))
            except RenderError as e:
                feed.fail(entry, 'Could not set path. Error during string replacement: %s' % e)
                return

            # If we are in test mode, report and return
            if feed.manager.options.test:
                log.info('Would write `%s` to `%s`' % (entry['title'], path))
                # Set a fake location, so the exec plugin can do string replacement during --test #1015
                entry['output'] = os.path.join(path, 'TEST_MODE_NO_OUTPUT')
                return

            # make path
            if not os.path.isdir(path):
                log.info('Creating directory %s' % path)
                try:
                    os.makedirs(path)
                except OSError:
                    raise PluginError('Cannot create path %s' % path, log)

            # check that temp file is present
            if not os.path.exists(entry['file']):
                tmp_path = os.path.join(feed.manager.config_base, 'temp')
                log.debug('entry: %s' % entry)
                log.debug('temp: %s' % ', '.join(os.listdir(tmp_path)))
                raise PluginWarning('Downloaded temp file `%s` doesn\'t exist!?' % entry['file'])

            # if we still don't have a filename, try making one from title (last resort)
            if not entry.get('filename'):
                entry['filename'] = entry['title']
                log.debug('set filename from title %s' % entry['filename'])
                if 'mime-type' not in entry:
                    log.warning('Unable to figure proper filename for %s. Using title.' % entry['title'])
                elif not mimetypes.guess_extension(entry['mime-type']):
                    log.warning('Unable to guess extension with mime-type %s' % entry['mime-type'])
                else:
                    self.filename_ext_from_mime(entry)

            # combine to full path + filename, replace / from filename (replaces bc tickets #208, #325, #353)
            name = entry.get('filename', entry['title'])
            for char in '/:<>^*?~"':
                name = name.replace(char, ' ')
            # remove duplicate spaces
            name = ' '.join(name.split())
            destfile = os.path.join(path, name)
            log.debug('destfile: %s' % destfile)

            needs_move = True
            if os.path.exists(destfile):
                import filecmp
                if filecmp.cmp(entry['file'], destfile):
                    log.debug("Identical destination file '%s' already exists", destfile)
                    # the content is already in place
                    needs_move = False
                elif config.get('overwrite'):
                    log.debug("Overwriting already existing file %s" % destfile)
                else:
                    log.info('File `%s` already exists and is not identical, download failed.' % destfile)
                    feed.fail(entry, 'File `%s` already exists and is not identical.' % destfile)
                    return

            if needs_move:
                # move temp file
                log.debug('moving %s to %s' % (entry['file'], destfile))
                try:
                    shutil.move(entry['file'], destfile)
                except OSError as err:
                    # ignore permission errors, see ticket #555
                    import errno
                    if not os.path.exists(destfile):
                        raise PluginError('Unable to write %s' % destfile)
                    if err.errno != errno.EPERM:
                        raise

            # store final destination as output key
            entry['output'] = destfile
        finally:
            self.cleanup_temp_file(entry)

    def on_feed_exit(self, feed, config):
        """Make sure all temp files are cleaned up when feed exits"""
        self.cleanup_temp_files(feed)

    def on_feed_abort(self, feed, config):
        """Make sure all temp files are cleaned up when feed is aborted."""
        self.cleanup_temp_files(feed)

    def cleanup_temp_file(self, entry):
        """Remove the temp file of :entry: and drop its ``file`` field.

        Entries without a ``file`` field are left untouched.
        """
        if 'file' not in entry:
            return
        if os.path.exists(entry['file']):
            log.debug('removing temp file %s from %s' % (entry['file'], entry['title']))
            os.remove(entry['file'])
        del entry['file']

    def cleanup_temp_files(self, feed):
        """Checks all entries for leftover temp files and deletes them."""
        for entry in feed.entries + feed.rejected + feed.failed:
            self.cleanup_temp_file(entry)
460
# Register the plugin under the name 'download' and add the --dl-path
# command line option, which output() reads as options.dl_path.
register_plugin(PluginDownload, 'download', api_ver=2)
register_parser_option('--dl-path', action='store', dest='dl_path', default=False,
                       metavar='PATH', help='Override path for download plugin. Applies to all executed feeds.')