6
from flexget.plugin import register_plugin, register_parser_option, get_plugin_by_name, PluginWarning, PluginError
7
from httplib import BadStatusLine
8
from flexget.utils.tools import urlopener, replace_from_entry
11
log = logging.getLogger('download')
14
class PluginDownload(object):
17
Downloads content from entry url and writes it into a file.
25
By default download plugin reports failure if received content
26
is HTML. Usually this is some sort of custom error page without
27
proper http code and thus entry is assumed to be downloaded
30
In the rare case you actually need to retrieve html-pages you must
37
You may use the command-line parameter --dl-path to temporarily override
38
all paths to another location.
42
"""Return config validator"""
43
from flexget import validator
44
root = validator.factory()
45
root.accept('path', allow_replacement=True)
46
root.accept('boolean')
47
advanced = root.accept('dict')
48
advanced.accept('path', key='path', allow_replacement=True)
49
advanced.accept('boolean', key='fail_html')
50
advanced.accept('boolean', key='overwrite')
53
def get_config(self, feed):
    """Return plugin configuration in advanced (dict) form.

    The ``download`` setting may be a plain path string, a dict of options,
    or any other value (e.g. a boolean); every form is normalised to a dict.

    :param feed: feed whose ``config['download']`` value is read
    :return: a new dict in which ``fail_html`` defaults to True and
        ``require_path`` is set to True when no ``path`` is configured
    """
    config = feed.config['download']
    if isinstance(config, dict):
        # work on a copy so the defaults added below never leak into the feed's own configuration
        config = dict(config)
    elif isinstance(config, basestring):
        config = {'path': config}
    else:
        # neither a path nor a dict of options: start from an empty option set
        config = {}
    config.setdefault('fail_html', True)
    if not config.get('path'):
        # no path configured: every entry has to bring its own 'path' field
        config['require_path'] = True
    return config
65
def on_process_start(self, feed):
    """Register the usable set keywords.

    Registers the key ``path`` (declared as ``text``) with the instance of the ``set`` plugin.

    :param feed: feed being started (not used here)
    """
    set_plugin = get_plugin_by_name('set')
    set_plugin.instance.register_keys({'path': 'text'})
70
def on_feed_download(self, feed):
    """Fetch the content of the feed's entries into temporary files.

    Reads the normalised plugin configuration and hands its ``require_path`` and
    ``fail_html`` settings on to :meth:`get_temp_files`.

    :param feed: feed being processed
    """
    config = self.get_config(feed)
    self.get_temp_files(feed, require_path=config.get('require_path', False), fail_html=config['fail_html'])
74
def get_temp_files(self, feed, require_path=False, handle_magnets=False, fail_html=True):
    """Download all feed content and store in temporary folder.

    For every accepted entry the urls (``entry['urls']`` when present, otherwise just
    ``entry['url']``) are tried in order until one of them succeeds; that url then becomes
    the entry's main ``url``. An entry for which no url succeeds is failed.

    :param feed: feed whose accepted entries are processed
    :param require_path: whether or not entries without 'path' field are ignored
    :param handle_magnets: when used any of urls containing magnet link will replace url,
        otherwise warning is printed.
    :param fail_html: when True, received html content counts as a failed download
    """
    html_mimes = ['html', 'text/html']
    for entry in feed.accepted:
        urls = entry.get('urls')
        if not urls:
            urls = [entry['url']]
        errors = []
        for url in urls:
            if url.startswith('magnet:'):
                if handle_magnets:
                    # Set magnet link as main url, so a torrent client plugin can grab it
                    log.debug('Accepting magnet url for %s' % entry['title'])
                    entry['url'] = url
                    break
                log.warning('Can\'t download magnet url')
                errors.append('Magnet URL')
                continue
            if require_path and 'path' not in entry:
                # Don't fail here, there might be a magnet later in the list of urls
                log.debug('Skipping url %s because there is no path for download' % url)
                continue
            error = self.process_entry(feed, entry, url)

            # disallow html content
            if entry.get('mime-type') in html_mimes and fail_html:
                # report the url that was actually tried, which is not necessarily the main one
                error = 'Unexpected html content received from `%s` - maybe a login page?' % url
                self.cleanup_temp_file(entry)

            if not error:
                # Set the main url, so we know where this file actually came from
                log.debug('Successfully retrieved %s from %s' % (entry['title'], url))
                entry['url'] = url
                break
            errors.append(error)
        else:
            # the loop ended without a break: none of the urls could be used
            # check if entry must have a path (download: yes)
            if require_path and 'path' not in entry:
                log.error('%s can\'t be downloaded, no path specified for entry' % entry['title'])
                feed.fail(entry, 'no path specified for entry')
            else:
                feed.fail(entry, ", ".join(errors))
124
def process_entry(self, feed, entry, url):
125
"""Processes :entry: by using :url: from it.
126
Does not fail the :entry: if there is a network issue, instead just log and return a string error."""
128
if feed.manager.options.test:
129
log.info('Would download: %s' % entry['title'])
131
if not feed.manager.unit_test:
132
log.info('Downloading: %s' % entry['title'])
133
self.download_entry(feed, entry, url)
134
except urllib2.HTTPError, e:
135
log.warning('HTTPError %s' % e.code)
137
except urllib2.URLError, e:
138
log.warning('URLError %s' % e.reason)
140
except BadStatusLine, e:
141
log.warning('Failed to reach server. Reason: %s' % e.reason)
142
return 'BadStatusLine'
144
if hasattr(e, 'reason'):
145
log.warning('Failed to reach server. Reason: %s' % e.reason)
146
elif hasattr(e, 'code'):
147
log.warning('The server couldn\'t fulfill the request. Error code: %s' % e.code)
149
except ValueError, e:
150
# Probably unknown url type
151
log.warning(e.message)
154
def download_entry(self, feed, entry, url):
    """Downloads :entry: by using :url:.

    The response body is streamed into a uniquely named file in the ``temp`` directory
    below ``feed.manager.config_base``. On success these entry fields are set: ``file``
    (path of the temp file), ``mime-type``, ``content-length`` (only when the body was
    not decompressed) and, when it can be determined, ``filename``.

    May raise several types of exception(s) or PluginWarning"""
    import hashlib

    # see http://bugs.python.org/issue1712522
    # note, url is already unicode ...
    try:
        url = url.encode('latin1')
    except UnicodeEncodeError:
        log.debug('URL for `%s` could not be encoded in latin1' % entry['title'])
        try:
            url = url.encode('utf-8')
        except UnicodeError:
            log.warning('Unable to URL-encode URL for `%s`' % entry['title'])
    if not isinstance(url, unicode):
        url = urllib.quote(url, safe=':/~?=&%')
    log.debug('Downloading url \'%s\'' % url)

    # get content
    handlers = None
    if 'basic_auth_password' in entry and 'basic_auth_username' in entry:
        # only the user name is logged - credentials do not belong in log files
        log.debug('Basic auth enabled. User: %s' % entry['basic_auth_username'])
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, url, entry['basic_auth_username'], entry['basic_auth_password'])
        handlers = [urllib2.HTTPBasicAuthHandler(passman)]

    opener = urlopener(url, log, handlers=handlers)
    decompress = None
    if opener.headers.get('content-encoding') in ('gzip', 'x-gzip', 'deflate'):
        # wbits 15 + 32 makes zlib accept either a zlib or a gzip header
        decompress = zlib.decompressobj(15 + 32).decompress

    # generate temp file, with random md5 sum ..
    # url alone is not random enough, it has happened that there are two entries with same url
    m = hashlib.md5()
    m.update(url)
    m.update('%s' % time.time())
    tmp_path = os.path.join(feed.manager.config_base, 'temp')
    if not os.path.isdir(tmp_path):
        log.debug('creating tmp_path %s' % tmp_path)
        os.mkdir(tmp_path)
    datafile = os.path.join(tmp_path, m.hexdigest())

    def read_chunks(data, buffer_size=1024):
        """ Helper generator to iterate over data in chunks """
        while True:
            chunk = data.read(buffer_size)
            if not chunk:
                break
            yield chunk

    # download and write data into a temp file
    outfile = open(datafile, 'wb')
    try:
        for chunk in read_chunks(opener):
            outfile.write(decompress(chunk) if decompress else chunk)
    except:
        # deliberately catches everything (interrupts included); the error is re-raised below
        # don't leave futile files behind
        # outfile has to be closed before we can delete it on Windows
        outfile.close()
        log.debug('Download interrupted, removing datafile')
        os.remove(datafile)
        raise
    outfile.close()

    # store temp filename into entry so other plugins may read and modify content
    # temp file is moved into final destination at self.output
    entry['file'] = datafile
    log.debug('%s field file set to: %s' % (entry['title'], entry['file']))

    entry['mime-type'] = opener.headers.gettype()

    # the header describes the transferred (compressed) size, so it is skipped for decompressed bodies
    if 'content-length' in opener.headers and not decompress:
        entry['content-length'] = int(opener.headers.get('content-length'))

    # prefer content-disposition naming, note: content-disposition can be disabled completely
    # by setting entry field `content-disposition` to False
    if entry.get('content-disposition', True):
        self.filename_from_headers(entry, opener)
    else:
        log.info('Content-disposition disabled for %s' % entry['title'])
    self.filename_ext_from_mime(entry)
    # TODO: LAST resort, try to scrap url for filename?
243
def filename_from_headers(self, entry, response):
244
"""Checks entry filename if it's found from content-disposition"""
# NOTE(review): this block is a lossy extraction - indentation is stripped, the bare numbers
# are residue of the original line numbering, and short lines (the try/except keywords the
# statements below belong to) are not visible. Comments describe the visible lines only.
245
from flexget.utils.tools import encode_html, decode_html
248
# stringify the response's header object so it can be re-parsed as a message further down
data = str(response.info())
250
# try to decode/encode, afaik this is against the specs but some servers do it anyway
252
data = data.decode('utf-8')
253
log.debug('response info UTF-8 decoded')
257
# presumably the fallback taken when the UTF-8 decode above fails - TODO confirm
log.debug('response info unicoded')
261
# now we should have unicode string, let's convert into proper format where non-ascii
263
data = encode_html(data)
265
# parse the header text as an email message; get_filename() hands back the given
# failobj (False) when the headers carry no filename
filename = email.message_from_string(data).get_filename(failobj=False)
267
# logs the header text as a hex dump; presumably the body of an except clause that is
# not visible here - TODO confirm
log.error('Failed to decode filename from response: %s' % ''.join(['%02x' % ord(x) for x in data]))
270
# presumably reverses the encode_html step applied to the headers above - verify
filename = decode_html(filename)
271
log.debug('Found filename from headers: %s' % filename)
272
# a filename already present on the entry is only logged, then replaced below
if 'filename' in entry:
273
log.debug('Overriding filename %s with %s from content-disposition' % (entry['filename'], filename))
274
entry['filename'] = filename
276
def filename_ext_from_mime(self, entry):
    """Tries to set filename extension from mime-type

    When the entry has a ``filename`` that does not already end with the extension
    guessed for ``entry['mime-type']``, that extension is appended. Without a filename,
    or without a known extension for the mime-type, the entry is left unchanged.

    :param entry: entry with a ``mime-type`` field and, optionally, a ``filename`` field
    """
    extension = mimetypes.guess_extension(entry['mime-type'])
    if not extension:
        log.debug('Python doesn\'t know extension for mime-type: %s' % entry['mime-type'])
        return
    log.debug('Mimetype guess for %s is %s ' % (entry['mime-type'], extension))
    if entry.get('filename'):
        if entry['filename'].endswith(extension):
            log.debug('Filename %s extension matches to mime-type' % entry['filename'])
        else:
            log.debug('Adding mime-type extension %s to %s' % (extension, entry['filename']))
            entry['filename'] = entry['filename'] + extension
290
def on_feed_output(self, feed):
291
"""Move downloaded content from temp folder to final destination"""
# NOTE(review): this block is a lossy extraction - indentation is stripped, the bare numbers
# are residue of the original line numbering, and short lines (the `try:`/`else:` keywords,
# a second except clause and whatever else the handlers do) are not visible.
292
for entry in feed.accepted:
294
# with the test option set nothing is written, the entry is only reported
if feed.manager.options.test:
295
log.info('Would write: %s' % entry['title'])
297
self.output(feed, entry)
298
except PluginWarning, e:
300
log.error('Plugin error while writing: %s' % e)
303
# log.exception also records the traceback; presumably the body of a catch-all
# except clause that is not visible here - TODO confirm
log.exception('Exception while writing: %s' % e)
305
def output(self, feed, entry):
306
"""Moves temp-file into final destination"""
# NOTE(review): this block is a lossy extraction - indentation is stripped, the bare numbers
# are residue of the original line numbering, and short lines (several `if`/`try`/`except`/
# `else`/`return` lines) are not visible. Comments describe the visible lines only.
308
config = self.get_config(feed)
310
# 'file' is the temp file path; without it there is nothing to move
if 'file' not in entry:
311
log.debug('file missing, entry: %s' % entry)
312
raise PluginError('Entry %s has no temp file associated with' % entry['title'])
315
# use path from entry if has one, otherwise use from download definition parameter
316
path = entry.get('path', config.get('path'))
318
# NOTE(review): the condition guarding this raise is not visible - presumably `path`
# being empty; confirm
raise PluginError('Unreachable situation?')
320
# override path from command line parameter
321
if feed.manager.options.dl_path:
322
path = feed.manager.options.dl_path
324
# if we still don't have a filename, try making one from title (last resort)
325
if not entry.get('filename'):
326
entry['filename'] = entry['title']
327
log.debug('set filename from title %s' % entry['filename'])
328
if not 'mime-type' in entry:
329
log.warning('Unable to figure proper filename for %s. Using title.' % entry['title'])
331
guess = mimetypes.guess_extension(entry['mime-type'])
333
# NOTE(review): the message interpolates `guess`, not entry['mime-type'] - it looks like
# this prints the failed guess instead of the mime-type it names; confirm
log.warning('Unable to guess extension with mime-type %s' % guess)
335
self.filename_ext_from_mime(entry)
337
# expand variables in path
338
path = replace_from_entry(path, entry, 'path', log.error)
340
# NOTE(review): presumably reached only when replace_from_entry could not expand the
# path - the guard is not visible; confirm
feed.fail(entry, 'Could not set path. Does not contain all fields for string replacement.')
342
# expand a leading ~ to the user's home directory
path = os.path.expanduser(path)
345
if not os.path.isdir(path):
346
log.info('Creating directory %s' % path)
350
# NOTE(review): presumably raised when creating the directory fails - the creation call
# and its except clause are not visible; confirm
raise PluginError('Cannot create path %s' % path, log)
352
# check that temp file is present
353
if not os.path.exists(entry['file']):
354
tmp_path = os.path.join(feed.manager.config_base, 'temp')
355
log.debug('entry: %s' % entry)
356
# list what the temp directory does contain, to help diagnosing the missing file
log.debug('temp: %s' % ', '.join(os.listdir(tmp_path)))
357
raise PluginWarning("Downloaded temp file '%s' doesn't exist!?" % entry['file'])
359
# combine to full path + filename, replace / from filename (replaces: #208, #325, #353)
360
name = entry.get('filename', entry['title'])
361
# each of these characters is replaced with a space
for char in '/:<>^*?~':
362
name = name.replace(char, ' ')
363
# remove duplicate spaces
364
name = ' '.join(name.split())
365
destfile = os.path.join(path, name)
366
log.debug('destfile: %s' % destfile)
368
if os.path.exists(destfile):
370
# filecmp.cmp is called with its default shallow=True, so files whose os.stat()
# signatures match are taken to be identical without comparing contents
if filecmp.cmp(entry['file'], destfile):
371
log.debug("Identical destination file '%s' already exists", destfile)
373
elif config.get('overwrite'):
374
log.debug("Overwriting already existing file %s" % destfile)
376
# presumably the final else branch: the file exists, differs and overwrite is off - confirm
log.info('File \'%s\' already exists and is not identical, download failed.' % destfile)
377
feed.fail(entry, 'File \'%s\' already exists and is not identical.' % destfile)
381
log.debug('moving %s to %s' % (entry['file'], destfile))
385
shutil.move(entry['file'], destfile)
387
# ignore permission errors, see ticket #555
389
# `err` is presumably bound by an except clause around the move that is not visible;
# a missing destination file is always an error, other errors only when not EPERM
if not os.path.exists(destfile):
390
raise PluginError('Unable to write %s' % destfile)
391
if err.errno != errno.EPERM:
394
# store final destination as output key
395
entry['output'] = destfile
398
# presumably inside a `finally:` so the temp file is removed on every exit path - confirm
self.cleanup_temp_file(entry)
400
def on_feed_exit(self, feed):
    """Make sure all temp files are cleaned up when feed exits

    :param feed: feed that is exiting
    """
    self.cleanup_temp_files(feed)
404
def on_feed_abort(self, feed):
    """Make sure all temp files are cleaned up when feed is aborted.

    :param feed: feed that was aborted
    """
    self.cleanup_temp_files(feed)
408
def cleanup_temp_file(self, entry):
    """Remove the temp file of :entry:, if it has one.

    Entries without a ``file`` field are left untouched, so this is safe to call for
    entries that were never downloaded. Otherwise the file is deleted when it still
    exists, and the ``file`` field is dropped so no stale path stays on the entry.

    :param entry: entry whose ``file`` field, when present, names the temp file
    """
    if 'file' not in entry:
        return
    if os.path.exists(entry['file']):
        log.debug('removing temp file %s from %s' % (entry['file'], entry['title']))
        os.remove(entry['file'])
    del entry['file']
415
def cleanup_temp_files(self, feed):
    """Checks all entries for leftover temp files and deletes them.

    :param feed: feed whose ``entries``, ``rejected`` and ``failed`` lists are walked
    """
    for entry in feed.entries + feed.rejected + feed.failed:
        self.cleanup_temp_file(entry)
420
# make the plugin available under the name `download` and add its command line option
register_plugin(PluginDownload, 'download')
register_parser_option('--dl-path', action='store', dest='dl_path', default=False,
                       metavar='PATH', help='Override path for download plugin. Applies to all executed feeds.')