flexget.plugins.output.download
Covered: 170 lines
Missed: 194 lines
Skipped 99 lines
Percent: 46 %
  1
import os
  2
import time
  3
import urllib
  4
import urllib2
  5
import logging
  6
import mimetypes
  7
import hashlib
  8
import shutil
  9
import sys
 10
from cgi import parse_header
 11
from httplib import BadStatusLine
 12
from requests import RequestException
 13
from flexget.plugin import register_plugin, register_parser_option, get_plugin_by_name, PluginWarning, PluginError
 14
from flexget.utils.tools import decode_html
 15
from flexget.utils.template import RenderError
 17
# Module-level logger named 'download'; the plugin methods below log through it.
log = logging.getLogger('download')
 20
class PluginDownload(object):
 22
    """
 23
    Downloads content from entry url and writes it into a file.
 25
    Example::
 27
      download: ~/torrents/
 29
    Allow HTML content:
 31
    By default the download plugin reports a failure if the received
    content is HTML. Usually this is some sort of custom error page served
    without a proper HTTP status code, so the entry is assumed to have been
    downloaded incorrectly.
 36
    In the rare case you actually need to retrieve html-pages you must
 37
    disable this feature.
 39
    Example::
 41
      download:
 42
        path: ~/something/
 43
        fail_html: no
 45
    You may use commandline parameter --dl-path to temporarily override
 46
    all paths to another location.
 47
    """
 49
    def validator(self):
        """Build and return the config validator for this plugin.

        The root accepts a path (with ``allow_replacement`` enabled), a
        boolean, or a dict carrying the keys ``path``, ``fail_html`` and
        ``overwrite``.
        """
        from flexget import validator

        schema = validator.factory()
        schema.accept('path', allow_replacement=True)
        schema.accept('boolean')

        options = schema.accept('dict')
        options.accept('path', key='path', allow_replacement=True)
        for flag in ('fail_html', 'overwrite'):
            options.accept('boolean', key=flag)
        return schema
 61
    def process_config(self, config):
        """Return plugin configuration in advanced (dict) form.

        :param config:
          raw plugin config: a path string, a dict, or anything else
          (e.g. a boolean), which is treated as an empty configuration
        :return:
          a new dict with ``fail_html`` defaulted to True and
          ``require_path`` set to True when no ``path`` is configured
        """
        if isinstance(config, dict):
            # Work on a shallow copy: this method runs in several phases and
            # must not write derived keys back into the caller's config.
            config = dict(config)
        elif isinstance(config, basestring):
            config = {'path': config}
        else:
            config = {}
        config.setdefault('fail_html', True)
        if not config.get('path'):
            config['require_path'] = True
        return config
 72
    def on_process_start(self, feed, config):
        """Register ``path`` as a text key with the ``set`` plugin."""
        get_plugin_by_name('set').instance.register_keys({'path': 'text'})
 77
    def on_feed_download(self, feed, config):
        """Download every accepted entry of the feed into temp files."""
        settings = self.process_config(config)
        self.get_temp_files(
            feed,
            require_path=settings.get('require_path', False),
            fail_html=settings['fail_html'])
 81
    def get_temp_file(self, feed, entry, require_path=False, handle_magnets=False, fail_html=True):
        """Download entry content and store in temporary folder.

        The candidate urls (``entry['urls']`` when present and non-empty,
        otherwise just ``entry['url']``) are tried in order until one of
        them succeeds. When none does, the entry is failed on the feed.

        :param feed:
          feed being executed; used to fail the entry
        :param entry:
          entry whose content is downloaded
        :param bool require_path:
          whether or not entries without 'path' field are ignored
        :param bool handle_magnets:
          when used any of urls containing magnet link will replace url,
          otherwise warning is printed.
        :param fail_html:
          fail entries whose url responds with html content
        """
        urls = entry.get('urls') or [entry['url']]
        html_mimes = ('html', 'text/html')
        errors = []
        for url in urls:
            if url.startswith('magnet:'):
                if handle_magnets:
                    # Nothing to download: the magnet link itself is the result.
                    log.debug('Accepting magnet url for %s' % entry['title'])
                    entry['url'] = url
                    break
                log.warning('Can\'t download magnet url')
                errors.append('Magnet URL')
                continue
            if require_path and 'path' not in entry:
                log.debug('Skipping url %s because there is no path for download' % url)
                continue
            error = self.process_entry(feed, entry, url)

            # A Content-Type value may carry parameters such as
            # `text/html; charset=utf-8`; compare only the bare media type.
            mime_type = (entry.get('mime-type') or '').split(';')[0].strip().lower()
            if fail_html and mime_type in html_mimes:
                # Report the url that was actually requested, not entry['url'].
                error = 'Unexpected html content received from `%s` - maybe a login page?' % url
                self.cleanup_temp_file(entry)

            if not error:
                log.debug('Successfully retrieved %s from %s' % (entry['title'], url))
                entry['url'] = url
                break
            errors.append(error)
        else:
            # Reached only when no url ended the loop with `break`.
            if require_path and 'path' not in entry:
                log.error('%s can\'t be downloaded, no path specified for entry' % entry['title'])
                feed.fail(entry, 'no path specified for entry')
            else:
                feed.fail(entry, ", ".join(errors))
135
    def save_error_page(self, entry, feed, page):
        """Save an error response body for later inspection.

        The page is written to ``<config_base>/received/<feed name>/`` as
        ``<entry title>.error``; the directory is created when missing.

        :param entry: entry whose download produced the error page
        :param feed: feed being executed (provides config base and name)
        :param page: raw response body to store
        """
        received = os.path.join(feed.manager.config_base, 'received', feed.name)
        if not os.path.isdir(received):
            os.makedirs(received)
        # The title may contain path separators or other characters that are
        # unsafe in a file name; neutralise them so the file stays inside
        # `received`.
        safe_title = entry['title']
        for char in '/\\:<>^*?~"':
            safe_title = safe_title.replace(char, ' ')
        # getfilesystemencoding() may return None on some platforms.
        fs_encoding = sys.getfilesystemencoding() or 'utf-8'
        filename = os.path.join(received, '%s.error' % safe_title.encode(fs_encoding, 'replace'))
        log.error('Error retrieving %s, the error page has been saved to %s' % (entry['title'], filename))
        # Binary mode: the body is raw bytes and must not be newline-translated.
        with open(filename, 'wb') as outfile:
            outfile.write(page)
147
    def get_temp_files(self, feed, require_path=False, handle_magnets=False, fail_html=True):
        """Download the content of every accepted entry into the temp folder.

        All options are handed unchanged to :meth:`get_temp_file`.

        :param bool require_path:
          whether or not entries without 'path' field are ignored
        :param bool handle_magnets:
          when used any of urls containing magnet link will replace url,
          otherwise warning is printed.
        :param fail_html:
          fail entries whose url responds with html content
        """
        for accepted_entry in feed.accepted:
            self.get_temp_file(
                feed,
                accepted_entry,
                require_path=require_path,
                handle_magnets=handle_magnets,
                fail_html=fail_html)
161
    def process_entry(self, feed, entry, url):
        """Process :entry: by downloading it from :url:.

        Network and value errors do not fail the entry; they are logged and
        reported to the caller as a short error string.

        :return: error description string, or None when nothing went wrong
        """
        try:
            if feed.manager.options.test:
                # Test mode: only announce what would be downloaded.
                log.info('Would download: %s' % entry['title'])
                return None
            if not feed.manager.unit_test:
                log.info('Downloading: %s' % entry['title'])
            self.download_entry(feed, entry, url)
        except RequestException as exc:
            log.warning('RequestException %s' % exc)
            return 'Request Exception'
        except urllib2.HTTPError as exc:
            log.warning('HTTPError %s' % exc.code)
            return 'HTTP error'
        except urllib2.URLError as exc:
            log.warning('URLError %s' % exc.reason)
            return 'URL Error'
        except BadStatusLine as exc:
            log.warning('Failed to reach server. Reason: %s' % getattr(exc, 'message', 'N/A'))
            return 'BadStatusLine'
        except IOError as exc:
            if hasattr(exc, 'reason'):
                log.warning('Failed to reach server. Reason: %s' % exc.reason)
            elif hasattr(exc, 'code'):
                log.warning('The server couldn\'t fulfill the request. Error code: %s' % exc.code)
            log.debug('IOError', exc_info=True)
            return 'IOError'
        except ValueError as exc:
            message = 'ValueError %s' % exc
            log.warning(message)
            log.debug(message, exc_info=True)
            return message
        return None
199
    def download_entry(self, feed, entry, url):
        """Downloads :entry: by using :url:

        On success the content is written to a uniquely named file in
        ``<config_base>/temp`` and these entry fields are set:

        * ``file``: path of the temp file
        * ``mime-type``: media type from the Content-Type header, without
          parameters (only when the header is present)
        * ``content-length``: only when the header is present and the body
          was not gzip/deflate encoded
        * ``filename``: possibly, from Content-Disposition and the mime-type

        A zero-byte download fails the entry on the feed and sets nothing.

        Raises:
            Several types of exceptions ...
            (whatever the request, ``response.raise_for_status()`` or the
            file operations raise)
            PluginWarning
        """
        # urllib.quote needs a byte string: try latin1 first, then utf-8.
        try:
            url = url.encode('latin1')
        except UnicodeEncodeError:
            log.debug('URL for `%s` could not be encoded in latin1' % entry['title'])
            try:
                url = url.encode('utf-8')
            except UnicodeError:
                log.warning('Unable to URL-encode URL for `%s`' % entry['title'])
        if not isinstance(url, unicode):
            url = urllib.quote(url, safe=':/~?=&%')
        log.debug('Downloading url \'%s\'' % url)

        auth = None
        if 'basic_auth_password' in entry and 'basic_auth_username' in entry:
            # Never write the password itself to the log.
            log.debug('Basic auth enabled. User: %s' % entry['basic_auth_username'])
            auth = (entry['basic_auth_username'], entry['basic_auth_password'])

        response = feed.requests.get(url, auth=auth, raise_status=False)
        if response.status_code != 200:
            # Keep a copy of the error page for troubleshooting, then let the
            # response raise for its status.
            response.encoding = None
            if response.content:
                self.save_error_page(entry, feed, response.content)
            response.raise_for_status()
            return

        # Temp file name: md5 of url + current time.
        md5_hash = hashlib.md5('%s%s' % (url, time.time())).hexdigest()
        tmp_path = os.path.join(feed.manager.config_base, 'temp')
        if not os.path.isdir(tmp_path):
            log.debug('creating tmp_path %s' % tmp_path)
            os.mkdir(tmp_path)
        datafile = os.path.join(tmp_path, md5_hash)

        try:
            with open(datafile, 'wb') as outfile:
                # Stream the body in sizeable chunks instead of relying on
                # the library's default chunk size.
                for chunk in response.iter_content(chunk_size=150 * 1024, decode_unicode=False):
                    outfile.write(chunk)
        except:
            # Deliberately bare: a partial file must be removed on any kind
            # of interruption (Ctrl-C included); the error is always re-raised.
            log.debug('Download interrupted, removing datafile')
            if os.path.exists(datafile):
                os.remove(datafile)
            raise

        if os.path.getsize(datafile) == 0:
            feed.fail(entry, 'File %s is 0 bytes in size' % datafile)
            os.remove(datafile)
            return

        entry['file'] = datafile
        log.debug('%s field file set to: %s' % (entry['title'], entry['file']))

        # Store only the media type: header parameters such as
        # `; charset=utf-8` would defeat html detection and extension
        # guessing. The header may also be absent altogether.
        content_type = response.headers.get('content-type')
        if content_type:
            entry['mime-type'] = parse_header(content_type)[0]

        # Content-Length describes the encoded body, so it is only recorded
        # when the response was not gzip/deflate encoded.
        content_encoding = response.headers.get('content-encoding', '')
        decompress = 'gzip' in content_encoding or 'deflate' in content_encoding
        if 'content-length' in response.headers and not decompress:
            entry['content-length'] = int(response.headers['content-length'])

        # An entry can opt out by setting `content-disposition` to a false value.
        if entry.get('content-disposition', True):
            self.filename_from_headers(entry, response)
        else:
            log.info('Content-disposition disabled for %s' % entry['title'])
        if content_type:
            self.filename_ext_from_mime(entry)
287
    def filename_from_headers(self, entry, response):
        """Set ``entry['filename']`` from the Content-Disposition header.

        Does nothing when the header is missing or carries no filename. An
        existing ``filename`` on the entry is overridden.
        """
        disposition = response.headers.get('content-disposition')
        if not disposition:
            return
        filename = parse_header(disposition)[1].get('filename')
        if not filename:
            return

        if not isinstance(filename, unicode):
            # UTF-8 has to be tried first: latin1 can decode any byte
            # sequence, so it must be the fallback, not the first attempt.
            try:
                filename = filename.decode('utf-8')
                log.debug('filename header UTF-8 decoded')
            except UnicodeError:
                filename = filename.decode('latin1')
                log.debug('filename header latin1 decoded')

        filename = decode_html(filename)
        log.debug('Found filename from headers: %s' % filename)
        if 'filename' in entry:
            log.debug('Overriding filename %s with %s from content-disposition' % (entry['filename'], filename))
        entry['filename'] = filename
311
    def filename_ext_from_mime(self, entry):
        """Try to complete ``entry['filename']`` with an extension guessed
        from ``entry['mime-type']``.

        The extension is appended only when the entry has a filename that
        does not already end with it. Requires ``mime-type`` to be set.
        """
        # Guess from the bare media type; parameters such as
        # `; charset=utf-8` are not known to the mimetypes module.
        mime_type = entry['mime-type'].split(';')[0].strip()
        extension = mimetypes.guess_extension(mime_type)
        if not extension:
            log.debug('Python doesn\'t know extension for mime-type: %s' % entry['mime-type'])
            return

        log.debug('Mimetype guess for %s is %s ' % (entry['mime-type'], extension))
        if entry.get('filename'):
            if entry['filename'].endswith(extension):
                log.debug('Filename %s extension matches to mime-type' % entry['filename'])
            else:
                log.debug('Adding mime-type extension %s to %s' % (extension, entry['filename']))
                entry['filename'] = entry['filename'] + extension
325
    def on_feed_output(self, feed, config):
        """Move downloaded content of accepted entries to its destination.

        An entry whose output step raises is failed on the feed and the
        problem is logged; the remaining entries are still processed.
        """
        settings = self.process_config(config)
        for accepted_entry in feed.accepted:
            try:
                self.output(feed, accepted_entry, settings)
            except PluginWarning as warning:
                feed.fail(accepted_entry)
                log.error('Plugin error while writing: %s' % warning)
            except Exception as exc:
                feed.fail(accepted_entry)
                log.exception('Exception while writing: %s' % exc)
338
    def output(self, feed, entry, config):
        """Moves temp-file into final destination

        The destination directory is the entry's ``path`` (falling back to
        the configured ``path``), unless the ``dl_path`` option overrides
        it. On success ``entry['output']`` holds the destination file. The
        temp file reference is always cleaned up before returning.

        :param feed: feed being executed
        :param entry: entry whose temp file is moved
        :param config: plugin config in advanced (dict) form

        Raises:
            PluginError if operation fails
            PluginWarning if the temp file recorded on the entry is missing
        """
        if 'file' not in entry and not feed.manager.options.test:
            log.debug('file missing, entry: %s' % entry)
            raise PluginError('Entry `%s` has no temp file associated with' % entry['title'])

        try:
            path = entry.get('path', config.get('path'))
            if not isinstance(path, basestring):
                raise PluginError('Invalid `path` in entry `%s`' % entry['title'])

            # The command line override wins over entry and config paths.
            if feed.manager.options.dl_path:
                path = feed.manager.options.dl_path

            try:
                path = os.path.expanduser(entry.render(path))
            except RenderError as e:
                feed.fail(entry, 'Could not set path. Error during string replacement: %s' % e)
                return

            # Test mode: report the intent, touch nothing on disk.
            if feed.manager.options.test:
                log.info('Would write `%s` to `%s`' % (entry['title'], path))
                entry['output'] = os.path.join(path, 'TEST_MODE_NO_OUTPUT')
                return

            if not os.path.isdir(path):
                log.info('Creating directory %s' % path)
                try:
                    os.makedirs(path)
                except Exception:
                    raise PluginError('Cannot create path %s' % path, log)

            if not os.path.exists(entry['file']):
                tmp_path = os.path.join(feed.manager.config_base, 'temp')
                log.debug('entry: %s' % entry)
                log.debug('temp: %s' % ', '.join(os.listdir(tmp_path)))
                raise PluginWarning('Downloaded temp file `%s` doesn\'t exist!?' % entry['file'])

            # Without a filename fall back to the title, plus an extension
            # guessed from the mime-type when possible.
            if not entry.get('filename'):
                entry['filename'] = entry['title']
                log.debug('set filename from title %s' % entry['filename'])
                if 'mime-type' not in entry:
                    log.warning('Unable to figure proper filename for %s. Using title.' % entry['title'])
                else:
                    guess = mimetypes.guess_extension(entry['mime-type'])
                    if not guess:
                        # Log the mime-type that could not be mapped
                        # (`guess` itself is always empty here).
                        log.warning('Unable to guess extension with mime-type %s' % entry['mime-type'])
                    else:
                        self.filename_ext_from_mime(entry)

            # Replace characters that are unsafe in file names, then
            # collapse runs of whitespace.
            name = entry.get('filename', entry['title'])
            for char in '/:<>^*?~"':
                name = name.replace(char, ' ')
            name = ' '.join(name.split())
            destfile = os.path.join(path, name)
            log.debug('destfile: %s' % destfile)

            move_needed = True
            if os.path.exists(destfile):
                import filecmp
                if filecmp.cmp(entry['file'], destfile):
                    # Same content is already in place; nothing to move.
                    log.debug("Identical destination file '%s' already exists", destfile)
                    move_needed = False
                elif config.get('overwrite'):
                    # Falls through to the move below so that the existing
                    # file really gets replaced.
                    log.debug("Overwriting already existing file %s" % destfile)
                else:
                    log.info('File `%s` already exists and is not identical, download failed.' % destfile)
                    feed.fail(entry, 'File `%s` already exists and is not identical.' % destfile)
                    return

            if move_needed:
                log.debug('moving %s to %s' % (entry['file'], destfile))
                try:
                    shutil.move(entry['file'], destfile)
                except OSError as err:
                    import errno
                    # An EPERM error is tolerated as long as the destination
                    # file exists afterwards; anything else is a failure.
                    if not os.path.exists(destfile):
                        raise PluginError('Unable to write %s' % destfile)
                    if err.errno != errno.EPERM:
                        raise

            entry['output'] = destfile
        finally:
            self.cleanup_temp_file(entry)
440
    def on_feed_exit(self, feed, config):
        """Remove any leftover temp files once the feed exits."""
        self.cleanup_temp_files(feed)
444
    def on_feed_abort(self, feed, config):
        """Remove any leftover temp files when the feed is aborted."""
        self.cleanup_temp_files(feed)
448
    def cleanup_temp_file(self, entry):
        """Delete the entry's temp file, if any, and drop its ``file`` field."""
        if 'file' not in entry:
            return
        if os.path.exists(entry['file']):
            log.debug('removing temp file %s from %s' % (entry['file'], entry['title']))
            os.remove(entry['file'])
        del entry['file']
455
    def cleanup_temp_files(self, feed):
        """Delete leftover temp files of all, rejected and failed entries."""
        leftovers = feed.entries + feed.rejected + feed.failed
        for leftover in leftovers:
            self.cleanup_temp_file(leftover)
460
# Expose the plugin under the name `download` and add the --dl-path option.
register_plugin(PluginDownload, 'download', api_ver=2)
register_parser_option(
    '--dl-path',
    action='store',
    dest='dl_path',
    default=False,
    metavar='PATH',
    help='Override path for download plugin. Applies to all executed feeds.')