flexget.plugins.output_download
Covered: 97 lines
Missed: 236 lines
Skipped 90 lines
Percent: 29 %
  1
import os
  2
import time
  3
import urllib
  4
import urllib2
  5
import logging
  6
from flexget.plugin import register_plugin, register_parser_option, get_plugin_by_name, PluginWarning, PluginError
  7
from httplib import BadStatusLine
  8
from flexget.utils.tools import urlopener, replace_from_entry
  9
import mimetypes
 11
# Module-level logger; all messages from this plugin are emitted under the 'download' name.
log = logging.getLogger('download')
 14
class PluginDownload(object):
 16
    """
 17
        Downloads content from entry url and writes it into a file.
 19
        Example:
 21
        download: ~/torrents/
 23
        Allow HTML content:
 25
        By default the download plugin reports failure if the received content
 26
        is HTML. Usually this is some sort of custom error page without
 27
        a proper HTTP status code, and thus the entry is assumed to be downloaded
 28
        incorrectly.
 30
        In the rare case you actually need to retrieve html-pages you must
 31
        disable this feature.
 33
        download:
 34
          path: ~/something/
 35
          fail_html: no
 37
        You may use commandline parameter --dl-path to temporarily override
 38
        all paths to another location.
 39
    """
 41
    def validator(self):
        """Build and return the config validator.

        The plugin accepts a plain path, a boolean, or a dict with the
        keys ``path``, ``fail_html`` and ``overwrite``.
        """
        from flexget import validator as validator_module
        schema = validator_module.factory()
        # Simple forms: a bare path or a boolean
        schema.accept('path', allow_replacement=True)
        schema.accept('boolean')
        # Advanced form: dict of options
        options = schema.accept('dict')
        options.accept('path', key='path', allow_replacement=True)
        for flag in ('fail_html', 'overwrite'):
            options.accept('boolean', key=flag)
        return schema
 53
    def get_config(self, feed):
        """Return plugin configuration in advanced (dict) form.

        The feed's ``download`` setting may be a path string, a dict, or
        anything else (e.g. a boolean), which is treated as "no options".
        The result always contains ``fail_html`` (default True) and contains
        ``require_path: True`` when no path was configured.

        A new dict is returned on every call; the dict stored in
        ``feed.config`` is never modified, so the derived keys do not leak
        into the feed's own configuration.
        """
        config = feed.config['download']
        if isinstance(config, basestring):
            config = {'path': config}
        elif isinstance(config, dict):
            # Shallow copy: the defaults added below must not be written
            # back into feed.config.
            config = dict(config)
        else:
            config = {}
        config.setdefault('fail_html', True)
        if not config.get('path'):
            # Without a configured path every entry must bring its own.
            config['require_path'] = True
        return config
 65
    def on_process_start(self, feed):
        """Register ``path`` as a text key with the ``set`` plugin."""
        usable_keys = {'path': 'text'}
        get_plugin_by_name('set').instance.register_keys(usable_keys)
 70
    def on_feed_download(self, feed):
 71
        config = self.get_config(feed)
 72
        self.get_temp_files(feed, require_path=config.get('require_path', False), fail_html=config['fail_html'])
 74
    def get_temp_files(self, feed, require_path=False, handle_magnets=False, fail_html=True):
        """Download all feed content and store in temporary folder.

        For every accepted entry the urls in ``urls`` (or the single ``url``)
        are tried in order until one succeeds; the entry's ``url`` is then set
        to the one that worked. If no url succeeds the entry is failed with
        the collected error messages.

        :require_path: whether or not entries without 'path' field are ignored
        :handle_magnets: when used any of urls containing magnet link will replace url, otherwise warning is printed.
        :fail_html: when set, a download whose mime-type is html is treated as a failure
        """
        html_mimes = ('html', 'text/html')
        for entry in feed.accepted:
            urls = entry.get('urls') or [entry['url']]
            errors = []
            for url in urls:
                if url.startswith('magnet:'):
                    if handle_magnets:
                        log.debug('Accepting magnet url for %s' % entry['title'])
                        entry['url'] = url
                        break
                    log.warning('Can\'t download magnet url')
                    errors.append('Magnet URL')
                    continue
                if require_path and 'path' not in entry:
                    log.debug('Skipping url %s because there is no path for download' % url)
                    continue
                error = self.process_entry(feed, entry, url)

                # Only look at the mime-type when this attempt itself went
                # through; after a failed attempt the field may be left over
                # from a previous url and would hide the real error.
                if not error and fail_html and entry.get('mime-type') in html_mimes:
                    # Report the url that was actually tried, which is not
                    # necessarily entry['url'] when alternatives are listed.
                    error = 'Unexpected html content received from `%s` - maybe a login page?' % url
                    self.cleanup_temp_file(entry)

                if not error:
                    log.debug('Successfully retrieved %s from %s' % (entry['title'], url))
                    entry['url'] = url
                    break
                errors.append(error)
            else:
                # Reached only when no url led to a `break`, i.e. nothing was retrieved.
                if require_path and 'path' not in entry:
                    log.error('%s can\'t be downloaded, no path specified for entry' % entry['title'])
                    feed.fail(entry, 'no path specified for entry')
                else:
                    feed.fail(entry, ", ".join(errors))
124
    def process_entry(self, feed, entry, url):
125
        """Processes :entry: by using :url: from it.
126
           Does not fail the :entry: if there is a network issue, instead just log and return a string error."""
127
        try:
128
            if feed.manager.options.test:
129
                log.info('Would download: %s' % entry['title'])
130
            else:
131
                if not feed.manager.unit_test:
132
                    log.info('Downloading: %s' % entry['title'])
133
                self.download_entry(feed, entry, url)
134
        except urllib2.HTTPError, e:
135
            log.warning('HTTPError %s' % e.code)
136
            return 'HTTP error'
137
        except urllib2.URLError, e:
138
            log.warning('URLError %s' % e.reason)
139
            return 'URL Error'
140
        except BadStatusLine, e:
141
            log.warning('Failed to reach server. Reason: %s' % e.reason)
142
            return 'BadStatusLine'
143
        except IOError, e:
144
            if hasattr(e, 'reason'):
145
                log.warning('Failed to reach server. Reason: %s' % e.reason)
146
            elif hasattr(e, 'code'):
147
                log.warning('The server couldn\'t fulfill the request. Error code: %s' % e.code)
148
            return 'IOError'
149
        except ValueError, e:
151
            log.warning(e.message)
152
            return e.message
154
    def download_entry(self, feed, entry, url):
        """Downloads :entry: by using :url:.

        The content is written to a uniquely named file in the ``temp``
        directory under the manager's ``config_base``. On success the entry
        gets the fields ``file`` and ``mime-type``, ``content-length`` when
        the header is present and the body was not decompressed, and possibly
        ``filename`` (from content-disposition and/or the mime-type).

        May raise several types of exception(s) or PluginWarning"""
        try:
            url = url.encode('latin1')
        except UnicodeEncodeError:
            log.debug('URL for `%s` could not be encoded in latin1' % entry['title'])
            try:
                url = url.encode('utf-8')
            except UnicodeError:
                log.warning('Unable to URL-encode URL for `%s`' % entry['title'])
        if not isinstance(url, unicode):
            url = urllib.quote(url, safe=':/~?=&%')
        log.debug('Downloading url \'%s\'' % url)

        if 'basic_auth_password' in entry and 'basic_auth_username' in entry:
            # Deliberately log the user name only; credentials must not end up in log files.
            log.debug('Basic auth enabled. User: %s' % entry['basic_auth_username'])
            passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
            passman.add_password(None, url, entry['basic_auth_username'], entry['basic_auth_password'])
            handlers = [urllib2.HTTPBasicAuthHandler(passman)]
        else:
            handlers = None

        opener = urlopener(url, log, handlers=handlers)
        # Everything up to the end of the transfer is guarded so the
        # connection is closed even if preparing the temp file fails.
        try:
            if opener.headers.get('content-encoding') in ('gzip', 'x-gzip', 'deflate'):
                import zlib
                # wbits 15 + 32: largest window, with automatic detection of
                # a zlib or gzip header.
                decompressor = zlib.decompressobj(15 + 32)
            else:
                decompressor = None

            # Unique temp file name derived from the url and the current time
            import hashlib
            m = hashlib.md5()
            m.update(url)
            m.update('%s' % time.time())
            tmp_path = os.path.join(feed.manager.config_base, 'temp')
            if not os.path.isdir(tmp_path):
                log.debug('creating tmp_path %s' % tmp_path)
                os.mkdir(tmp_path)
            datafile = os.path.join(tmp_path, m.hexdigest())

            outfile = open(datafile, 'wb')
            try:
                while True:
                    chunk = opener.read(1024)
                    if not chunk:
                        break
                    if decompressor:
                        chunk = decompressor.decompress(chunk)
                    outfile.write(chunk)
                if decompressor:
                    # Emit whatever the decompressor still holds back.
                    outfile.write(decompressor.flush())
            except:
                # Bare except on purpose: the partial file is removed on any
                # interruption (including KeyboardInterrupt), then re-raised.
                outfile.close()
                log.debug('Download interrupted, removing datafile')
                os.remove(datafile)
                raise
            outfile.close()

            entry['file'] = datafile
            log.debug('%s field file set to: %s' % (entry['title'], entry['file']))
        finally:
            opener.close()

        entry['mime-type'] = opener.headers.gettype()

        # The content-length header is skipped for decompressed bodies
        # (presumably because it describes the compressed size).
        if 'content-length' in opener.headers and not decompressor:
            entry['content-length'] = int(opener.headers.get('content-length'))

        # An entry may opt out of content-disposition handling by setting the field to a false value
        if entry.get('content-disposition', True):
            self.filename_from_headers(entry, opener)
        else:
            log.info('Content-disposition disabled for %s' % entry['title'])
        self.filename_ext_from_mime(entry)
243
    def filename_from_headers(self, entry, response):
        """Checks entry filename if it's found from content-disposition.

        The response headers are parsed as an email message; when they carry
        a filename it is stored in ``entry['filename']``, replacing any
        existing value. If the headers cannot be parsed an error is logged
        and the entry is left untouched.
        """
        from flexget.utils.tools import encode_html, decode_html
        import email

        data = str(response.info())

        # Best effort conversion to unicode; on failure the raw string is used as is
        try:
            data = data.decode('utf-8')
            log.debug('response info UTF-8 decoded')
        except UnicodeError:
            try:
                data = unicode(data)
                log.debug('response info unicoded')
            except UnicodeError:
                pass

        data = encode_html(data)
        try:
            filename = email.message_from_string(data).get_filename(failobj=False)
        except Exception:
            # Dump the headers as hex so the offending bytes can be inspected
            log.error('Failed to decode filename from response: %s' % ''.join(['%02x' % ord(x) for x in data]))
            return
        if filename:
            filename = decode_html(filename)
            log.debug('Found filename from headers: %s' % filename)
            if 'filename' in entry:
                log.debug('Overriding filename %s with %s from content-disposition' % (entry['filename'], filename))
            entry['filename'] = filename
276
    def filename_ext_from_mime(self, entry):
        """Append the extension guessed from the entry's mime-type to its filename, unless already there."""
        mime = entry['mime-type']
        ext = mimetypes.guess_extension(mime)
        if not ext:
            log.debug('Python doesn\'t know extension for mime-type: %s' % mime)
            return
        log.debug('Mimetype guess for %s is %s ' % (mime, ext))
        if not entry.get('filename'):
            # Nothing to extend
            return
        current = entry['filename']
        if current.endswith(ext):
            log.debug('Filename %s extension matches to mime-type' % current)
        else:
            log.debug('Adding mime-type extension %s to %s' % (ext, current))
            entry['filename'] = current + ext
290
    def on_feed_output(self, feed):
291
        """Move downloaded content from temp folder to final destination"""
292
        for entry in feed.accepted:
293
            try:
294
                if feed.manager.options.test:
295
                    log.info('Would write: %s' % entry['title'])
296
                else:
297
                    self.output(feed, entry)
298
            except PluginWarning, e:
299
                feed.fail(entry)
300
                log.error('Plugin error while writing: %s' % e)
301
            except Exception, e:
302
                feed.fail(entry)
303
                log.exception('Exception while writing: %s' % e)
305
    def output(self, feed, entry):
306
        """Moves temp-file into final destination"""
308
        config = self.get_config(feed)
310
        if 'file' not in entry:
311
            log.debug('file missing, entry: %s' % entry)
312
            raise PluginError('Entry %s has no temp file associated with' % entry['title'])
314
        try:
316
            path = entry.get('path', config.get('path'))
317
            if path is None:
318
                raise PluginError('Unreachable situation?')
321
            if feed.manager.options.dl_path:
322
                path = feed.manager.options.dl_path
325
            if not entry.get('filename'):
326
                entry['filename'] = entry['title']
327
                log.debug('set filename from title %s' % entry['filename'])
328
                if not 'mime-type' in entry:
329
                    log.warning('Unable to figure proper filename for %s. Using title.' % entry['title'])
330
                else:
331
                    guess = mimetypes.guess_extension(entry['mime-type'])
332
                    if not guess:
333
                        log.warning('Unable to guess extension with mime-type %s' % guess)
334
                    else:
335
                        self.filename_ext_from_mime(entry)
338
            path = replace_from_entry(path, entry, 'path', log.error)
339
            if not path:
340
                feed.fail(entry, 'Could not set path. Does not contain all fields for string replacement.')
341
                return
342
            path = os.path.expanduser(path)
345
            if not os.path.isdir(path):
346
                log.info('Creating directory %s' % path)
347
                try:
348
                    os.makedirs(path)
349
                except:
350
                    raise PluginError('Cannot create path %s' % path, log)
353
            if not os.path.exists(entry['file']):
354
                tmp_path = os.path.join(feed.manager.config_base, 'temp')
355
                log.debug('entry: %s' % entry)
356
                log.debug('temp: %s' % ', '.join(os.listdir(tmp_path)))
357
                raise PluginWarning("Downloaded temp file '%s' doesn't exist!?" % entry['file'])
360
            name = entry.get('filename', entry['title'])
361
            for char in '/:<>^*?~':
362
                name = name.replace(char, ' ')
364
            name = ' '.join(name.split())
365
            destfile = os.path.join(path, name)
366
            log.debug('destfile: %s' % destfile)
368
            if os.path.exists(destfile):
369
                import filecmp
370
                if filecmp.cmp(entry['file'], destfile):
371
                    log.debug("Identical destination file '%s' already exists", destfile)
372
                    return
373
                elif config.get('overwrite'):
374
                    log.debug("Overwriting already existing file %s" % destfile)
375
                else:
376
                    log.info('File \'%s\' already exists and is not identical, download failed.' % destfile)
377
                    feed.fail(entry, 'File \'%s\' already exists and is not identical.' % destfile)
378
                    return
381
            log.debug('moving %s to %s' % (entry['file'], destfile))
383
            try:
384
                import shutil
385
                shutil.move(entry['file'], destfile)
386
            except OSError, err:
388
                import errno
389
                if not os.path.exists(destfile):
390
                    raise PluginError('Unable to write %s' % destfile)
391
                if err.errno != errno.EPERM:
392
                    raise
395
            entry['output'] = destfile
397
        finally:
398
            self.cleanup_temp_file(entry)
400
    def on_feed_exit(self, feed):
401
        """Make sure all temp files are cleaned up when feed exits"""
402
        self.cleanup_temp_files(feed)
404
    def on_feed_abort(self, feed):
405
        """Make sure all temp files are cleaned up when feed is aborted."""
406
        self.cleanup_temp_files(feed)
408
    def cleanup_temp_file(self, entry):
        """Delete the entry's temp file from disk if it is still there and drop the ``file`` field."""
        if 'file' not in entry:
            return
        temp_file = entry['file']
        if os.path.exists(temp_file):
            log.debug('removing temp file %s from %s' % (temp_file, entry['title']))
            os.remove(temp_file)
        # The field is removed even when the file itself was already gone
        del entry['file']
415
    def cleanup_temp_files(self, feed):
416
        """Checks all entries for leftover temp files and deletes them."""
417
        for entry in feed.entries + feed.rejected + feed.failed:
418
            self.cleanup_temp_file(entry)
420
# Register the plugin under the 'download' keyword and add its command line override option.
register_plugin(PluginDownload, 'download')
register_parser_option(
    '--dl-path',
    action='store',
    dest='dl_path',
    default=False,
    metavar='PATH',
    help='Override path for download plugin. Applies to all executed feeds.')