flexget.plugins.input_html
Covered: 129 lines
Missed: 28 lines
Skipped 55 lines
Percent: 82 %
  1
import urlparse
  2
import logging
  3
import BeautifulSoup
  4
import urllib
  5
import urllib2
  6
import zlib
  7
from flexget.feed import Entry
  8
from flexget.plugin import *
  9
from flexget.utils.soup import get_soup
 10
from flexget.utils.cached_input import cached
 11
from flexget.utils.tools import urlopener
 13
# Module-level logger; everything this plugin logs is emitted under the name 'html'.
log = logging.getLogger('html')
 16
class InputHtml(object):
    """
        Parses urls from html page. Useful on sites which have direct download
        links of any type (mp3, jpg, torrent, ...).

        Many anime-fansubbers do not provide RSS-feed, this works well in many cases.

        Configuration expects url parameter, given either as a plain string or
        as a dict with these keys:

          url         page to fetch (required)
          username    basic authentication user name
          password    basic authentication password
          dump        file name the prettified page is written into
          title_from  where entry titles come from: auto (default), url,
                      title, link or contents
          links_re    list of regexps; only links whose href matches at least
                      one of them are accepted

        Note: This returns ALL links on url so you need to configure filters
        to match only to desired content.
    """

    def validator(self):
        """Return the validator describing accepted configuration (string or dict)."""
        from flexget import validator
        root = validator.factory()
        root.accept('text')
        advanced = root.accept('dict')
        advanced.accept('url', key='url', required=True)
        advanced.accept('text', key='username')
        advanced.accept('text', key='password')
        advanced.accept('text', key='dump')
        advanced.accept('text', key='title_from')
        regexps = advanced.accept('list', key='links_re')
        regexps.accept('regexp')
        return root

    def build_config(self, config):
        """
            Normalize configuration into dict form.

            A plain string is treated as the url. Basic authentication embedded
            in the url (scheme://user:password@host/...) is moved into the
            username and password keys and removed from the url.

            Note: when config already is a dict it is modified in place; the
            same dict is returned.
        """

        def get_auth_from_url():
            """Moves basic authentication from url to username and password fields"""
            parts = list(urlparse.urlsplit(config['url']))
            # Split on the LAST '@': a host name cannot contain one, but a
            # password can.
            userinfo, separator, host = parts[1].rpartition('@')
            if not separator:
                return
            # Split on the FIRST ':' only, so passwords containing ':' survive.
            auth = userinfo.split(':', 1)
            if len(auth) == 2:
                config['username'], config['password'] = auth[0], auth[1]
            else:
                log.warning('Invalid basic authentication in url: %s' % config['url'])
            parts[1] = host
            config['url'] = urlparse.urlunsplit(parts)

        if isinstance(config, basestring):
            config = {'url': config}
        get_auth_from_url()
        return config

    @cached('html', 'url')
    @internet(log)
    def on_feed_input(self, feed, config):
        """
            Fetch the configured page and return the entries found on it.

            Uses basic authentication when both username and password are
            configured, and optionally dumps the prettified page into the file
            named by the dump option before creating entries.
        """
        config = self.build_config(config)

        log.debug('InputPlugin html requesting url %s' % config['url'])

        if config.get('username') and config.get('password'):
            # The password is deliberately kept out of the log.
            log.debug('Basic auth enabled. User: %s' % config['username'])
            passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
            passman.add_password(None, config['url'], config['username'], config['password'])
            handlers = [urllib2.HTTPBasicAuthHandler(passman)]
        else:
            handlers = None
        page = urlopener(config['url'], log, handlers=handlers)
        soup = get_soup(page)
        log.debug('Detected encoding %s' % soup.originalEncoding)

        if 'dump' in config:
            name = config['dump']
            log.info('Dumping %s into %s' % (config['url'], name))
            data = soup.prettify()
            f = open(name, 'w')
            try:
                f.write(data)
            finally:
                # Close the handle even when the write fails.
                f.close()

        return self.create_entries(config['url'], soup, config)

    def create_entries(self, pageurl, soup, config):
        """
            Build a list of entries from the <a> tags in soup.

            :param pageurl: url of the page, used to resolve relative links
            :param soup: parsed page
            :param config: plugin configuration in dict form
            :return: list of entries, each with 'url' and 'title' set

            Raises PluginError when title_from has an unknown value and at
            least one usable link is encountered.

            Note: in 'auto' mode, once the same link text has repeated more
            than duplicate_limit times, config['title_from'] is set to 'url'
            (modifying the passed config) and the page is processed again.
        """
        import re

        queue = []
        # Titles already present in queue; avoids rescanning queue per lookup.
        seen_titles = set()
        duplicates = {}
        duplicate_limit = 4

        # These do not change while iterating the links.
        regexps = config.get('links_re', None)
        title_from = config.get('title_from', 'auto')

        def title_exists(title):
            """Helper method. Return True if title is already added to entries"""
            return title in seen_titles

        for link in soup.findAll('a'):
            # Anchors without a target or without any content are unusable.
            if not link.has_key('href'):
                continue
            if not link.contents:
                continue

            url = link['href']

            # With links_re configured the href must match at least one regexp.
            if regexps and not any(re.search(regexp, url) for regexp in regexps):
                continue

            title = link.contents[0]

            # First child is a nested tag rather than text; no usable title.
            if isinstance(title, BeautifulSoup.Tag):
                log.debugall('title is tag: %s' % title)
                continue

            if title is None:
                title = link.next.string
                if title is None:
                    continue

            # Drop zero width spaces and surrounding whitespace.
            title = title.replace(u'\u200B', u'').strip()
            if not title:
                continue

            # Make the url absolute.
            if url.startswith('//'):
                url = 'http:' + url
            elif not url.startswith(('http://', 'https://')):
                url = urlparse.urljoin(pageurl, url)

            if title_from == 'url':
                parts = urllib.splitquery(url[url.rfind('/')+1:])
                title = urllib.unquote_plus(parts[0])
                log.debug('title from url: %s' % title)
            elif title_from == 'title':
                if not link.has_key('title'):
                    safelink = link.encode('ascii', 'ignore')
                    safelink = safelink.replace('\n', '')
                    safelink = safelink.replace('\r', '')
                    log.warning('Link %s doesn\'t have title attribute, ignored.' % safelink)
                    continue
                title = link['title']
                log.debug('title from title: %s' % title)
            elif title_from == 'auto':
                if title_exists(title):
                    # Short repeated titles containing 'index' are skipped outright.
                    if 'index' in title and len(title) < 10:
                        continue
                    duplicates.setdefault(title, 0)
                    duplicates[title] += 1
                    if duplicates[title] > duplicate_limit:
                        log.info('Link names seem to be useless, auto-enabling \'title_from: url\'. This may not work well, you might need to configure it.')
                        config['title_from'] = 'url'
                        # Start over; entries collected so far are discarded.
                        return self.create_entries(pageurl, soup, config)
            elif title_from == 'link' or title_from == 'contents':
                # Link text is used as is.
                log.debug('title from link: %s' % title)
            else:
                raise PluginError('Unknown title_from value %s' % title_from)

            # Cut the title at '.torrent' unless the title starts with it.
            torrent_pos = title.lower().find('.torrent')
            if torrent_pos > 0:
                title = title[:torrent_pos]

            if title_exists(title):
                # Title conflict: append a CRC32 of the url to make it unique.
                url_crc = zlib.crc32(url.encode("utf-8"))
                crc32 = '%08X' % (url_crc & 0xFFFFFFFF)
                title = '%s [%s]' % (title, crc32)
                # Still taken: same title and same url were already added.
                if title_exists(title):
                    continue
                log.debug('uniqued title to %s' % title)

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            queue.append(entry)
            # Record the value as stored in the entry so lookups stay
            # equivalent to comparing against entry['title'].
            seen_titles.add(entry['title'])

        return queue
211
# Register InputHtml with the plugin framework under the name 'html'.
# NOTE(review): api_ver=2 presumably selects the (feed, config) handler
# signature used by on_feed_input -- confirm against flexget.plugin.
register_plugin(InputHtml, 'html', api_ver=2)