flexget.plugins.input.html
Covered: 114 lines
Missed: 62 lines
Skipped 54 lines
Percent: 64 %
  1
import urlparse
  2
import logging
  3
import BeautifulSoup
  4
import urllib
  5
import urllib2
  6
import zlib
  7
import re
  8
from flexget.entry import Entry
  9
from flexget.plugin import register_plugin, internet, PluginError
 10
from flexget.utils.soup import get_soup
 11
from flexget.utils.cached_input import cached
 12
from flexget.utils.tools import urlopener
 14
# Module-level logger shared by everything in this plugin; messages are
# emitted under the logger name 'html'.
log = logging.getLogger('html')
 17
class InputHtml(object):
    """
        Parses urls from html page. Useful on sites which have direct download
        links of any type (mp3, jpg, torrent, ...).

        Many anime-fansubbers do not provide RSS-feed, this works well in many cases.

        Configuration is either a plain url string or a dict with these keys:

        url         page to parse (required)
        username    basic authentication user name
        password    basic authentication password
        dump        name of a file the prettified page is written to
        title_from  how entry titles are derived: auto (default), url,
                    title, link or contents
        links_re    list of regexps; a link is accepted only if its url
                    matches at least one of them

        Note: This returns ALL links on url so you need to configure filters
        to match only to desired content.
    """

    def validator(self):
        """Build and return the validator describing the accepted configuration."""
        from flexget import validator
        root = validator.factory()
        # Simple form: just the url as text.
        root.accept('text')
        # Advanced form: a dict with the keys listed in the class docstring.
        advanced = root.accept('dict')
        advanced.accept('url', key='url', required=True)
        advanced.accept('text', key='username')
        advanced.accept('text', key='password')
        advanced.accept('text', key='dump')
        advanced.accept('text', key='title_from')
        regexps = advanced.accept('list', key='links_re')
        regexps.accept('regexp')
        return root

    def build_config(self, config):
        """Normalize the plugin configuration into its dict form.

        A bare string is treated as the url. Basic authentication embedded in
        the url (``http://user:pass@host/``) is moved into the ``username``
        and ``password`` keys and removed from ``url``.

        Note: when ``config`` is already a dict it is modified in place.

        :param config: url string or configuration dict
        :return: configuration dict
        """
        if isinstance(config, basestring):
            config = {'url': config}

        parts = list(urlparse.urlsplit(config['url']))
        # Split on the LAST '@' so that an '@' inside the password does not
        # end up being treated as the start of the host name.
        netloc = parts[1].rsplit('@', 1)
        if len(netloc) > 1:
            # Split on the FIRST ':' only, the password itself may contain colons.
            auth = netloc[0].split(':', 1)
            if len(auth) == 2:
                config['username'], config['password'] = auth
            else:
                log.warning('Invalid basic authentication in url: %s' % config['url'])
            # The credentials are stripped from the url even when they were invalid.
            parts[1] = netloc[1]
            config['url'] = urlparse.urlunsplit(parts)
        return config

    @cached('html')
    @internet(log)
    def on_feed_input(self, feed, config):
        """Fetch the configured page and return the entries found on it.

        :param feed: feed being executed (not used by this method)
        :param config: url string or configuration dict
        :return: list of entries built by :meth:`create_entries`
        """
        config = self.build_config(config)

        log.debug('InputPlugin html requesting url %s' % config['url'])

        if config.get('username') and config.get('password'):
            # Never write the password into the log, only the user name.
            log.debug('Basic auth enabled. User: %s' % config['username'])
            passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
            passman.add_password(None, config['url'], config['username'], config['password'])
            handlers = [urllib2.HTTPBasicAuthHandler(passman)]
        else:
            handlers = None
        page = urlopener(config['url'], log, handlers=handlers)
        soup = get_soup(page)
        log.debug('Detected encoding %s' % soup.originalEncoding)

        # Optionally dump the prettified page into a file for troubleshooting.
        if 'dump' in config:
            name = config['dump']
            log.info('Dumping %s into %s' % (config['url'], name))
            data = soup.prettify()
            f = open(name, 'w')
            try:
                f.write(data)
            finally:
                # Close the file even when the write fails.
                f.close()

        return self.create_entries(config['url'], soup, config)

    def _title_from_link(self, link, log_link):
        """Return the title taken from the first child of ``link``.

        :param link: anchor tag
        :param log_link: printable form of the link url, used in log messages
        :return: the title, or None when it cannot be determined
        """
        if not link.contents:
            log.debug('link %s has no content, cannot get title' % log_link)
            return None
        title = link.contents[0]
        # A nested tag (for example an image) cannot be used as a title.
        if isinstance(title, BeautifulSoup.Tag):
            log.debug('link %s content is tag, cannot get title' % log_link)
            return None
        # Longshot: fall back to the string of the next parsed element.
        if title is None:
            title = link.next.string
            if title is None:
                log.debug('longshot failed for %s' % log_link)
                return None
        return title

    def _title_from_url(self, url):
        """Return the unquoted last path segment of ``url``, without the query string."""
        parts = urllib.splitquery(url[url.rfind('/') + 1:])
        title = urllib.unquote_plus(parts[0])
        return title

    def create_entries(self, page_url, soup, config):
        """Create one entry for every acceptable link found in ``soup``.

        Links without a ``href`` attribute or without content are skipped, as
        are links not matching any of the ``links_re`` regexps (when given).
        When ``title_from`` is ``auto`` and the same link text keeps repeating,
        ``config['title_from']`` is switched to ``url`` or ``title`` and the
        whole page is processed again with that setting.

        :param page_url: url of the page, relative links are resolved against it
        :param soup: parsed page
        :param config: configuration dict, ``title_from`` may be modified
        :return: list of entries with ``url`` and ``title`` set
        :raises PluginError: when ``title_from`` has an unknown value
        """
        entries = []
        # Titles of the entries created so far, kept in a set so that the
        # duplicate checks do not have to scan the whole entry list.
        seen_titles = set()
        duplicates = {}
        duplicate_limit = 4

        # Neither of these changes while iterating: the only place modifying
        # config restarts the whole method right after doing so.
        regexps = config.get('links_re', None)
        title_from = config.get('title_from', 'auto')

        for link in soup.findAll('a'):
            # has_key is the attribute test of the BeautifulSoup 3 Tag API.
            if not link.has_key('href'):
                continue
            if not link.contents:
                continue

            url = link['href']
            # Version of the url that is safe to put on a single log line.
            log_link = url.replace('\n', '').replace('\r', '')

            # Turn protocol-relative and relative links into absolute urls.
            if url.startswith('//'):
                url = 'http:' + url
            elif not url.startswith(('http://', 'https://')):
                url = urlparse.urljoin(page_url, url)

            # Honour links_re: at least one regexp must match the url.
            if regexps and not any(re.search(regexp, url) for regexp in regexps):
                continue

            if title_from == 'url':
                title = self._title_from_url(url)
                log.debug('title from url: %s' % title)
            elif title_from == 'title':
                if not link.has_key('title'):
                    log.warning('Link `%s` doesn\'t have title attribute, ignored.' % log_link)
                    continue
                title = link['title']
                log.debug('title from title: %s' % title)
            elif title_from == 'auto':
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                # Automatic mode: detect pages where the link text is useless
                # because the same text is used over and over again.
                if title in seen_titles:
                    # Short titles containing 'index' are ignored altogether.
                    if 'index' in title and len(title) < 10:
                        log.debug('ignored index title %s' % title)
                        continue
                    duplicates[title] = duplicates.get(title, 0) + 1
                    if duplicates[title] > duplicate_limit:
                        # Prefer the url as the title source unless it looks
                        # like a page name, in which case use the title attribute.
                        from_url = self._title_from_url(url)
                        switch_to = 'url'
                        for ext in ('.html', '.php'):
                            if from_url.endswith(ext):
                                switch_to = 'title'
                        log.info('Link names seem to be useless, auto-configuring \'title_from: %s\'. '
                                 'This may not work well, you might need to configure it yourself.' % switch_to)
                        config['title_from'] = switch_to
                        # Start over with the new setting.
                        return self.create_entries(page_url, soup, config)
            elif title_from == 'link' or title_from == 'contents':
                title = self._title_from_link(link, log_link)
                if title is None:
                    continue
                log.debug('title from link: %s' % title)
            else:
                raise PluginError('Unknown title_from value %s' % title_from)

            if not title:
                log.debug('title could not be determined for %s' % log_link)
                continue

            # Remove zero width spaces and surrounding whitespace.
            title = title.replace(u'\u200B', u'').strip()

            # Cut the title at '.torrent' (unless the title starts with it).
            if title.lower().find('.torrent') > 0:
                title = title[:title.lower().find('.torrent')]

            if title in seen_titles:
                # Make the title unique by appending the CRC32 of the url.
                checksum = zlib.crc32(url.encode("utf-8"))
                crc32 = '%08X' % (checksum & 0xFFFFFFFF)
                title = '%s [%s]' % (title, crc32)
                # Same title and same url: a true duplicate, skip it.
                if title in seen_titles:
                    continue
                log.debug('uniqued title to %s' % title)

            entry = Entry()
            entry['url'] = url
            entry['title'] = title

            entries.append(entry)
            seen_titles.add(title)

        return entries
229
# Register InputHtml under the plugin name 'html', declaring plugin API version 2.
register_plugin(InputHtml, 'html', api_ver=2)