7
from flexget.feed import Entry
8
from flexget.plugin import *
9
from flexget.utils.soup import get_soup
10
from flexget.utils.cached_input import cached
11
from flexget.utils.tools import urlopener
13
# Module-level logger for this plugin, registered under the name 'html'.
log = logging.getLogger('html')
16
class InputHtml(object):
18
Parses urls from html page. Useful on sites which have direct download
19
links of any type (mp3, jpg, torrent, ...).
21
Many anime-fansubbers do not provide an RSS feed; this works well in many cases.
23
Configuration expects url parameter.
25
Note: This returns ALL links on url so you need to configure filters
26
to match only the desired content.
30
# NOTE(review): the `def` line that owns these statements is not visible in
# this extract. What is visible builds a validator schema describing the
# dict form of the plugin configuration (the same keys the other methods
# read from `config`).
from flexget import validator
31
root = validator.factory()
33
# Dict form of the configuration; 'url' is the only key marked required.
advanced = root.accept('dict')
34
advanced.accept('url', key='url', required=True)
35
advanced.accept('text', key='username')
36
advanced.accept('text', key='password')
37
advanced.accept('text', key='dump')
38
advanced.accept('text', key='title_from')
39
# 'links_re' is a list that accepts 'regexp' items.
regexps = advanced.accept('list', key='links_re')
40
regexps.accept('regexp')
43
# Normalise the plugin configuration.
# A bare string is wrapped into {'url': <string>}; the nested helper moves
# credentials embedded in the url into the 'username'/'password' keys.
# This is Python 2 code: it relies on `urlparse` and `basestring`.
# NOTE(review): several statements of this method are not visible in this
# extract. In particular no guard around the credential handling, no call to
# get_auth_from_url() and no return statement are shown, although
# on_feed_input() uses the value returned by build_config().
def build_config(self, config):
45
def get_auth_from_url():
46
"""Moves basic authentication from url to username and password fields"""
47
parts = list(urlparse.urlsplit(config['url']))
48
# parts[1] is the network location of the url, e.g. 'user:pass@host'
split = parts[1].split('@')
50
# the part before '@' is expected to look like 'username:password'
auth = split[0].split(':')
52
config['username'], config['password'] = auth[0], auth[1]
54
log.warning('Invalid basic authentication in url: %s' % config['url'])
56
# write the url back from its split parts
config['url'] = urlparse.urlunsplit(parts)
58
# shorthand configuration: a plain string is treated as the url
if isinstance(config, basestring):
59
config = {'url': config}
64
# Feed input handler: normalises the configuration, requests config['url']
# (with HTTP basic authentication when both username and password are set)
# and returns whatever create_entries() builds from the parsed page.
# NOTE(review): several statements are not visible in this extract, among
# them the assignments of `soup` and `name`, the value `handlers` takes when
# no credentials are configured, and the code that writes the dump file.
@cached('html', 'url')
66
def on_feed_input(self, feed, config):
67
config = self.build_config(config)
69
log.debug('InputPlugin html requesting url %s' % config['url'])
71
if config.get('username') and config.get('password'):
72
# NOTE(review): this writes the password in clear text to the debug log.
log.debug('Basic auth enabled. User: %s Password: %s' % (config['username'], config['password']))
73
# realm None: the credentials are registered for the url regardless of realm
passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
74
passman.add_password(None, config['url'], config['username'], config['password'])
75
handlers = [urllib2.HTTPBasicAuthHandler(passman)]
78
page = urlopener(config['url'], log, handlers=handlers)
80
log.debug('Detected encoding %s' % soup.originalEncoding)
82
# dump received content into a file
85
log.info('Dumping %s into %s' % (config['url'], name))
86
# prettified markup of the parsed page is what gets dumped
data = soup.prettify()
91
return self.create_entries(config['url'], soup, config)
93
# Build entries from the <a> tags found in `soup`.
# For each link a title is chosen according to config['title_from']
# (default 'auto'; also 'url', 'title', 'link' or 'contents' -- any other
# value raises PluginError), the url is resolved against `pageurl`, and a
# title that already exists is made unique by appending a CRC32 of the url.
# NOTE(review): many statements of this method are not visible in this
# extract (e.g. where `url`, `entry`, `duplicates` and `duplicate_limit` are
# defined, the bodies of several `if` tests, and the final return), so the
# notes below only describe what is shown.
def create_entries(self, pageurl, soup, config):
99
def title_exists(title):
100
"""Helper method. Return True if title is already added to entries"""
102
# NOTE(review): the loop that binds `entry` is not visible here
if entry['title'] == title:
105
for link in soup.findAll('a'):
107
# anchors without an href attribute are tested for here; the statement
# executed in that case is not visible (presumably the link is skipped)
if not link.has_key('href'):
109
# no content in the link
110
if not link.contents:
115
# get only links matching regexp
116
regexps = config.get('links_re', None)
120
for regexp in regexps:
121
if re.search(regexp, url):
126
# the first child of the anchor is the initial title candidate
title = link.contents[0]
129
if isinstance(title, BeautifulSoup.Tag):
130
# debugall is not a stdlib Logger method; presumably added by the project
log.debugall('title is tag: %s' % title)
133
# just unable to get any decent title
135
title = link.next.string
139
# strip unicode white spaces
140
# removes ZERO WIDTH SPACE (U+200B) everywhere, then trims both ends
title = title.replace(u'\u200B', u'').strip()
146
# protocol-relative url; the statement handling it is not visible here
if url.startswith('//'):
148
# NOTE(review): this condition is always true -- no string starts with both
# 'http://' and 'https://' -- so `and` was probably intended. urljoin()
# returns an absolute url unchanged, so the visible effect is likely nil.
elif not url.startswith('http://') or not url.startswith('https://'):
149
url = urlparse.urljoin(pageurl, url)
151
title_from = config.get('title_from', 'auto')
152
if title_from == 'url':
153
# take the last path segment of the url, drop the query string and unquote it
parts = urllib.splitquery(url[url.rfind('/')+1:])
154
title = urllib.unquote_plus(parts[0])
155
log.debug('title from url: %s' % title)
156
elif title_from == 'title':
157
if not link.has_key('title'):
158
# ASCII-only, single-line rendering of the tag for the warning below
safelink = link.encode('ascii', 'ignore')
159
safelink = safelink.replace('\n', '')
160
safelink = safelink.replace('\r', '')
161
log.warning('Link %s doesn\'t have title attribute, ignored.' % safelink)
163
title = link['title']
164
log.debug('title from title: %s' % title)
165
elif title_from == 'auto':
166
# automatic mode, check if title is unique
167
# if there are too many duplicate titles, switch to title_from: url
168
if title_exists(title):
169
# ignore index links as a counter
170
if 'index' in title and len(title) < 10:
172
# count how many times this title has been seen as a duplicate
duplicates.setdefault(title, 0)
173
duplicates[title] += 1
174
if duplicates[title] > duplicate_limit:
175
log.info('Link names seem to be useless, auto-enabling \'title_from: url\'. This may not work well, you might need to configure it.')
176
# note: this mutates the config dict that was passed in
config['title_from'] = 'url'
177
# start from the beginning ...
178
return self.create_entries(pageurl, soup, config)
179
elif title_from == 'link' or title_from == 'contents':
180
# link from link name
181
log.debug('title from link: %s' % title)
184
raise PluginError('Unknown title_from value %s' % title_from)
186
# in case the title contains xxxxxxx.torrent - foooo.torrent clean it a bit (get up to first .torrent)
188
# NOTE(review): `> 0` means a title that begins with '.torrent' (index 0)
# is left untouched
if title.lower().find('.torrent') > 0:
189
title = title[:title.lower().find('.torrent')]
191
if title_exists(title):
192
# title link should be unique, add CRC32 to end if it's not
193
# NOTE(review): `hash` shadows the builtin of the same name in this scope
hash = zlib.crc32(url.encode("utf-8"))
194
# mask to an unsigned 32-bit value, rendered as 8 upper-case hex digits
crc32 = '%08X' % (hash & 0xFFFFFFFF)
195
title = '%s [%s]' % (title, crc32)
196
# truly duplicate, title + url crc already exists in queue
197
if title_exists(title):
199
log.debug('uniqued title to %s' % title)
203
entry['title'] = title
207
# add from queue to feed
211
# Register InputHtml as the plugin named 'html', declaring plugin API version 2.
register_plugin(InputHtml, 'html', api_ver=2)