8
from flexget.entry import Entry
9
from flexget.plugin import register_plugin, internet, PluginError
10
from flexget.utils.soup import get_soup
11
from flexget.utils.cached_input import cached
12
from flexget.utils.tools import urlopener
14
# Module-level logger for this plugin, registered under the logger name 'html'.
log = logging.getLogger('html')
17
class InputHtml(object):
19
Parses urls from html page. Useful on sites which have direct download
20
links of any type (mp3, jpg, torrent, ...).
22
Many anime fansubbers do not provide an RSS feed; this works well in many cases.
24
Configuration expects url parameter.
26
Note: This returns ALL links on url so you need to configure filters
27
to match only to desired content.
31
# Builds the configuration schema for the plugin.
# NOTE(review): the `def` line enclosing these statements is not present in
# this extract, and the embedded original numbering skips 33 and ends at 41,
# so at least one statement (and presumably a return of `root`) is missing
# here -- confirm against the full file.
from flexget import validator
32
# Root validator; the accepted configuration shapes are attached to it.
root = validator.factory()
34
# Dict form of the configuration; the keyed options below are declared on it.
advanced = root.accept('dict')
35
# 'url' is the only key marked as required.
advanced.accept('url', key='url', required=True)
36
# Optional credentials, read later for HTTP basic authentication.
advanced.accept('text', key='username')
37
advanced.accept('text', key='password')
38
# Optional file name; the fetched page is dumped there when set.
advanced.accept('text', key='dump')
39
# Optional title strategy; the values handled by create_entries are
# 'url', 'title', 'auto', 'link' and 'contents'.
advanced.accept('text', key='title_from')
40
# Optional list of regular expressions; only links matching one are kept.
regexps = advanced.accept('list', key='links_re')
41
regexps.accept('regexp')
44
def build_config(self, config):
    """Normalise the plugin configuration into dict form.

    A plain string is treated as the url. Basic-auth credentials embedded in
    the url (``scheme://user:password@host/...``) are moved into the
    ``username`` and ``password`` keys and stripped from ``config['url']``.

    NOTE(review): rebuilt from an extract that dropped the conditional lines
    of the nested helper as well as the trailing call/return; the structure
    below is inferred from the surviving statements -- confirm against the
    full file.

    :param config: url string, or dict containing at least ``url``
    :return: configuration dict (a dict passed in is modified in place)
    """

    def get_auth_from_url():
        """Moves basic authentication from url to username and password fields"""
        parts = list(urlparse.urlsplit(config['url']))
        # The host follows the LAST '@'; splitting there keeps passwords that
        # themselves contain '@' intact.
        userinfo, at_sign, host = parts[1].rpartition('@')
        if not at_sign:
            # no credentials embedded in the url
            return
        # Only the FIRST ':' separates user from password, so a password may
        # contain further colons.
        username, colon, password = userinfo.partition(':')
        if colon:
            config['username'], config['password'] = username, password
        else:
            log.warning('Invalid basic authentication in url: %s' % config['url'])
        # Strip the userinfo part from the url in either case.
        parts[1] = host
        config['url'] = urlparse.urlunsplit(parts)

    if isinstance(config, basestring):
        config = {'url': config}
    get_auth_from_url()
    return config
66
def on_feed_input(self, feed, config):
    """Fetch the configured page and return the entries built from its links.

    NOTE(review): rebuilt from an extract that dropped several lines. The
    default for ``handlers``, the ``get_soup(page)`` call and the dump-file
    handling are inferred from the names the surviving statements use.
    ``cached`` and ``internet`` are imported by the module but no decorator
    lines survive in the extract -- confirm against the full file.

    :param feed: feed being processed (not used directly here)
    :param config: url string or configuration dict
    :return: result of :meth:`create_entries` for the fetched page
    """
    config = self.build_config(config)

    log.debug('InputPlugin html requesting url %s' % config['url'])

    # NOTE(review): assumes urlopener accepts handlers=None when no
    # authentication is configured -- confirm.
    handlers = None
    if config.get('username') and config.get('password'):
        # Credentials must never reach the log file: only the user name is logged.
        log.debug('Basic auth enabled. User: %s' % config['username'])
        passman = urllib2.HTTPPasswordMgrWithDefaultRealm()
        passman.add_password(None, config['url'], config['username'], config['password'])
        handlers = [urllib2.HTTPBasicAuthHandler(passman)]

    page = urlopener(config['url'], log, handlers=handlers)
    soup = get_soup(page)
    log.debug('Detected encoding %s' % soup.originalEncoding)

    # dump received content into a file
    if 'dump' in config:
        name = config['dump']
        log.info('Dumping %s into %s' % (config['url'], name))
        data = soup.prettify()
        # `with` guarantees the file is closed even if the write fails
        with open(name, 'w') as dump_file:
            dump_file.write(data)

    return self.create_entries(config['url'], soup, config)
93
def _title_from_link(self, link, log_link):
# Tries to derive a title from the content of the anchor tag `link`.
# `log_link` is only used inside the debug messages.
# NOTE(review): the embedded original numbering skips 95, 98, 100, 102 and
# 104-106, so the control flow around the "longshot" and the return
# statements are missing from this extract. Callers compare the result
# against None, so the function presumably returns the title or None --
# confirm against the full file.
94
# First child of the anchor: either text or a nested tag.
title = link.contents[0]
96
# A nested tag (e.g. an image inside the link) cannot be used as a title.
if isinstance(title, BeautifulSoup.Tag):
97
log.debug('link %s content is tag, cannot get title' % log_link)
99
# longshot from next element (?)
101
# Fall back to the string of the element that follows the link tag.
title = link.next.string
103
log.debug('longshot failed for %s' % log_link)
107
def _title_from_url(self, url):
    """Derive a title from the file-name part of *url*.

    Takes the text after the last ``/``, drops everything from the last
    ``?`` onwards and URL-decodes the remainder (``+`` becomes a space).

    :param url: link address
    :return: decoded file-name portion of the url
    """
    # Imported locally so the helper runs on Python 2 and Python 3 alike.
    try:
        from urllib import unquote_plus
    except ImportError:
        from urllib.parse import unquote_plus

    filename = url[url.rfind('/') + 1:]
    # Split at the LAST '?', the same point urllib.splitquery used.
    path, question_mark, _query = filename.rpartition('?')
    if not question_mark:
        # no query string at all: rpartition put everything in the tail
        path = filename
    # Callers use the result, so it must be returned.
    return unquote_plus(path)
112
def create_entries(self, page_url, soup, config):
    """Build one entry per usable ``<a>`` tag found in *soup*.

    Links without ``href`` or without content are skipped. When
    ``links_re`` is configured, only urls matching one of the expressions
    are kept. The title is chosen according to ``title_from`` ('url',
    'title', 'auto', 'link' or 'contents'; default 'auto'). In 'auto' mode,
    too many duplicate link texts make the method set
    ``config['title_from']`` and start over. Remaining duplicate titles get
    a CRC32 of the url appended.

    NOTE(review): rebuilt from an extract that dropped many lines. The
    ``continue`` statements, the initial values of ``queue`` /
    ``duplicates`` / ``duplicate_limit``, the default of ``switch_to`` and
    the Entry construction are inferred from the surviving statements and
    comments -- confirm against the full file.

    :param page_url: address of the page, used to resolve relative links
    :param soup: parsed page
    :param config: configuration dict (``title_from`` may be modified)
    :return: list of entries
    :raises PluginError: on an unknown ``title_from`` value
    """
    queue = []
    duplicates = {}
    # NOTE(review): the actual limit is not visible in the extract; 4 is an
    # assumption -- confirm.
    duplicate_limit = 4

    def title_exists(title):
        """Helper method. Return True if title is already added to entries"""
        return any(entry['title'] == title for entry in queue)

    for link in soup.findAll('a'):
        # not a valid link
        if not link.has_key('href'):
            continue
        # no content in the link
        if not link.contents:
            continue

        url = link['href']
        # single-line form of the raw href for log messages
        log_link = url.replace('\n', '').replace('\r', '')

        # urljoin resolves scheme-relative ('//host/x') and relative links
        # against the page address and returns absolute urls unchanged. The
        # previous test `not startswith('http://') or not
        # startswith('https://')` was always true, so it is dropped.
        url = urlparse.urljoin(page_url, url)

        # get only links matching regexp
        regexps = config.get('links_re', None)
        if regexps and not any(re.search(regexp, url) for regexp in regexps):
            continue

        title_from = config.get('title_from', 'auto')
        if title_from == 'url':
            title = self._title_from_url(url)
            log.debug('title from url: %s' % title)
        elif title_from == 'title':
            if not link.has_key('title'):
                log.warning('Link `%s` doesn\'t have title attribute, ignored.' % log_link)
                continue
            title = link['title']
            log.debug('title from title: %s' % title)
        elif title_from == 'auto':
            title = self._title_from_link(link, log_link)
            if title is None:
                continue
            # automatic mode, check if title is unique
            # if there are too many duplicate titles, switch to title_from: url
            if title_exists(title):
                # ignore index links as a counter
                if 'index' in title and len(title) < 10:
                    log.debug('ignored index title %s' % title)
                    continue
                duplicates.setdefault(title, 0)
                duplicates[title] += 1
                if duplicates[title] > duplicate_limit:
                    # if from url seems to be bad choice use title
                    from_url = self._title_from_url(url)
                    switch_to = 'url'
                    for ext in ('.html', '.php'):
                        if from_url.endswith(ext):
                            switch_to = 'title'
                    log.info('Link names seem to be useless, auto-configuring \'title_from: %s\'. '
                             'This may not work well, you might need to configure it yourself.' % switch_to)
                    config['title_from'] = switch_to
                    # start from the beginning ...
                    return self.create_entries(page_url, soup, config)
        elif title_from == 'link' or title_from == 'contents':
            # link from link name
            title = self._title_from_link(link, log_link)
            if title is None:
                continue
            log.debug('title from link: %s' % title)
        else:
            raise PluginError('Unknown title_from value %s' % title_from)

        if not title:
            log.debug('title could not be determined for %s' % log_link)
            continue

        # strip unicode white spaces
        title = title.replace(u'\u200B', u'').strip()

        # in case the title contains xxxxxxx.torrent - foooo.torrent clean it a bit (get up to first .torrent)
        torrent_pos = title.lower().find('.torrent')
        if torrent_pos > 0:
            title = title[:torrent_pos]

        if title_exists(title):
            # title link should be unique, add CRC32 to end if it's not
            checksum = zlib.crc32(url.encode("utf-8"))
            # mask to 32 bits: crc32 may be negative on Python 2
            crc32 = '%08X' % (checksum & 0xFFFFFFFF)
            title = '%s [%s]' % (title, crc32)
            # truly duplicate, title + url crc already exists in queue
            if title_exists(title):
                continue
            log.debug('uniqued title to %s' % title)

        entry = Entry()
        entry['url'] = url
        entry['title'] = title
        queue.append(entry)

    # add from queue to feed
    return queue
229
# Register the InputHtml class as the plugin named 'html', passing api_ver=2.
register_plugin(InputHtml, 'html', api_ver=2)