6
from flexget.utils.soup import get_soup
7
from BeautifulSoup import NavigableString, Tag
9
log = logging.getLogger('utils.imdb')
13
"""Return IMDb ID of the given URL. Return None if not valid or if URL is not a string."""
14
if not isinstance(url, basestring):
16
m = re.search(r'((?:nm|tt)[\d]{7})', url)
22
"""Return IMDb URL of the given ID"""
23
return u'http://www.imdb.com/title/%s/' % imdb_id
26
class ImdbSearch(object):
29
# de-prioritize aka matches a bit
30
self.aka_weight = 0.95
31
# prioritize popular matches a bit by deprioritizing others
32
self.unpopular_weight = 0.85
33
# de-prioritize tv results
36
self.first_weight = 1.02
41
self.remove = ['imax']
43
self.ignore_types = ['VG']
45
def ireplace(self, str, old, new, count=0):
    """Case insensitive string replace.

    Replaces occurrences of ``old`` in ``str`` with ``new``, ignoring case.
    Both ``old`` and ``new`` are treated as literal text.

    :param str: text to search in
    :param old: literal text to look for (regex metacharacters are escaped)
    :param new: literal replacement text
    :param count: maximum number of replacements, 0 replaces every occurrence
    :return: the text with the replacements applied
    """
    pattern = re.compile(re.escape(old), re.I)
    # Use a callable replacement so `new` is inserted verbatim. A plain
    # string replacement would have its backslash escapes and group
    # references (e.g. "\1", "\n") expanded by the re module.
    return pattern.sub(lambda match: new, str, count)
50
def smart_match(self, raw_name):
# Feeds a messy release name to MovieParser and returns whatever
# best_match() yields for the resulting name and year.
# NOTE(review): the bare integers in this listing look like line numbers
# carried over from the original file. They jump from 54 to 59 and skip 60,
# so part of this method is not visible here.
51
"""Accepts messy name, cleans it and uses information available to make smartest and best match"""
52
# imported inside the method rather than at module level -- presumably to
# avoid an import cycle; confirm
from flexget.utils.titles.movie import MovieParser
53
parser = MovieParser()
54
# hand the raw name to the parser via its `data` attribute
parser.data = raw_name
59
# NOTE(review): the condition guarding this critical log (and whatever
# follows it) is not visible in this excerpt.
log.critical('Failed to parse name from %s' % raw_name)
61
# NOTE(review): `name` and `year` are bound on lines not visible here --
# presumably taken from the parser after parsing; confirm.
log.debug('smart_match name=%s year=%s' % (name, str(year)))
62
# the actual IMDb lookup and candidate selection happens in best_match()
return self.best_match(name, year)
64
def best_match(self, name, year=None):
# Runs search(name), filters the candidates by year, minimum match ratio
# and ignored type, then decides whether a single winner can be picked.
# NOTE(review): the bare integers in this listing look like original line
# numbers; several lines (the removals, the early returns, the final loop
# header and return) are not visible in this excerpt.
65
"""Return single movie that best matches name criteria or None"""
66
movies = self.search(name)
69
# NOTE(review): the emptiness check guarding this message is not visible.
log.debug('search did not return any movies')
72
# remove all movies below min_match, and different year
73
# iterate over a shallow copy (movies[:]); the log messages below indicate
# entries are removed from `movies` during the loop, although the removal
# statements themselves are not visible here
for movie in movies[:]:
74
# the year filter only applies when both the caller's year and the
# candidate's year are truthy
if year and movie.get('year'):
75
# compared as strings: the candidate's year is checked against str(year)
if movie['year'] != str(year):
76
log.debug('best_match removing %s - %s (wrong year: %s)' % (movie['name'], movie['url'], str(movie['year'])))
79
# candidates scoring below the configured minimum ratio
if movie['match'] < self.min_match:
80
log.debug('best_match removing %s (min_match)' % movie['name'])
83
# candidates whose type is listed in self.ignore_types
if movie.get('type', None) in self.ignore_types:
84
log.debug('best_match removing %s (ignored type)' % movie['name'])
89
# NOTE(review): the conditions guarding the FAILURE/SUCCESS messages and
# the corresponding return statements are not visible in this excerpt.
log.debug('FAILURE: no movies remain')
92
# if only one remains ..
94
log.debug('SUCCESS: only one movie remains')
97
# check min difference between best two hits
98
# relies on movies[0] and movies[1] being the two highest-scoring
# candidates; search() sorts its result by 'match' in descending order
diff = movies[0]['match'] - movies[1]['match']
99
# too close to call when the gap is smaller than self.min_diff
if diff < self.min_diff:
100
log.debug('unable to determine correct movie, min_diff too small (`%s` <-?-> `%s`)' %
101
(movies[0], movies[1]))
103
# NOTE(review): `m` is bound by a loop header that is not visible here --
# presumably iterating the remaining movies; confirm.
log.debug('remain: %s (match: %s) %s' % (m['name'], m['match'], m['url']))
108
def search(self, name):
# Queries IMDb's find page for `name`, scrapes the result sections and
# scores every candidate in movie['match']; the collected list is sorted
# best match first at the end.
# NOTE(review): the bare integers in this listing look like original line
# numbers. Many lines are not visible here (try/except/else headers,
# `continue` statements, creation of `movies` / `movie`, the first
# assignment of `ratio`, the return statements), so the notes below only
# describe what the visible lines establish.
109
"""Return array of movie details (dict)"""
110
log.debug('Searching: %s' % name)
112
# first attempt: the query is encoded strictly as latin1
url = u'http://www.imdb.com/find?' + urllib.urlencode({'q': name.encode('latin1'), 's': 'all'})
114
# fallback: the same URL, but characters that cannot be encoded as latin1
# are dropped ('ignore'). NOTE(review): the try/except that selects this
# branch is not visible in this excerpt.
log.warning('Problems with encoding %s, string possibly corrupted? Ignoring troublesome characters.' % name)
115
url = u'http://www.imdb.com/find?' + urllib.urlencode({'q': name.encode('latin1', 'ignore'), 's': 'all'})
117
# NOTE(review): 'Serch' in the message below is a typo for 'Search'; it is
# runtime output, so it is left untouched here.
log.debug('Serch query: %s' % repr(url))
118
page = urllib2.urlopen(url)
119
# URL of the page that was actually retrieved, i.e. after any redirect
actual_url = page.geturl()
122
# in case we got redirected to movie page (perfect match)
123
re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
125
# keep only the part of the URL up to and including /title/tt<digits>/
# NOTE(review): the `if re_m:` style guard is not visible here.
actual_url = re_m.group(0)
126
log.debug('Perfect hit. Search got redirected to %s' % actual_url)
130
# NOTE(review): the creation of this `movie` dict and what is done with it
# afterwards (other keys, appending, returning) are not visible here.
movie['url'] = actual_url
131
movie['imdb_id'] = extract_id(actual_url)
132
movie['year'] = None # skips year check
136
# per the original author, the page declares a wrong encoding
137
soup = get_soup(page)
139
# headings of the result sections to scrape; the first entry is the
# "popular" section that the weighting further down treats specially
sections = ['Popular Titles', 'Titles (Exact Matches)',
140
'Titles (Partial Matches)', 'Titles (Approx Matches)']
142
for section in sections:
143
# each heading is looked up as the text of a <b> tag
section_tag = soup.find('b', text=section)
145
log.debug('section %s not found' % section)
147
log.debug('processing section %s' % section)
149
# the results table is taken to be the next sibling of the heading's
# grandparent; AttributeError covers a missing link in that chain
section_table = section_tag.parent.parent.nextSibling
150
except AttributeError:
151
log.debug('Section %s does not have a table?' % section)
154
# every anchor in the table whose href contains /title/tt
links = section_table.findAll('a', attrs={'href': re.compile(r'/title/tt')})
156
log.debug('section %s does not have links' % section)
157
for count, link in enumerate(links):
158
# skip links with div as a parent (not movies, somewhat rare links in additional details)
159
if link.parent.name == u'div':
162
# skip links without text value, these are small pictures before title
163
if len(link.contents) == 1 and not isinstance(link.contents[0], NavigableString):
167
# parenthesised fragments found in the node two steps after the link;
# the first is used for the year and the second for the type below
additional = re.findall(r'\((.*?)\)', link.next.next)
168
if len(additional) > 0:
169
movie['year'] = filter(unicode.isdigit, additional[0]) # strip non numbers ie. 2008/I
170
if len(additional) > 1:
171
movie['type'] = additional[1]
173
movie['name'] = unicode(link.contents[0])
174
# hrefs on the page are site-relative, so the host is prepended
movie['url'] = 'http://www.imdb.com' + link.get('href')
175
movie['imdb_id'] = extract_id(movie['url'])
176
log.debug('processing name: %s url: %s' % (movie['name'], movie['url']))
178
# calc & set best matching ratio
179
# the lambda marks spaces as junk for SequenceMatcher; both names are
# title-cased before comparison.
# NOTE(review): the first assignment of `ratio` (presumably seq.ratio())
# is on a line not visible here; confirm.
seq = difflib.SequenceMatcher(lambda x: x == ' ', movie['name'].title(), name.title())
182
# deprioritize tv results
183
if movie.get('type') == 'TV':
184
log.debug('deprioritize tv')
185
ratio = ratio * self.tv_weight
187
# check if some of the akas have better ratio
188
for aka in link.parent.findAll('p', attrs={'class': 'find-aka'}):
189
# the loop variable is rebound from the tag to its text
aka = aka.next.string
190
# the alternative title is expected between double quotes
match = re.search(r'".*"', aka)
192
log.debug('aka `%s` is invalid' % aka)
194
aka = match.group(0).replace('"', '')
195
# NOTE(review): `trace` is not a standard logging.Logger method --
# presumably added by the project's logging setup; confirm.
log.trace('processing aka %s' % aka)
196
seq = difflib.SequenceMatcher(lambda x: x == ' ', aka.title(), name.title())
197
aka_ratio = seq.ratio()
198
# NOTE(review): the comparison uses the unweighted aka ratio, while the
# value stored is aka_ratio * aka_weight, so the new `ratio` can end up
# lower than the one it replaces -- confirm this is intended.
if aka_ratio > ratio:
199
ratio = aka_ratio * self.aka_weight
200
log.debug('- aka `%s` matches better to `%s` ratio %s (weighted to %s)' %
201
(aka, name, aka_ratio, ratio))
203
# prioritize popular titles
204
# every section except the first one in `sections` is scaled down
if section != sections[0]:
205
ratio = ratio * self.unpopular_weight
207
# NOTE(review): 'priorizing' in the message below is a typo; it is runtime
# output, so it is left untouched here.
log.debug('- priorizing popular %s' % movie['url'])
209
# prioritize first item
# NOTE(review): the condition selecting the first hit is not visible here
# -- presumably based on `count` from enumerate(); confirm.
211
log.debug('- prioritizing first hit `%s`' % movie['url'])
212
ratio = ratio * self.first_weight
215
# final weighted score for this candidate
movie['match'] = ratio
218
# best match first (descending by score)
movies.sort(key=lambda x: x['match'], reverse=True)
222
class ImdbParser(object):
223
"""Quick-hack to parse relevant imdb details"""
233
self.plot_outline = None
238
self.mpaa_rating = ''
241
return '<ImdbParser(name=%s,imdb_id=%s)>' % (self.name, self.imdb_id)
243
def parse(self, imdb_id):
# Fetches the IMDb title page for `imdb_id` and scrapes details from it into
# attributes of this object (photo, mpaa_rating, name, votes, score,
# genres, languages, year, actors, directors, plot_outline).
# Raises ValueError('Invalid url ...') -- the condition that triggers it is
# not visible in this excerpt.
# NOTE(review): the bare integers in this listing look like original line
# numbers. Many lines are not visible here (try/except/if/else headers in
# particular), so the notes below only describe what the visible lines
# establish.
244
# `imdb_id` is normalised through extract_id() before the URL is built
self.imdb_id = extract_id(imdb_id)
245
url = make_url(self.imdb_id)
248
page = urllib2.urlopen(url)
250
raise ValueError('Invalid url %s' % url)
252
soup = get_soup(page)
255
# poster: src of the <img> inside <div class="photo">
tag_photo = soup.find('div', attrs={'class': 'photo'})
257
tag_img = tag_photo.find('img')
259
self.photo = tag_img.get('src')
260
log.debug('Detected photo: %s' % self.photo)
262
# get rating. Always the first absmiddle.
263
tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
265
tag_mpaa_rating = tag_infobar_div.find('img', attrs={'class': 'absmiddle'})
267
# alt and title are expected to agree; a mismatch only produces a warning
if tag_mpaa_rating['alt'] != tag_mpaa_rating['title']:
268
# If we've found something of class absmiddle in the infobar,
269
# it should be mpaa_rating, since that's the only one in there.
270
log.warning("MPAA rating alt and title don't match for URL %s - plugin needs an update?" % url)
272
# the rating is taken from the image's alt text
self.mpaa_rating = tag_mpaa_rating['alt']
273
log.debug('Detected mpaa rating: %s' % self.mpaa_rating)
275
log.debug('Unable to match signature of mpaa rating for %s - could be a TV episode, or plugin needs update?' % url)
277
# We should match the infobar, it's an integral part of the IMDB page.
278
log.warning('Unable to get infodiv class for %s - plugin needs update?' % url)
282
# title: text of the node following the first <h1>
tag_name = soup.find('h1')
285
# Handle a page not found in IMDB. tag_name.string is
286
# "<br/> Page Not Found" and there is no next tag. Thus, None.
287
if tag_name.next.string is not None:
288
self.name = tag_name.next.string.strip()
289
log.debug('Detected name: %s' % self.name)
291
log.warning('Unable to get name for %s - plugin needs update?' % url)
294
# detect if movie is eligible for ratings
295
rating_ineligible = soup.find('div', attrs={'class': 'rating-ineligible'})
296
if rating_ineligible:
297
log.debug('movie is not eligible for ratings')
300
# vote count: element carrying itemprop="ratingCount"
tag_votes = soup.find(itemprop='ratingCount')
302
# keep digits only, dropping any other characters in the count text
str_votes = ''.join(c for c in tag_votes.string if c.isdigit())
303
self.votes = int(str_votes)
304
log.debug('Detected votes: %s' % self.votes)
306
log.warning('Unable to get votes for %s - plugin needs update?' % url)
309
# score: element carrying itemprop="ratingValue"
span_score = soup.find(itemprop='ratingValue')
312
self.score = float(span_score.string)
314
# NOTE(review): `b_score` is not assigned anywhere in the visible code; the
# lookup above is bound to `span_score`. If no hidden line defines it,
# reaching this statement raises NameError -- confirm and fix.
log.debug('tag_score %s is not valid float' % b_score.contents[0])
315
log.debug('Detected score: %s' % self.score)
317
log.warning('Unable to get score for %s - plugin needs update?' % url)
320
# genres: lower-cased text of every <a itemprop="genre">
for link in soup.findAll('a', attrs={'itemprop': 'genre'}):
321
self.genres.append(unicode(link.contents[0].lower()))
324
# languages: lower-cased text of links whose href starts with /language/
for link in soup.findAll('a', attrs={'href': re.compile('^/language/')}):
325
lang = unicode(link.contents[0].lower())
326
# NOTE(review): membership is tested with the unstripped value while the
# stripped value is stored, so an entry with surrounding whitespace may be
# appended more than once -- confirm.
if not lang in self.languages:
327
self.languages.append(lang.strip())
330
# year, first attempt: a link whose href starts with /year/<digits>
tag_year = soup.find('a', attrs={'href': re.compile('^/year/\d+')})
332
self.year = int(tag_year.contents[0])
333
log.debug('Detected year: %s' % self.year)
335
# year, second attempt: a <span> whose text looks like "(Video 2008)" or
# "(TV 2008)"; the first four-digit run is extracted from it
tag_year = soup.find('span', text=re.compile(r'^\((?:Video|TV) \d+\)'))
337
m = re.search('(\d{4})', unicode(tag_year))
339
self.year = int(m.group())
340
log.debug('Detected year: %s' % self.year)
342
log.warning('Unable to get year for %s (regexp mismatch) - plugin needs update?' % url)
344
log.warning('Unable to get year for %s (tag not found) - plugin needs update?' % url)
347
# cast: person links (/name/nm...) inside the table with class cast_list,
# stored as actor id -> actor name
tag_cast = soup.find('table', 'cast_list')
349
for actor in tag_cast.findAll('a', href=re.compile('/name/nm')):
350
actor_id = extract_id(actor['href'])
351
actor_name = unicode(actor.contents[0])
352
# tag instead of name
353
# NOTE(review): actor_name has just been passed through unicode(), so this
# isinstance check against Tag looks like it can never be true -- confirm.
if isinstance(actor_name, Tag):
355
self.actors[actor_id] = actor_name
358
# directors: person links found under the grandparent of the <h4> whose
# text matches 'Director', stored as director id -> director name
h4_director = soup.find('h4', text=re.compile('Director'))
360
for director in h4_director.parent.parent.findAll('a', href=re.compile('/name/nm')):
361
director_id = extract_id(director['href'])
362
director_name = unicode(director.contents[0])
363
# tag instead of name
364
# NOTE(review): same as for actors -- director_name is already the result
# of unicode(), so the Tag check looks unreachable; confirm.
if isinstance(director_name, Tag):
366
self.directors[director_id] = director_name
368
log.debug('Detected genres: %s' % self.genres)
369
log.debug('Detected languages: %s' % self.languages)
370
# NOTE(review): directors/actors are filled by id above; if they are plain
# dicts, join() iterates the keys, so these two messages list IMDb ids
# rather than names -- confirm.
log.debug('Detected director(s): %s' % ', '.join(self.directors))
371
log.debug('Detected actors: %s' % ', '.join(self.actors))
374
# plot outline: text at the start of the first <p> after the
# 'Storyline' <h2>
h2_plot = soup.find('h2', text='Storyline')
376
p_plot = h2_plot.findNext('p')
378
self.plot_outline = p_plot.next.string.strip()
379
log.debug('Detected plot outline: %s' % self.plot_outline)
381
log.debug('Plot does not have p-tag')
383
log.debug('Failed to find plot')