flexget.utils.imdb
Covered: 144 lines
Missed: 148 lines
Skipped 92 lines
Percent: 49 %
  1
import difflib
  2
import urllib
  3
import urllib2
  4
import logging
  5
import re
  6
from flexget.utils.soup import get_soup
  7
from BeautifulSoup import NavigableString, Tag
  9
log = logging.getLogger('utils.imdb')
 12
def extract_id(url):
    """Return IMDb ID of the given URL. Return None if not valid or if URL is not a string.

    The ID is the first occurrence of ``tt`` (title) or ``nm`` (name) followed
    by seven or more digits. All consecutive digits are captured, so IDs longer
    than seven digits are returned whole instead of being cut off after the
    seventh digit.

    :param url: URL, or any string containing an IMDb ID
    :return: the ID (e.g. ``tt0111161``) or None
    """
    if not isinstance(url, basestring):
        return None
    m = re.search(r'((?:nm|tt)\d{7,})', url)
    if m:
        return m.group(1)
    return None
 21
def make_url(imdb_id):
    """Build the IMDb title page URL for the given ID.

    The ID is interpolated as-is; it is not validated here.
    """
    title_url_template = u'http://www.imdb.com/title/%s/'
    return title_url_template % imdb_id
 26
class ImdbSearch(object):
    """Search IMDb by title and pick the result that best matches a name.

    Every search hit gets a ``match`` score: the difflib similarity between
    the hit's title and the searched name, multiplied by the weights
    configured in ``__init__``.
    """

    def __init__(self):
        # multiplier applied when an alternative title (aka) provides the score
        self.aka_weight = 0.95
        # multiplier for hits found outside the first ('Popular Titles') section
        self.unpopular_weight = 0.85
        # multiplier for hits whose type is 'TV'
        self.tv_weight = 0.75
        # multiplier for the link at enumerate index 1 of a section (see search)
        self.first_weight = 1.02
        # best_match drops hits scoring below this
        self.min_match = 0.5
        # best_match gives up when the two best hits are closer than this
        self.min_diff = 0.01
        # debug and remove are not read anywhere inside this class
        self.debug = False
        self.remove = ['imax']
        # best_match drops hits whose 'type' is listed here
        self.ignore_types = ['VG']

    def ireplace(self, str, old, new, count=0):
        """Case insensitive string replace

        :param str: text to operate on
        :param old: literal (not regexp) text to look for, in any letter case
        :param new: replacement text
        :param count: maximum number of replacements, 0 replaces all
        :return: the resulting string
        """
        pattern = re.compile(re.escape(old), re.I)
        return re.sub(pattern, new, str, count)

    def smart_match(self, raw_name):
        """Accepts messy name, cleans it and uses information available to make smartest and best match

        :param raw_name: unparsed name handed to MovieParser
        :return: movie dict from best_match, or None
        """
        from flexget.utils.titles.movie import MovieParser
        parser = MovieParser()
        parser.data = raw_name
        parser.parse()
        name = parser.name
        year = parser.year
        # also covers a None name, which would otherwise crash in search()
        if not name:
            log.critical('Failed to parse name from %s' % raw_name)
            return None
        log.debug('smart_match name=%s year=%s' % (name, str(year)))
        return self.best_match(name, year)

    def best_match(self, name, year=None):
        """Return single movie that best matches name criteria or None

        :param name: movie name to search for
        :param year: when given, hits that carry a different year are dropped
        :return: movie dict as produced by search(), or None when nothing
            remains or the two best hits are too close to call
        """
        movies = self.search(name)

        if not movies:
            log.debug('search did not return any movies')
            return None

        # build a filtered list rather than removing from the one being walked
        remaining = []
        for movie in movies:
            if year and movie.get('year') and movie['year'] != str(year):
                log.debug('best_match removing %s - %s (wrong year: %s)' %
                          (movie['name'], movie['url'], str(movie['year'])))
            elif movie['match'] < self.min_match:
                log.debug('best_match removing %s (min_match)' % movie['name'])
            elif movie.get('type', None) in self.ignore_types:
                log.debug('best_match removing %s (ignored type)' % movie['name'])
            else:
                remaining.append(movie)

        if not remaining:
            log.debug('FAILURE: no movies remain')
            return None

        if len(remaining) == 1:
            log.debug('SUCCESS: only one movie remains')
            return remaining[0]

        # search() returns hits sorted by match, best first
        diff = remaining[0]['match'] - remaining[1]['match']
        if diff < self.min_diff:
            log.debug('unable to determine correct movie, min_diff too small (`%s` <-?-> `%s`)' %
                      (remaining[0], remaining[1]))
            for m in remaining:
                log.debug('remain: %s (match: %s) %s' % (m['name'], m['match'], m['url']))
            return None
        return remaining[0]

    def _similarity(self, candidate, wanted_title):
        """Return difflib ratio of title-cased candidate vs. wanted_title, treating spaces as junk."""
        return difflib.SequenceMatcher(lambda x: x == ' ', candidate.title(), wanted_title).ratio()

    def search(self, name):
        """Return array of movie details (dict)

        Each dict has the keys name, url, imdb_id and match, plus year and type
        when they could be read from the page. The list is sorted by match,
        best first. When IMDb redirects the search straight to a title page, a
        single entry with match 1.0 and year None is returned.

        :param name: movie name to search for
        """
        log.debug('Searching: %s' % name)
        try:
            query = name.encode('latin1')
        except UnicodeError:
            log.warning('Problems with encoding %s, string possibly corrupted? Ignoring troublesome characters.' % name)
            query = name.encode('latin1', 'ignore')
        url = u'http://www.imdb.com/find?' + urllib.urlencode({'q': query, 's': 'all'})

        log.debug('Serch query: %s' % repr(url))
        page = urllib2.urlopen(url)
        try:
            actual_url = page.geturl()

            re_m = re.match(r'.*\.imdb\.com/title/tt\d+/', actual_url)
            if re_m:
                actual_url = re_m.group(0)
                log.debug('Perfect hit. Search got redirected to %s' % actual_url)
                return [{
                    'match': 1.0,
                    'name': name,
                    'url': actual_url,
                    'imdb_id': extract_id(actual_url),
                    'year': None,  # skips year check
                }]

            soup = get_soup(page)
        finally:
            # release the HTTP connection on every path
            page.close()

        movies = []
        wanted_title = name.title()
        title_link = re.compile(r'/title/tt')
        sections = ['Popular Titles', 'Titles (Exact Matches)',
                    'Titles (Partial Matches)', 'Titles (Approx Matches)']

        for section in sections:
            section_tag = soup.find('b', text=section)
            if not section_tag:
                log.debug('section %s not found' % section)
                continue
            log.debug('processing section %s' % section)
            try:
                section_table = section_tag.parent.parent.nextSibling
                # inside the try: the sibling may be missing or plain text
                links = section_table.findAll('a', attrs={'href': title_link})
            except AttributeError:
                log.debug('Section %s does not have a table?' % section)
                continue

            if not links:
                log.debug('section %s does not have links' % section)
            for count, link in enumerate(links):
                # links directly inside a div are not treated as results
                if link.parent.name == u'div':
                    continue

                # skip links whose only child is not text
                if len(link.contents) == 1 and not isinstance(link.contents[0], NavigableString):
                    continue

                movie = {}
                # parenthesised details following the link: year first, then type
                additional = re.findall(r'\((.*?)\)', link.next.next)
                if len(additional) > 0:
                    # strip non numbers ie. 2008/I
                    movie['year'] = u''.join(c for c in additional[0] if c.isdigit())
                if len(additional) > 1:
                    movie['type'] = additional[1]

                movie['name'] = unicode(link.contents[0])
                movie['url'] = 'http://www.imdb.com' + link.get('href')
                movie['imdb_id'] = extract_id(movie['url'])
                log.debug('processing name: %s url: %s' % (movie['name'], movie['url']))

                ratio = self._similarity(movie['name'], wanted_title)

                if movie.get('type') == 'TV':
                    log.debug('deprioritize tv')
                    ratio = ratio * self.tv_weight

                for aka_tag in link.parent.findAll('p', attrs={'class': 'find-aka'}):
                    aka = aka_tag.next.string
                    # an aka without text is reported as invalid instead of crashing re.search
                    match = re.search(r'".*"', aka or u'')
                    if not match:
                        log.debug('aka `%s` is invalid' % aka)
                        continue
                    aka = match.group(0).replace('"', '')
                    # NOTE(review): trace() is not part of stdlib logging; presumably
                    # added by FlexGet's logging setup
                    log.trace('processing aka %s' % aka)
                    aka_ratio = self._similarity(aka, wanted_title)
                    # NOTE(review): the unweighted aka ratio is compared, so the weighted
                    # result can be lower than the ratio it replaces - confirm intent
                    if aka_ratio > ratio:
                        ratio = aka_ratio * self.aka_weight
                        log.debug('- aka `%s` matches better to `%s` ratio %s (weighted to %s)' %
                                  (aka, name, aka_ratio, ratio))

                if section != sections[0]:
                    ratio = ratio * self.unpopular_weight
                else:
                    log.debug('- priorizing popular %s' % movie['url'])

                # NOTE(review): enumerate is zero-based and also counts the links skipped
                # above, so index 1 is the second matching link - confirm this is the
                # intended "first hit"
                if count == 1:
                    log.debug('- prioritizing first hit `%s`' % movie['url'])
                    ratio = ratio * self.first_weight

                movie['match'] = ratio
                movies.append(movie)

        movies.sort(key=lambda x: x['match'], reverse=True)
        return movies
222
class ImdbParser(object):
    """Quick-hack to parse relevant imdb details

    After parse() the details found on the title page are available as
    attributes; anything not found keeps the default set in __init__.
    """

    def __init__(self):
        self.genres = []
        self.languages = []
        # actors and directors map IMDb name ID -> display name (or None)
        self.actors = {}
        self.directors = {}
        self.score = 0.0
        self.votes = 0
        self.year = 0
        self.plot_outline = None
        self.name = None
        self.url = None
        self.imdb_id = None
        self.photo = None
        self.mpaa_rating = ''

    def __str__(self):
        return '<ImdbParser(name=%s,imdb_id=%s)>' % (self.name, self.imdb_id)

    def _collect_people(self, links, people):
        """Store IMDb name ID -> display name in people for every given <a> tag."""
        for link in links:
            person_id = extract_id(link['href'])
            if person_id is None:
                # a None key would break the ', '.join() used when logging the result
                continue
            first = link.contents[0] if link.contents else None
            if first is None or isinstance(first, Tag):
                # link without text: record the person, keep a name already found
                people.setdefault(person_id, None)
            else:
                people[person_id] = unicode(first)

    def parse(self, imdb_id):
        """Fetch the IMDb title page and fill this object's attributes from it.

        :param imdb_id: IMDb ID, or any string extract_id() can find one in
        :raises ValueError: when urlopen rejects the generated url
        """
        self.imdb_id = extract_id(imdb_id)
        url = make_url(self.imdb_id)
        self.url = url
        try:
            page = urllib2.urlopen(url)
        except ValueError:
            raise ValueError('Invalid url %s' % url)

        try:
            soup = get_soup(page)
        finally:
            # release the HTTP connection even when parsing fails
            page.close()

        # photo
        tag_photo = soup.find('div', attrs={'class': 'photo'})
        if tag_photo:
            tag_img = tag_photo.find('img')
            if tag_img:
                self.photo = tag_img.get('src')
                log.debug('Detected photo: %s' % self.photo)

        # mpaa rating
        tag_infobar_div = soup.find('div', attrs={'class': 'infobar'})
        if tag_infobar_div:
            tag_mpaa_rating = tag_infobar_div.find('img', attrs={'class': 'absmiddle'})
            if tag_mpaa_rating:
                # the image is only trusted when its alt and title attributes agree
                if tag_mpaa_rating['alt'] != tag_mpaa_rating['title']:
                    log.warning("MPAA rating alt and title don't match for URL %s - plugin needs an update?" % url)
                else:
                    self.mpaa_rating = tag_mpaa_rating['alt']
                    log.debug('Detected mpaa rating: %s' % self.mpaa_rating)
            else:
                log.debug('Unable to match signature of mpaa rating for %s - could be a TV episode, or plugin needs update?' % url)
        else:
            log.warning('Unable to get infodiv class for %s - plugin needs update?' % url)

        # name
        tag_name = soup.find('h1')
        if tag_name:
            if tag_name.next:
                if tag_name.next.string is not None:
                    self.name = tag_name.next.string.strip()
                    log.debug('Detected name: %s' % self.name)
        else:
            log.warning('Unable to get name for %s - plugin needs update?' % url)

        # votes and score
        rating_ineligible = soup.find('div', attrs={'class': 'rating-ineligible'})
        if rating_ineligible:
            log.debug('movie is not eligible for ratings')
        else:
            tag_votes = soup.find(itemprop='ratingCount')
            # keep digits only; a tag without text or digits counts as "not found"
            str_votes = ''
            if tag_votes:
                str_votes = ''.join(c for c in (tag_votes.string or '') if c.isdigit())
            if str_votes:
                self.votes = int(str_votes)
                log.debug('Detected votes: %s' % self.votes)
            else:
                log.warning('Unable to get votes for %s - plugin needs update?' % url)

            span_score = soup.find(itemprop='ratingValue')
            if span_score:
                try:
                    self.score = float(span_score.string)
                except (TypeError, ValueError):
                    # TypeError: the tag has no single string (float(None))
                    log.debug('tag_score %s is not valid float' % span_score.string)
                else:
                    log.debug('Detected score: %s' % self.score)
            else:
                log.warning('Unable to get score for %s - plugin needs update?' % url)

        # genres
        for link in soup.findAll('a', attrs={'itemprop': 'genre'}):
            self.genres.append(unicode(link.contents[0].lower()))

        # languages
        for link in soup.findAll('a', attrs={'href': re.compile(r'^/language/')}):
            # strip before the duplicate check so the stored values are what gets compared
            lang = unicode(link.contents[0].lower()).strip()
            if lang not in self.languages:
                self.languages.append(lang)

        # year
        tag_year = soup.find('a', attrs={'href': re.compile(r'^/year/\d+')})
        if tag_year:
            self.year = int(tag_year.contents[0])
            log.debug('Detected year: %s' % self.year)
        else:
            # fall back to a '(Video 2008)' / '(TV 2008)' style span
            tag_year = soup.find('span', text=re.compile(r'^\((?:Video|TV) \d+\)'))
            if tag_year:
                m = re.search(r'(\d{4})', unicode(tag_year))
                if m:
                    self.year = int(m.group())
                    log.debug('Detected year: %s' % self.year)
                else:
                    log.warning('Unable to get year for %s (regexp mismatch) - plugin needs update?' % url)
            else:
                log.warning('Unable to get year for %s (tag not found) - plugin needs update?' % url)

        # actors
        person_link = re.compile(r'/name/nm')
        tag_cast = soup.find('table', 'cast_list')
        if tag_cast:
            self._collect_people(tag_cast.findAll('a', href=person_link), self.actors)

        # directors
        h4_director = soup.find('h4', text=re.compile(r'Director'))
        if h4_director:
            self._collect_people(h4_director.parent.parent.findAll('a', href=person_link), self.directors)

        log.debug('Detected genres: %s' % self.genres)
        log.debug('Detected languages: %s' % self.languages)
        log.debug('Detected director(s): %s' % ', '.join(self.directors))
        log.debug('Detected actors: %s' % ', '.join(self.actors))

        # plot
        h2_plot = soup.find('h2', text='Storyline')
        if h2_plot:
            p_plot = h2_plot.findNext('p')
            if p_plot:
                plot_node = p_plot.next
                plot_text = plot_node.string if plot_node is not None else None
                if plot_text is not None:
                    self.plot_outline = plot_text.strip()
                    log.debug('Detected plot outline: %s' % self.plot_outline)
                else:
                    log.debug('Plot p-tag does not start with text')
            else:
                log.debug('Plot does not have p-tag')
        else:
            log.debug('Failed to find plot')