flexget.utils.titles.series
Covered: 328 lines
Missed: 0 lines
Skipped 106 lines
Percent: 100 %
  1
import logging
  2
import re
  3
from flexget.utils.titles.parser import TitleParser, ParseWarning
  4
from flexget.utils import qualities
  5
from flexget.utils.tools import ReList
  7
# Dedicated logger for the series parser.
log = logging.getLogger('seriesparser')
# The level is pinned to INFO at import time, so the many log.debug() calls in
# this module are suppressed unless this line is changed to logging.DEBUG.
log.setLevel(logging.INFO)
 14
class SeriesParser(TitleParser):
    """
    Parse a release title for one known series.

    Usage as implemented by the methods of this class: construct the parser
    with the series ``name``, call ``parse()`` with the title ``data`` and then
    inspect ``valid``, ``season`` / ``episode`` / ``end_episode`` or ``id``,
    ``quality``, ``proper_count``, ``special`` and ``group``.

    All regexps below are written as raw strings so that their backslash
    escapes reach the regexp engine untouched and do not trigger
    invalid-escape warnings in newer interpreters.
    """

    # Characters accepted between the numeric parts of the date-like id regexps.
    separators = '[!/+,:;|~ x-]'
    # Roman numeral fragment interpolated into the ep and id regexps below.
    roman_numeral_re = 'X{0,3}(?:IX|XI{0,4}|VI{0,4}|IV|V|I{1,4})'

    # Built-in season/episode patterns, tried in order by parse_episode().
    # Groups are (season, episode[, end episode]) or just (episode).
    ep_regexps = ReList([TitleParser.re_not_in_word(regexp) for regexp in [
        r'(?:series|season|s)\s?(\d{1,3})(?:\s(?:.*\s)?)?(?:episode|ep|e|part|pt)\s?(\d{1,3}|%s)(?:\s?e?(\d{1,2}))?' %
            roman_numeral_re,
        r'(?:series|season)\s?(\d{1,3})\s(\d{1,3})\s?of\s?(?:\d{1,3})',
        r'(\d{1,2})\s?x\s?(\d+)(?:\s(\d{1,2}))?',
        r'(\d{1,3})\s?of\s?(?:\d{1,3})',
        r'(?:episode|ep|part|pt)\s?(\d{1,3}|%s)' % roman_numeral_re]])
    # Titles matching any of these make parse_unwanted() report a hit.
    unwanted_ep_regexps = ReList([
         r'(\d{1,3})\s?x\s?(0+)[^1-9]', # 5x0
         r'S(\d{1,3})D(\d{1,3})', # S3D1
         r'(\d{1,3})\s?x\s?(all)', # 1xAll
         r'season(?:s)?\s?\d\s?(?:&\s?\d)?[\s-]*(?:complete|full)',
         r'seasons\s(\d\s){2,}',
         r'disc\s\d'])
    # Built-in id patterns, tried in order by parse() when no episode is found.
    id_regexps = ReList([TitleParser.re_not_in_word(regexp) for regexp in [
        r'(\d{4})%s(\d+)%s(\d+)' % (separators, separators),
        r'(\d+)%s(\d+)%s(\d{4})' % (separators, separators),
        r'(\d{4})x(\d+)\.(\d+)',
        r'(pt|part)\s?(\d+|%s)' % roman_numeral_re,
        r'(\d{1,3})(?:v(?P<version>\d))?']])
    # Titles matching any of these make parse_unwanted_id() report a hit.
    unwanted_id_regexps = ReList([
        r'seasons?\s?\d{1,2}'])
    clean_regexps = ReList([r'\[.*?\]', r'\(.*?\)'])
    # Optional prefixes that name_to_re() allows in front of the series name.
    ignore_prefixes = [
            r'(?:\[[^\[\]]*\])', # ignores group names before the name, eg [foobar] name
            '(?:HD.720p?:)',
            '(?:HD.1080p?:)']
 59
    def __init__(self, name='', identified_by='auto', name_regexps=None, ep_regexps=None, id_regexps=None,
                 strict_name=False, allow_groups=None, allow_seasonless=True):
        """Set up a parser for a single series.

        :param string name: Name of the series the parser is going to look for.
        :param string identified_by: Expected numbering scheme. ``'ep'`` sets ``expect_ep``,
            ``'id'`` sets ``expect_id``; any other value (default ``'auto'``) sets neither.
        :param list name_regexps: Regexps used to match the name. When empty or None the
            regexp is generated from ``name`` on the first call to ``parse()``.
        :param list ep_regexps: Extra episode regexps, tried before the built-in ones.
            When given, id regexps are disabled for this instance.
        :param list id_regexps: Extra id regexps, tried before the built-in ones. Only
            honoured when ``ep_regexps`` is empty; episode regexps are then disabled.
        :param boolean strict_name: If True the identifier must follow the name immediately.
        :param list allow_groups: Optional list of accepted release group names.
            When non-empty, ``parse()`` also populates the ``group`` attribute.
        :param boolean allow_seasonless: If True, an episode match without a season
            is treated as season 1.
        """
        self.name = name
        self.data = ''
        self.expect_ep, self.expect_id = identified_by == 'ep', identified_by == 'id'
        self.name_regexps = ReList(name_regexps if name_regexps else [])
        self.re_from_name = False

        # Custom regexps are put in front of the class-level defaults; supplying
        # one kind switches the other kind off on this instance. When neither is
        # given, the class attributes remain in effect.
        if ep_regexps:
            self.ep_regexps = ReList(ep_regexps + SeriesParser.ep_regexps)
            self.id_regexps = []
        elif id_regexps:
            self.id_regexps = ReList(id_regexps + SeriesParser.id_regexps)
            self.ep_regexps = []

        self.strict_name = strict_name
        self.allow_groups = allow_groups if allow_groups else []
        self.allow_seasonless = allow_seasonless

        self.field = None
        self._reset()
 93
    def _reset(self):
        """Clear every per-parse result attribute back to its initial value."""
        # episode style identification
        self.season = self.episode = self.end_episode = None
        # id style identification
        self.id = self.id_groups = None
        # release properties
        self.quality = qualities.UNKNOWN
        self.proper_count = 0
        self.special = False
        self.group = None
        # nothing has been parsed successfully yet
        self.valid = False
109
    def __setattr__(self, name, value):
110
        """
111
        Some conversions when setting attributes.
112
        `self.name` and `self.data` are converted to unicode.
113
        """
114
        if name == 'name' or name == 'data':
115
            if isinstance(value, str):
116
                value = unicode(value)
117
            elif not isinstance(value, unicode):
118
                raise Exception('%s cannot be %s' % (name, repr(value)))
119
        object.__setattr__(self, name, value)
121
    def remove_dirt(self, data):
        """Collapse runs of ``_ . , [ ] ( ) :`` and spaces into one space, then trim and lowercase."""
        spaced = re.sub(r'[_.,\[\]\(\): ]+', ' ', data)
        trimmed = spaced.strip()
        return trimmed.lower()
125
    def name_to_re(self, name):
126
        """Convert 'foo bar' to '^[^...]*foo[^...]*bar[^...]+"""
129
        blank = r'[\W_]'
130
        ignore = '(?:' + '|'.join(self.ignore_prefixes) + ')?'
132
        name = name.replace('&', '(?:and|&)')
133
        res = re.sub(re.compile(blank + '+', re.UNICODE), ' ', name)
134
        res = res.strip()
136
        res = res.replace(' and ', ' (?:and|&) ')
137
        res = re.sub(' +', blank + '*', res, re.UNICODE)
138
        res = '^' + ignore + blank + '*' + '(' + res + ')' + blank + '+'
139
        return res
141
    def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
        """
        Parse a title and store the outcome on the instance.

        The method always returns None; success is signalled through
        ``self.valid``. On success either ``season`` / ``episode``
        (/ ``end_episode``) or ``id`` (/ ``id_groups``) is populated, along with
        ``quality``, ``proper_count``, ``special`` and ``group``.

        ``self.valid`` stays False when: the title matches an unwanted regexp,
        the name does not match, ``allow_groups`` is set and no listed group is
        found, ``strict_name`` is set and the identifier does not follow the
        name directly, an episode is found while an id is expected, or the
        episode range covers too many episodes.

        :param data: Title to parse; when falsy the previously set ``self.data`` is used.
        :param field: Stored as ``self.field``.
        :param quality: Known quality; when falsy or UNKNOWN it is detected from the title.
        :raises Exception: If name or data is empty, or both expect flags are set.
        :raises ParseWarning: If the name matches but no episode or id numbering is found
            and the title is not a special.
        """
        self._reset()
        self.field = field
        self.quality = quality
        if data:
            self.data = data
        if not self.name or not self.data:
            raise Exception('SeriesParser initialization error, name: %s data: %s' %
                            (repr(self.name), repr(self.data)))

        if self.expect_ep and self.expect_id:
            raise Exception('Flags expect_ep and expect_id are mutually exclusive')

        name = self.remove_dirt(self.name)

        # Abort early when the cleaned title looks like a pack, disc, etc.
        if self.parse_unwanted(self.remove_dirt(self.data)):
            return

        log.debug('name: %s data: %s' % (name, self.data))

        # Span of the series name inside self.data.
        name_start = 0
        name_end = 0

        # Generate the name regexp from the name when none was supplied.
        if not self.name_regexps:
            self.name_regexps = ReList([self.name_to_re(name)])
            self.re_from_name = True

        for name_re in self.name_regexps:
            match = re.search(name_re, self.data)
            if match:
                if self.re_from_name:
                    # Generated regexps capture the bare name in group 1.
                    name_start, name_end = match.span(1)
                else:
                    name_start, name_end = match.span()

                log.debug('NAME SUCCESS: %s matched to %s' % (name_re.pattern, self.data))
                break
        else:
            log.debug('FAIL: name regexps %s do not match %s' % ([regexp.pattern for regexp in self.name_regexps],
                                                                 self.data))
            return

        # Remove the name: text after it comes first, text before it is appended.
        # From here on all match offsets are relative to data_stripped.
        data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
        data_stripped = data_stripped.lower()
        log.debug('data stripped: %s' % data_stripped)

        # Release group restriction.
        if self.allow_groups:
            for group in self.allow_groups:
                group = group.lower()
                for fmt in ['[%s]', '-%s']:
                    if fmt % group in data_stripped:
                        log.debug('%s is from group %s' % (self.data, group))
                        self.group = group
                        data_stripped = data_stripped.replace(fmt % group, '')
                        break
                if self.group:
                    break
            else:
                log.debug('%s is not from groups %s' % (self.data, self.allow_groups))
                return # leave invalid

        # Detect quality from the title unless the caller supplied one.
        if not quality or quality == qualities.UNKNOWN:
            log.debug('parsing quality ->')
            quality, remaining = qualities.quality_match(data_stripped)
            self.quality = quality
            if remaining:
                # Continue with the quality text removed.
                log.debug('quality detected, using remaining data `%s`' % remaining)
                data_stripped = remaining

        # Drop words that would confuse the ep / id regexps. list() keeps the
        # concatenation working when keys() does not return a list.
        data_stripped = self.remove_words(data_stripped, self.remove + list(qualities.registry.keys()) +
                                                         self.codecs + self.sounds, not_in_word=True)

        data_parts = re.split(r'[\W_]+', data_stripped)

        # Iterate over a copy because matching parts are removed from the list.
        for part in data_parts[:]:
            if part in self.propers:
                self.proper_count += 1
                data_parts.remove(part)
            elif part in self.specials:
                self.special = True
                data_parts.remove(part)

        data_stripped = ' '.join(data_parts).strip()

        log.debug("data for id/ep parsing '%s'" % data_stripped)

        ep_match = self.parse_episode(data_stripped)
        if ep_match:
            # strict_name: the identifier has to start right after the name.
            if self.strict_name:
                if ep_match['match'].start() > 1:
                    return

            if self.expect_id:
                log.debug('found episode number, but expecting id, aborting!')
                return

            # end_episode is None for single episodes, so guard the comparison.
            if ep_match['end_episode'] is not None and ep_match['end_episode'] > ep_match['episode'] + 2:
                log.debug('Series pack contains too many episodes (%d). Rejecting' %
                          (ep_match['end_episode'] - ep_match['episode']))
                return

            self.season = ep_match['season']
            self.episode = ep_match['episode']
            self.end_episode = ep_match['end_episode']
            self.valid = True
            return

        log.debug('-> no luck with ep_regexps')

        if self.expect_ep:
            # Last resort for ep mode: a bare 3-4 digit number such as 102 or 0102,
            # read as season followed by a two digit episode.
            log.debug('expect_ep enabled')
            match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'), data_stripped, re.IGNORECASE | re.UNICODE)
            if match:
                if self.strict_name:
                    if match.start() > 1:
                        return

                self.season = int(match.group(1))
                self.episode = int(match.group(2))
                log.debug(self)
                self.valid = True
                return
            log.debug('-> no luck with the expect_ep')
        else:
            if self.parse_unwanted_id(data_stripped):
                return
            for id_re in self.id_regexps:
                match = re.search(id_re, data_stripped)
                if match:
                    # strict_name: the match offset is relative to data_stripped,
                    # from which the name is already removed, so it is compared
                    # directly, exactly like the ep branches above.
                    if self.strict_name:
                        if match.start() > 1:
                            return
                    if 'version' in match.groupdict():
                        # Versioned ids (e.g. 12v2): the version becomes the proper count.
                        if match.group('version'):
                            self.proper_count = int(match.group('version')) - 1
                        self.id = match.group(1)
                    else:
                        self.id = '-'.join(match.groups())
                    self.id_groups = match.groups()
                    if self.special:
                        self.id += '-SPECIAL'
                    self.valid = True
                    log.debug("found id '%s' with regexp '%s'" % (self.id, id_re.pattern))
                    return
            log.debug('-> no luck with id_regexps')

        # No numbering found: a special is identified by the remaining text.
        if self.special:
            self.id = data_stripped
            self.valid = True
            log.debug("found special, setting id to '%s'" % self.id)
            return

        raise ParseWarning("Title '%s' looks like series '%s' but I cannot find any episode or id numbering" % (self.data, self.name))
320
    def parse_unwanted(self, data):
        """
        Check data against the unwanted episode regexps.

        Returns True on the first hit; falls through (returning None) when
        nothing matches.
        """
        for unwanted_re in self.unwanted_ep_regexps:
            hit = re.search(unwanted_re, data)
            if not hit:
                continue
            log.debug('unwanted regexp %s matched %s' % (unwanted_re.pattern, hit.groups()))
            return True
328
    def parse_unwanted_id(self, data):
        """
        Parses data for an unwanted id hits.

        Returns True if the data matches any of ``self.unwanted_id_regexps``;
        returns None otherwise.
        """
        for id_unwanted_re in self.unwanted_id_regexps:
            match = re.search(id_unwanted_re, data)
            if match:
                # Log the pattern source, as the other regexp log lines in this class do.
                log.debug('unwanted id regexp %s matched %s' % (id_unwanted_re.pattern, match.groups()))
                return True
336
    def parse_episode(self, data):
        """
        Look for an episode identifier in :data:.

        The regexps in ``self.ep_regexps`` are tried in order and the first one
        that matches decides the outcome.

        Returns a dict with the keys ``season``, ``episode``, ``end_episode``
        and ``match`` (the regexp match object), or False when nothing matches
        or when the match has no season and ``allow_seasonless`` is off.
        """
        for ep_re in self.ep_regexps:
            found = re.search(ep_re, data)
            if not found:
                continue

            groups = found.groups()
            log.debug('found episode number with regexp %s (%s)' % (ep_re.pattern, groups))

            if len(groups) >= 2:
                season, episode = groups[0], groups[1]
            elif self.allow_seasonless:
                # A lone episode number is filed under season 1.
                season, episode = 1, groups[0]
            else:
                return False

            try:
                season = int(season)
                if episode.isdigit():
                    episode = int(episode)
                else:
                    # Non-numeric episodes are roman numerals.
                    episode = self.roman_to_int(episode)
            except ValueError:
                log.critical('Invalid episode number match %s returned with regexp `%s`' % (found.groups(), ep_re.pattern))
                raise

            end_episode = None
            if len(groups) == 3 and groups[2]:
                last = int(groups[2])
                # Only accept an end that is after the start and at most 10 episodes away.
                if episode < last <= episode + 10:
                    end_episode = last

            return {'season': season,
                    'episode': episode,
                    'end_episode': end_episode,
                    'match': found}
        return False
384
    def roman_to_int(self, roman):
        """
        Convert a roman numeral built from X, V and I (case-insensitive) to an int.

        Raises ValueError when any other character is present.
        """
        numeral = roman.upper()

        if any(symbol not in 'XVI' for symbol in numeral):
            raise ValueError('`%s` is not a valid roman numeral' % numeral)

        total = 0
        position = 0
        # Tokens are consumed greedily from the left, in this fixed order.
        for token, value in (('X', 10), ('IX', 9), ('V', 5), ('IV', 4), ('I', 1)):
            width = len(token)
            while numeral[position:position + width] == token:
                total += value
                position += width
        return total
402
    @property
403
    def identifier(self):
404
        """Return String identifier for parsed episode, eg. S01E02"""
405
        if not self.valid:
406
            raise Exception('Series flagged invalid')
407
        if isinstance(self.season, int) and isinstance(self.episode, int):
408
            return 'S%sE%s' % (str(self.season).zfill(2), str(self.episode).zfill(2))
409
        elif self.id is None:
410
            raise Exception('Series is missing identifier')
411
        else:
412
            return self.id
414
    @property
415
    def proper(self):
416
        return self.proper_count > 0
418
    def __str__(self):
421
        valid = 'INVALID'
422
        if self.valid:
423
            valid = 'OK'
424
        return '<SeriesParser(data=%s,name=%s,id=%s,season=%s,episode=%s,quality=%s,proper=%s,status=%s)>' % \
425
            (self.data, self.name, str(self.id), self.season, self.episode, \
426
             self.quality, self.proper_count, valid)
428
    def __cmp__(self, other):
        """Order parsers by quality first and by proper_count when the quality is equal."""
        mine = (self.quality, self.proper_count)
        theirs = (other.quality, other.proper_count)
        return cmp(mine, theirs)
432
    def __eq__(self, other):
433
        return self is other