3
from flexget.utils.titles.parser import TitleParser, ParseWarning
4
from flexget.utils import qualities
5
from flexget.utils.tools import ReList
7
log = logging.getLogger('seriesparser')
10
# switch to logging.DEBUG if you want to debug this class (produces quite a bit of info ...)
11
log.setLevel(logging.INFO)
14
class SeriesParser(TitleParser):
21
:expect_ep: expect series to be in season, ep format (ep_regexps)
22
:expect_id: expect series to be in id format (id_regexps)
25
separators = '[!/+,:;|~ x-]'
26
roman_numeral_re = 'X{0,3}(?:IX|XI{0,4}|VI{0,4}|IV|V|I{1,4})'
28
# Make sure none of these are found embedded within a word or other numbers
29
ep_regexps = ReList([TitleParser.re_not_in_word(regexp) for regexp in [
30
'(?:series|season|s)\s?(\d{1,3})(?:\s(?:.*\s)?)?(?:episode|ep|e|part|pt)\s?(\d{1,3}|%s)(?:\s?e?(\d{1,2}))?' %
32
'(?:series|season)\s?(\d{1,3})\s(\d{1,3})\s?of\s?(?:\d{1,3})',
33
'(\d{1,2})\s?x\s?(\d+)(?:\s(\d{1,2}))?',
34
'(\d{1,3})\s?of\s?(?:\d{1,3})',
35
'(?:episode|ep|part|pt)\s?(\d{1,3}|%s)' % roman_numeral_re]])
36
unwanted_ep_regexps = ReList([
37
'(\d{1,3})\s?x\s?(0+)[^1-9]', # 5x0
38
'S(\d{1,3})D(\d{1,3})', # S3D1
39
'(\d{1,3})\s?x\s?(all)', # 1xAll
40
'season(?:s)?\s?\d\s?(?:&\s?\d)?[\s-]*(?:complete|full)',
41
'seasons\s(\d\s){2,}',
43
# Make sure none of these are found embedded within a word or other numbers
44
id_regexps = ReList([TitleParser.re_not_in_word(regexp) for regexp in [
45
'(\d{4})%s(\d+)%s(\d+)' % (separators, separators),
46
'(\d+)%s(\d+)%s(\d{4})' % (separators, separators),
47
'(\d{4})x(\d+)\.(\d+)',
48
'(pt|part)\s?(\d+|%s)' % roman_numeral_re,
49
'(\d{1,3})(?:v(?P<version>\d))?']])
50
unwanted_id_regexps = ReList([
51
'seasons?\s?\d{1,2}'])
52
clean_regexps = ReList(['\[.*?\]', '\(.*?\)'])
53
# ignore prefix regexps must be passive groups with 0 or 1 occurrences eg. (?:prefix)?
55
'(?:\[[^\[\]]*\])', # ignores group names before the name, eg [foobar] name
59
def __init__(self, name='', identified_by='auto', name_regexps=None, ep_regexps=None, id_regexps=None,
60
strict_name=False, allow_groups=None, allow_seasonless=True):
63
:param string name: Name of the series parser is going to try to parse.
65
:param string identified_by: What kind of episode numbering scheme is expected, valid values are ep, id and auto (default).
66
:param list name_regexps: Regexps for name matching or None (default), by default regexp is generated from name.
67
:param list ep_regexps: Regexps detecting episode,season format. Given list is prioritized over built-in regexps.
68
:param list id_regexps: Regexps detecting id format. Given list is prioritized over built in regexps.
69
:param boolean strict_name: If True, name must be immediately followed by episode identifier.
70
:param list allow_groups: Optionally specify list of release group names that are allowed.
71
This will also populate attribute `group`.
76
self.expect_ep = identified_by == 'ep'
77
self.expect_id = identified_by == 'id'
78
self.name_regexps = ReList(name_regexps or [])
79
self.re_from_name = False
81
self.ep_regexps = ReList(ep_regexps + SeriesParser.ep_regexps)
84
self.id_regexps = ReList(id_regexps + SeriesParser.id_regexps)
86
self.strict_name = strict_name
87
self.allow_groups = allow_groups or []
88
self.allow_seasonless = allow_seasonless
94
# parse produces these
97
self.end_episode = None
100
self.quality = qualities.UNKNOWN
101
self.proper_count = 0
103
# TODO: group is only produced with allow_groups
106
# false if item does not match series
109
def __setattr__(self, name, value):
    """
    Store an attribute, coercing `self.name` and `self.data` to unicode.

    For the attributes `name` and `data` a `str` value is converted with
    `unicode()`; any other value that is not already unicode raises an
    `Exception`. Every other attribute is stored unchanged.
    """
    needs_unicode = name in ('name', 'data')
    if needs_unicode and isinstance(value, str):
        value = unicode(value)
    elif needs_unicode and not isinstance(value, unicode):
        raise Exception('%s cannot be %s' % (name, repr(value)))
    # Delegate the actual storage to the default implementation
    object.__setattr__(self, name, value)
121
def remove_dirt(self, data):
    """
    Normalise `data` by replacing separator characters with spaces.

    Each run of underscores, dots, commas, square brackets, parentheses,
    colons and spaces becomes a single space; the result is then stripped of
    surrounding whitespace and lower-cased.
    """
    spaced = re.sub(r'[_.,\[\]\(\): ]+', ' ', data)
    trimmed = spaced.strip()
    return trimmed.lower()
125
def name_to_re(self, name):
126
"""Convert 'foo bar' to '^[^...]*foo[^...]*bar[^...]+"""
127
# TODO: Still doesn't handle the case where the user wants
128
# "Schmost" and the feed contains "Schmost at Sea".
130
ignore = '(?:' + '|'.join(self.ignore_prefixes) + ')?'
131
# accept either '&' or 'and'
132
name = name.replace('&', '(?:and|&)')
133
res = re.sub(re.compile(blank + '+', re.UNICODE), ' ', name)
135
# check for 'and' surrounded by spaces so it is not replaced within a word or from above replacement
136
res = res.replace(' and ', ' (?:and|&) ')
137
res = re.sub(' +', blank + '*', res, re.UNICODE)
138
res = '^' + ignore + blank + '*' + '(' + res + ')' + blank + '+'
141
def parse(self, data=None, field=None, quality=qualities.UNKNOWN):
142
# Clear the output variables before parsing
145
self.quality = quality
148
if not self.name or not self.data:
149
raise Exception('SeriesParser initialization error, name: %s data: %s' % \
150
(repr(self.name), repr(self.data)))
152
if self.expect_ep and self.expect_id:
153
raise Exception('Flags expect_ep and expect_id are mutually exclusive')
155
name = self.remove_dirt(self.name)
157
# check if data appears to be unwanted (abort)
158
if self.parse_unwanted(self.remove_dirt(self.data)):
161
log.debug('name: %s data: %s' % (name, self.data))
167
# regexp name matching
168
if not self.name_regexps:
169
# if we don't have name_regexps, generate one from the name
170
self.name_regexps = ReList([self.name_to_re(name)])
171
self.re_from_name = True
172
# try all specified regexps on this data
173
for name_re in self.name_regexps:
174
match = re.search(name_re, self.data)
176
if self.re_from_name:
177
name_start, name_end = match.span(1)
179
name_start, name_end = match.span()
181
log.debug('NAME SUCCESS: %s matched to %s' % (name_re.pattern, self.data))
185
log.debug('FAIL: name regexps %s do not match %s' % ([regexp.pattern for regexp in self.name_regexps],
190
# remove series name from raw data, move any prefix to end of string
191
data_stripped = self.data[name_end:] + ' ' + self.data[:name_start]
192
data_stripped = data_stripped.lower()
193
log.debug('data stripped: %s' % data_stripped)
196
if self.allow_groups:
197
for group in self.allow_groups:
198
group = group.lower()
199
for fmt in ['[%s]', '-%s']:
200
if fmt % group in data_stripped:
201
log.debug('%s is from group %s' % (self.data, group))
203
data_stripped = data_stripped.replace(fmt % group, '')
208
log.debug('%s is not from groups %s' % (self.data, self.allow_groups))
209
return # leave invalid
211
# search tags and quality if one was not provided to parse method
212
if not quality or quality == qualities.UNKNOWN:
213
log.debug('parsing quality ->')
214
quality, remaining = qualities.quality_match(data_stripped)
215
self.quality = quality
217
# Remove quality string from data
218
log.debug('quality detected, using remaining data `%s`' % remaining)
219
data_stripped = remaining
221
# Remove unwanted words (qualities and such) from data for ep / id parsing
222
data_stripped = self.remove_words(data_stripped, self.remove + qualities.registry.keys() +
223
self.codecs + self.sounds, not_in_word=True)
226
data_parts = re.split('[\W_]+', data_stripped)
228
for part in data_parts[:]:
229
if part in self.propers:
230
self.proper_count += 1
231
data_parts.remove(part)
232
elif part in self.specials:
234
data_parts.remove(part)
236
data_stripped = ' '.join(data_parts).strip()
238
log.debug("data for id/ep parsing '%s'" % data_stripped)
240
ep_match = self.parse_episode(data_stripped)
244
if ep_match['match'].start() > 1:
248
log.debug('found episode number, but expecting id, aborting!')
251
if ep_match['end_episode'] > ep_match['episode'] + 2:
252
# This is a pack of too many episodes, ignore it.
253
log.debug('Series pack contains too many episodes (%d). Rejecting' %
254
(ep_match['end_episode'] - ep_match['episode']))
257
self.season = ep_match['season']
258
self.episode = ep_match['episode']
259
self.end_episode = ep_match['end_episode']
263
log.debug('-> no luck with ep_regexps')
265
# search for ids later as last since they contain somewhat broad matches
268
# we should be getting season, ep !
269
# try to look up idiotic numbering scheme 101,102,103,201,202
270
# ressu: Added matching for 0101, 0102... It will fail on
272
log.debug('expect_ep enabled')
273
match = re.search(self.re_not_in_word(r'(0?\d)(\d\d)'), data_stripped, re.IGNORECASE | re.UNICODE)
277
if match.start() > 1:
280
self.season = int(match.group(1))
281
self.episode = int(match.group(2))
285
log.debug('-> no luck with the expect_ep')
287
if self.parse_unwanted_id(data_stripped):
289
for id_re in self.id_regexps:
290
match = re.search(id_re, data_stripped)
294
if match.start() - name_end >= 2:
296
if 'version' in match.groupdict():
297
if match.group('version'):
298
self.proper_count = int(match.group('version')) - 1
299
self.id = match.group(1)
301
self.id = '-'.join(match.groups())
302
self.id_groups = match.groups()
304
self.id += '-SPECIAL'
306
log.debug('found id \'%s\' with regexp \'%s\'' % (self.id, id_re.pattern))
308
log.debug('-> no luck with id_regexps')
310
# No id found, check if this is a special
312
# Attempt to set id as the title of the special
313
self.id = data_stripped
315
log.debug('found special, setting id to \'%s\'' % self.id)
318
raise ParseWarning('Title \'%s\' looks like series \'%s\' but I cannot find any episode or id numbering' % (self.data, self.name))
320
def parse_unwanted(self, data):
    """
    Check `data` against the unwanted episode regexps.

    Returns True as soon as one regexp in `self.unwanted_ep_regexps` matches.
    When nothing matches the method falls off the end and returns None.
    """
    for unwanted in self.unwanted_ep_regexps:
        hit = re.search(unwanted, data)
        if not hit:
            continue
        log.debug('unwanted regexp %s matched %s' % (unwanted.pattern, hit.groups()))
        return True
328
def parse_unwanted_id(self, data):
    """
    Check `data` against the unwanted id regexps.

    :param string data: Text searched with each regexp in `self.unwanted_id_regexps`.
    :return: True as soon as one regexp matches. When nothing matches the
        method falls off the end and returns None.
    """
    for id_unwanted_re in self.unwanted_id_regexps:
        match = re.search(id_unwanted_re, data)
        if match:
            # Log the pattern text, as parse_unwanted does, instead of the
            # compiled regexp object; plain string patterns are logged as-is.
            pattern = getattr(id_unwanted_re, 'pattern', id_unwanted_re)
            log.debug('unwanted id regexp %s matched %s' % (pattern, match.groups()))
            return True
336
def parse_episode(self, data):
338
Parses :data: for an episode identifier.
339
If found, returns a dict with keys for season, episode, end_episode and the regexp match object
340
If no episode id is found returns False
343
# search for season and episode number
344
for ep_re in self.ep_regexps:
345
match = re.search(ep_re, data)
348
log.debug('found episode number with regexp %s (%s)' % (ep_re.pattern, match.groups()))
349
matches = match.groups()
350
if len(matches) >= 2:
353
elif self.allow_seasonless:
354
# assume season 1 if the season was not specified
358
# Return False if we are not allowing seasonless matches and one is found
360
# Convert season and episode to integers
363
if not episode.isdigit():
364
episode = self.roman_to_int(episode)
366
episode = int(episode)
368
log.critical('Invalid episode number match %s returned with regexp `%s`' % (match.groups(), ep_re.pattern))
371
if len(matches) == 3 and matches[2]:
372
end_episode = int(matches[2])
373
if end_episode <= episode or end_episode > episode + 10:
374
# end episode cannot be before start episode
375
# Assume large ranges are not episode packs, ticket #1271 TODO: is this the best way?
377
# Successfully found an identifier, return the results
378
return {'season': season,
380
'end_episode': end_episode,
384
def roman_to_int(self, roman):
    """
    Convert a roman numeral written with X, V and I to an integer.

    :param string roman: Numeral to convert; case is ignored.
    :return: Integer value of the numeral.
    :raises ValueError: If `roman` contains characters other than X, V and I,
        or if those characters cannot be read as one numeral (e.g. `IIX`).
    """
    roman_map = [('X', 10), ('IX', 9), ('V', 5), ('IV', 4), ('I', 1)]
    roman = roman.upper()

    # Reject anything that is not built purely from the supported symbols
    if any(char not in 'XVI' for char in roman):
        raise ValueError('`%s` is not a valid roman numeral' % roman)

    # Add up the parts of the numeral, consuming it from left to right
    i = result = 0
    for numeral, integer in roman_map:
        while roman[i:i + len(numeral)] == numeral:
            result += integer
            i += len(numeral)

    # Unconsumed characters mean the symbols were in an order that cannot be
    # parsed; fail loudly instead of returning a partial sum.
    if i != len(roman):
        raise ValueError('`%s` is not a valid roman numeral' % roman)
    return result
403
def identifier(self):
404
"""Return String identifier for parsed episode, eg. S01E02"""
406
raise Exception('Series flagged invalid')
407
if isinstance(self.season, int) and isinstance(self.episode, int):
408
return 'S%sE%s' % (str(self.season).zfill(2), str(self.episode).zfill(2))
409
elif self.id is None:
410
raise Exception('Series is missing identifier')
416
return self.proper_count > 0
419
# for some fucking reason it's impossible to print self.field here, if someone figures out why please
424
return '<SeriesParser(data=%s,name=%s,id=%s,season=%s,episode=%s,quality=%s,proper=%s,status=%s)>' % \
425
(self.data, self.name, str(self.id), self.season, self.episode, \
426
self.quality, self.proper_count, valid)
428
def __cmp__(self, other):
    """
    Order parsers by quality first and by proper_count second.

    Returns the result of `cmp()` on the two (quality, proper_count) pairs.
    """
    mine = (self.quality, self.proper_count)
    theirs = (other.quality, other.proper_count)
    return cmp(mine, theirs)
432
def __eq__(self, other):