flexget.plugins.filter_regexp
Covered: 151 lines
Missed: 1 lines
Skipped 49 lines
Percent: 99 %
  1
import logging
  2
import re
  3
from flexget.plugin import register_plugin, priority, get_plugin_by_name
  5
# Module-level logger shared by every method of this plugin; messages are
# emitted under the logger name 'regexp'.
log = logging.getLogger('regexp')
  8
class FilterRegexp(object):
    """
    Accept or reject feed entries by matching regular expressions against
    their fields.

    All possible forms.

    regexp:
      [operation]:           # operation to perform on matches
        - [regexp]           # simple regexp
        - [regexp]: <path>   # override path
        - [regexp]:
            [path]: <path>   # override path
            [set]:           # values handed to the set plugin on match
              <key>: <value>
            [not]: <regexp>  # not match
            [from]: <field>  # search from given entry field
        - [regexp]:
            [path]: <path>   # override path
            [not]:           # list of not match regexps
              - <regexp>
            [from]:          # search only from these fields
              - <field>
      [operation]:
        - <regexp>
      [rest]: <operation>    # accept or reject, applied to the entries
                             # that none of the operations acted on
      [from]:                # search only from these fields for all regexps
        - <field>            # (a single field name is accepted as well)

    Possible operations: accept, reject, accept_excluding, reject_excluding
    """

    def validator(self):
        """Build and return the validator describing the accepted config."""
        from flexget import validator

        def build_list(regexps):
            """Register every accepted shape of a list item on :regexps:."""
            # - <regexp>
            regexps.accept('regexp')

            # Dict shaped items are keyed by the regexp itself.
            bundle = regexps.accept('dict')
            # - <regexp>: <path>
            bundle.accept_valid_keys('path', allow_replacement=True, key_type='regexp')
            # - <regexp>: {path / set / not / from}
            advanced = bundle.accept_valid_keys('dict', key_type='regexp')
            advanced.accept('path', key='path', allow_replacement=True)
            # Named set_options (not `set`) so the builtin is not shadowed.
            set_options = advanced.accept('dict', key='set')
            set_options.accept_any_key('any')
            # `not` and `from` may each be a single value ...
            advanced.accept('regexp', key='not')
            advanced.accept('text', key='from')
            # ... or a list of values.
            advanced.accept('list', key='not').accept('regexp')
            advanced.accept('list', key='from').accept('text')

        conf = validator.factory('dict')
        for operation in ['accept', 'reject', 'accept_excluding', 'reject_excluding']:
            regexps = conf.accept('list', key=operation)
            build_list(regexps)
        conf.accept('choice', key='rest').accept_choices(['accept', 'reject'])
        # The global `from` takes the same two shapes as the per-regexp one;
        # get_config() already copes with both a string and a list.
        conf.accept('text', key='from')
        conf.accept('list', key='from').accept('text')
        return conf

    def get_config(self, feed):
        """
        Returns the config in standard format

        All regexps are turned into dictionaries in the form {regexp: options}
            options is a dict that can (but may not) contain the following keys
                path: will be attached to entries that match
                set: a dict of values to be attached to entries that match via set plugin
                from: a list of fields in entry for the regexps to match against
                not: a list of regexps that if matching, will disqualify the main match

        The returned dict maps each configured operation to a list of such
        single-key dictionaries and carries the 'rest' value over unchanged.
        The option dicts are copies, so feed.config itself is not modified.
        """
        config = feed.config.get('regexp', {})
        out_config = {}
        if 'rest' in config:
            out_config['rest'] = config['rest']
        global_from = config.get('from')

        for operation, regexps in config.iteritems():
            # These two keys are settings, not operations.
            if operation in ('rest', 'from'):
                continue
            for regexp_item in regexps:
                if isinstance(regexp_item, dict):
                    regexp, opts = regexp_item.items()[0]
                else:
                    # Simple form: a bare regexp without any options
                    regexp, opts = regexp_item, {}

                if isinstance(opts, dict):
                    # Work on a copy, the normalisation below must not leak
                    # back into the user's configuration.
                    opts = dict(opts)
                else:
                    # Short form `regexp: <path>`
                    opts = {'path': opts}

                # The global `from` only applies when the regexp has none.
                if global_from:
                    opts.setdefault('from', global_from)

                # Single values are wrapped so consumers always get lists.
                for key in ('from', 'not'):
                    if isinstance(opts.get(key), basestring):
                        opts[key] = [opts[key]]

                regexp = unicode(regexp)
                out_config.setdefault(operation, []).append({regexp: opts})
        return out_config

    @priority(172)
    def on_feed_filter(self, feed):
        """
        Run every configured operation against the feed, then apply the
        optional `rest` operation to the entries no operation acted on.
        """
        config = self.get_config(feed)
        # None means "no operation processed yet". An empty list is a
        # legitimate result (every entry was handled) and must not be
        # mistaken for the initial state, otherwise the intersection would
        # be restarted from the next operation's leftovers.
        rest = None
        for operation, regexps in config.iteritems():
            if operation == 'rest':
                continue
            leftovers = self.filter(feed, operation, regexps)
            if rest is None:
                rest = leftovers
            else:
                # Keep only the entries that were left over by every operation
                rest = [entry for entry in leftovers if entry in rest]

        if 'rest' in config:
            rest_method = feed.accept if config['rest'] == 'accept' else feed.reject
            for entry in rest or []:
                log.debug('Rest method %s for %s', config['rest'], entry['title'])
                rest_method(entry, 'regexp `rest`', remember=True)

    def matches(self, entry, regexp, find_from=None, not_regexps=None):
        """Check if :entry: has any string fields or strings in a list field that match :regexp:.

        Optional :find_from: can be given as a list to limit searching fields.
        Optional :not_regexps: is a list of regexps; a match is discarded when
        any of them also matches the field the match was found in.

        Returns the name of the first matching field, or None.
        """
        import urllib

        # Values of these fields are url-unquoted before matching
        unquote = ['url']
        for field in find_from or entry:
            if not entry.get(field):
                continue
            values = entry[field]
            if not isinstance(values, list):
                values = [values]
            for value in values:
                if not isinstance(value, basestring):
                    continue
                if field in unquote:
                    value = urllib.unquote(value)
                if not re.search(regexp, value, re.IGNORECASE | re.UNICODE):
                    continue
                # The match only counts when no not_regexp matches this field
                if not any(self.matches(entry, not_regexp, find_from=[field])
                           for not_regexp in not_regexps or []):
                    return field
        return None

    def filter(self, feed, operation, regexps):
        """
            operation - one of 'accept' 'reject' 'accept_excluding' and 'reject_excluding'
                accept and reject will be called on the entry if any of the regexps match
                _excluding operations will be called if any of the regexps don't match
            regexps - list of {regexp: options} dictionaries

            Return list of entries that the operation was not applied to
        """
        rest = []
        method = feed.accept if 'accept' in operation else feed.reject
        # True: act on a match, False (the _excluding operations): act on a miss
        match_mode = 'excluding' not in operation
        for entry in feed.entries:
            for regexp_opts in regexps:
                regexp, opts = regexp_opts.items()[0]
                field = self.matches(entry, regexp, opts.get('from'), opts.get('not'))
                if match_mode != bool(field):
                    continue

                if match_mode:
                    matchtext = 'regexp \'%s\' matched field \'%s\'' % (regexp, field)
                else:
                    matchtext = 'regexp \'%s\' didn\'t match' % regexp
                log.debug('%s for %s', matchtext, entry['title'])

                if opts.get('path'):
                    entry['path'] = opts['path']
                if opts.get('set'):
                    log.debug('adding set: info to entry:"%s" %s', entry['title'], opts['set'])
                    # Named set_plugin (not `set`) so the builtin is not shadowed.
                    set_plugin = get_plugin_by_name('set')
                    set_plugin.instance.modify(entry, opts['set'])
                method(entry, matchtext, remember=True)
                # The first regexp that triggers decides the entry's fate
                break
            else:
                # No regexp triggered the operation for this entry
                rest.append(entry)
        return rest
200
# Hand the class to FlexGet's plugin registry. 'regexp' is presumably the
# plugin name; it matches the 'regexp' config key read in get_config().
register_plugin(FilterRegexp, 'regexp')