#! /usr/bin/env python
# -*- coding: utf-8 -*-


"""
Usage: pmatch-add-ins.py [options] < pmatch_rule_file.in > pmatch_rule_file.out
       pmatch-add-ins.py [options] pmatch_rule_file.in ... > pmatch_rule_file.out

Adds Ins() to Pmatch definitions.

TODO: Real parsing of Pmatch definitions.

Author: Jyrki Niemi (jyrki.niemi@helsinki.fi) 2013–2014
"""


import sys
import codecs
import re

from optparse import OptionParser


class InsAdder(object):

    """Add Ins() around names in Pmatch Define statements.

    The input is read line by line. Define statements are collected
    into groups delimited by blank lines and comment lines (lines
    beginning with "!"); everything is written to sys.stdout.

    The options object passed to the constructor must have the
    following attributes:

    - initial_only: if true, only add Ins to a name directly
      following "Define NAME" (and possible non-word characters)
    - unconditional_ins_regexp: regular expression matching the names
      to which to add Ins regardless of what follows the name (apart
      from the fixed lookahead built in __init__), or a false value
      to disable
    - conditional_ins_regexp: list of strings of the form
      "NAME_REGEXP:CONTEXT_SPEC" (see _make_context_regex)
    - containing_define_name_regexp: regular expression for the names
      of the definitions to which Ins may be added
    - ignore_subtractions: if false, Ins is not added to any
      definition in a group in which some definition contains "-"
    - max_ins_per_define: maximum number of Ins to retain in a single
      definition, or a false value for no limit
    """

    def __init__(self, opts):
        """Compile the Ins-adding regular expressions based on opts."""
        self._opts = opts
        # FIXME: The patterns should not match and add Ins inside
        # string literals.
        if self._opts.initial_only:
            # FIXME: This does not take into account names that occur
            # as a non-first item in an initial disjunction
            prefix_re = r'(?P<prefix>\s*Define\s+(?:\w|%.)+\s+[^\w"]*)'
        else:
            # An empty group, so that the replacement string can
            # always refer to \g<prefix>
            prefix_re = r'(?P<prefix>)'
        if self._opts.unconditional_ins_regexp:
            self._uncond_ins_re = re.compile(
                prefix_re
                + r'\b(?P<name>' + self._opts.unconditional_ins_regexp
                + r')\b(?=\s*([|\]]|\s(N?[LR]C|EndTag)\())',
                re.VERBOSE | re.UNICODE)
        else:
            self._uncond_ins_re = None
        self._make_cond_regexes(prefix_re)

    def _make_cond_regexes(self, prefix_re=''):
        """Compile the conditional Ins regexes to self._cond_ins_regexes.

        Each item of the option conditional_ins_regexp is split at
        the first colon to a name regexp and a context specification.
        An item without a colon raises a ValueError.
        """
        self._cond_ins_regexes = []
        for ins_spec in self._opts.conditional_ins_regexp:
            name_re, context_spec = ins_spec.split(':', 1)
            self._cond_ins_regexes.append(
                re.compile(prefix_re + r'\b(?P<name>' + name_re + r')\b'
                           + self._make_context_regex(context_spec),
                           re.VERBOSE | re.UNICODE))

    def _make_context_regex(self, context_spec):
        """Return a lookahead regex (as a string) for context_spec.

        context_spec consists of alternatives separated by "|". If it
        begins with "!", the result is a negative lookahead,
        otherwise a positive one. The result is intended to be
        compiled with re.VERBOSE.
        """

        def make_context_alt(alt):
            # An alternative beginning with a double quote is taken
            # literally: the leading quote is backslash-escaped, the
            # trailing quotes are stripped and spaces are escaped, as
            # they would otherwise be ignored in a verbose regex.
            # Other alternatives are used as regular expressions as
            # such.
            return (('\\' + alt.rstrip('"').replace(' ', '\\ '))
                    if alt[0] == '"' else alt)

        context_type = '='
        if context_spec[0] == '!':
            context_type = '!'
            context_spec = context_spec[1:]
        # FIXME: This does not work correctly if an alternative
        # contains a |
        context_alts = ('(?:' + '|'.join(make_context_alt(alt)
                                         for alt in context_spec.split('|'))
                        + ')')
        return (r"""(?{context_type}
                      \s*
                      (?: \| (?: [^\[\]] | \[ [^\[\]]+ \] )* \] )?
                      (?: \] \s* )*
                      \s*
                      (?: {context_alts}
                        | \[ \s* {context_alts}
                             (?: [^\[\]|] | \[ [^\[\]]+ \] )*
                          \] [^\^+*] )
                    )
                 """.format(context_type=context_type,
                            context_alts=context_alts))

    def process_input(self, infiles):
        """Process infiles, a single input or a list of inputs.

        Each input is either a file name or an iterable of lines
        (such as an open file).
        """
        if isinstance(infiles, list):
            for infile in infiles:
                self._process_file(infile)
        else:
            self._process_file(infiles)

    def _process_file(self, infile):
        """Process infile, a file name or an iterable of lines.

        A file name is opened for reading as UTF-8.
        """
        try:
            string_types = basestring
        except NameError:
            # No basestring (Python 3): file names are str
            string_types = str
        if isinstance(infile, string_types):
            with codecs.open(infile, 'r', encoding='utf-8') as inf:
                self._process_input(inf)
        else:
            self._process_input(infile)

    def _process_input(self, infile):
        """Read lines from infile and write the result to sys.stdout.

        Consecutive Define statements are collected to a group that
        is output (with Ins possibly added) at the next blank or
        comment line outside a definition or at the end of input.
        """
        # Loop-invariant, so compile only once per input
        containing_define_re = re.compile(
            r'Define\s+' + self._opts.containing_define_name_regexp + r'\s')
        collect_def = False
        def_group = []
        # The indices in def_group of the definitions whose name
        # matches containing_define_name_regexp
        add_def_nums = set()
        def_text = ''
        for line in infile:
            if not collect_def:
                stripped = line.strip()
                if not stripped or stripped.startswith('!'):
                    # A blank or comment line ends a definition group
                    if def_group:
                        self._output_define_group(def_group, add_def_nums)
                        def_group = []
                        add_def_nums = set()
                    sys.stdout.write(line)
                    continue
                elif re.search(r'Define\s+', stripped):
                    collect_def = True
                    if containing_define_re.match(line):
                        add_def_nums.add(len(def_group))
                else:
                    # NOTE(review): This line is written immediately,
                    # so it precedes in the output the definitions of
                    # the current group that are still buffered in
                    # def_group.
                    sys.stdout.write(line)
            if collect_def:
                def_text += line
                # Ignore the possible weight following the semicolon
                if line.rstrip(' \n0123456789.').endswith(';'):
                    collect_def = False
                    def_group.append(def_text)
                    def_text = ''
        if def_text:
            # The input ended within a definition lacking the
            # terminating semicolon: output it instead of silently
            # dropping it
            def_group.append(def_text)
        if def_group:
            self._output_define_group(def_group, add_def_nums)

    def _output_define_group(self, def_group, add_def_nums):
        """Write the definitions in def_group to sys.stdout.

        Ins is added to the definitions whose index is in
        add_def_nums, unless subtractions are taken into account and
        some definition in the group contains one, in which case all
        the definitions are output unmodified.
        """
        contains_difference = (
            not self._opts.ignore_subtractions
            and any(self._define_contains_difference(def_text)
                    for def_text in def_group))
        for (defnum, def_text) in enumerate(def_group):
            if not contains_difference and defnum in add_def_nums:
                sys.stdout.write(self._add_ins(def_text))
            else:
                sys.stdout.write(def_text)

    def _define_contains_difference(self, def_text):
        """Return True if def_text contains "-" as an operator.

        Hyphens in comment lines, in double-quoted strings and in the
        escaped form %- are disregarded.
        """
        # The flags need to be passed as a keyword argument: the
        # fourth positional argument of re.sub is count, so a
        # positional re.MULTILINE would leave comment lines within a
        # multi-line definition in place.
        def_text = re.sub(r'^\s*!.*$', '', def_text, flags=re.MULTILINE)
        def_text = re.sub(r'"([^\"\\]|\\\")*"|%[-\"]', '', def_text)
        return ('-' in def_text)

    def _add_ins(self, def_text):
        """Return def_text with Ins added according to the options.

        The conditional Ins are added first, then the unconditional
        ones. If the total number of additions exceeds
        max_ins_per_define, the extra Ins are removed.
        """
        # FIXME: This might not work correctly with in-definition
        # comments
        (def_text, add_count) = self._add_ins_conditional(def_text)
        if self._uncond_ins_re:
            (def_text, add_count2) = self._add_ins_unconditional(def_text)
            add_count += add_count2
        if (self._opts.max_ins_per_define
            and add_count > self._opts.max_ins_per_define):
            def_text = self._remove_extra_ins(def_text)
        return def_text

    def _add_ins_unconditional(self, def_text):
        """Return the pair (modified def_text, number of Ins added)."""
        return self._uncond_ins_re.subn(r'\g<prefix>Ins(\g<name>)', def_text)

    def _add_ins_conditional(self, def_text):
        """Return the pair (modified def_text, number of Ins added).

        All the conditional regexes are applied in order, each to the
        result of the previous one.
        """
        total_add_count = 0
        for regex in self._cond_ins_regexes:
            def_text, add_count = regex.subn(r'\g<prefix>Ins(\g<name>)',
                                             def_text)
            total_add_count += add_count
        return (def_text, total_add_count)

    def _remove_extra_ins(self, def_text):
        """Return def_text with only the first Ins(...) retained.

        The first max_ins_per_define occurrences of Ins(...) are
        kept; each of the rest is replaced with its argument.
        """
        max_ins = self._opts.max_ins_per_define
        # A one-element list works as a counter that the inner
        # function can modify without "nonlocal" and without storing
        # state in the instance.
        ins_count = [0]

        def remove_ins(matchobj):
            ins_count[0] += 1
            if ins_count[0] <= max_ins:
                return matchobj.group()
            return matchobj.group(1)

        return re.sub(r'(?<=\W)Ins\(([^)]+)\)', remove_ins, def_text)


def getopts():
    """Parse the command line and return the pair (options, arguments).

    The options are registered from a table of (option strings,
    keyword arguments) pairs, in the order in which they are listed.
    """
    option_specs = [
        (('--containing-define-name-regexp',), dict(default='TOP')),
        (('--unconditional-ins-regexp',),
         dict(default=r'(?:[A-Za-z0-9]|%.)+')),
        # Example values: Word:" " Num:!"[0-9]"|Num
        (('--conditional-ins-regexp',), dict(action='append', default=[])),
        (('--max-ins-per-define',), dict(type='int')),
        (('--initial-only',), dict(action='store_true')),
        (('--ignore-subtractions', '--ignore-difference-ops'),
         dict(action='store_true')),
    ]
    parser = OptionParser()
    for option_strings, settings in option_specs:
        parser.add_option(*option_strings, **settings)
    options, arguments = parser.parse_args()
    return (options, arguments)


def main():
    """Run the script: add Ins to the input and write to stdout.

    The standard output and input streams are wrapped in UTF-8 codec
    writer and reader. The input is read from the files named as
    command-line arguments or, if there are none, from standard
    input.
    """
    utf8_writer = codecs.getwriter('utf-8')
    utf8_reader = codecs.getreader('utf-8')
    sys.stdout = utf8_writer(sys.stdout)
    sys.stdin = utf8_reader(sys.stdin)
    options, filenames = getopts()
    InsAdder(options).process_input(filenames or sys.stdin)

# Run main() only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
