#! /usr/bin/env python
# -*- coding: utf-8 -*-


"""
Usage: pmatch-collect-defines.py [options] [Pmatch file] > Pmatch file

Collect and number marked named regular expression definitions in Pmatch files

For more information, run pmatch-collect-defines.py --help

Author: Jyrki Niemi (jyrki.niemi@helsinki.fi) 2013
"""


import sys
import codecs
import re

from optparse import OptionParser
from collections import defaultdict


class PmatchDefineProcessor(object):

    """Number and collect marked ``Define`` names in Pmatch source text.

    The processor reads Pmatch source line by line, rewrites the names
    of marked ``Define`` lines and writes every line to standard output.
    The markers are taken from the ``opts`` object given to the
    constructor, which must provide the attributes
    ``number_placeholder``, ``number_format`` (a ``str.format`` template
    taking the running number as argument 0), ``collect_marker``,
    ``collected_placeholder`` and ``add_top``.
    """

    # Types treated as a file name by process_input. ``basestring`` only
    # exists on Python 2; fall back to ``str`` elsewhere.
    try:
        _STRING_TYPES = (basestring,)  # noqa: F821
    except NameError:
        _STRING_TYPES = (str,)

    # Splits a definition line into (text up to the name, name, rest).
    # The rest may be empty, e.g. on a final line without a newline.
    _DEFINE_RE = re.compile(r'(\s*Define\s+)(\S+)(.*)$', re.DOTALL)

    def __init__(self, opts):
        """Store the option values controlling the processing."""
        self._opts = opts

    def process_input(self, files):
        """Process one input or a list of inputs.

        ``files`` may be a list (each item is processed in turn), a file
        name (opened as UTF-8 text), or any iterable of text lines such
        as an open file.
        """
        if isinstance(files, list):
            for fname in files:
                self.process_input(fname)
        elif isinstance(files, self._STRING_TYPES):
            with codecs.open(files, 'r', encoding='utf-8') as f:
                self._process_single_input(f)
        else:
            self._process_single_input(files)

    def _process_single_input(self, file_):
        """Rewrite the lines of ``file_`` and write them to stdout.

        Running numbers and the list of collected names start afresh
        for each input.
        """
        define_basename_counts = defaultdict(int)
        define_names = []
        for line in file_:
            line_stripped = line.strip()
            if line_stripped.startswith('Define '):
                line = self._process_define_line(line, define_basename_counts,
                                                 define_names)
            # Lines beginning with "!" are written unchanged, so that a
            # placeholder in them does not consume the collected names.
            if (not line_stripped.startswith('!')
                    and self._opts.collected_placeholder in line):
                line = self._add_collected(line, define_names)
            sys.stdout.write(line)
        if self._opts.add_top:
            # Only names not already consumed by a collected-names
            # placeholder are left in define_names at this point.
            sys.stdout.write(
                '\nDefine TOP [ ' + ' | '.join(define_names) + ' ] ;\n')

    def _process_define_line(self, line, define_basename_counts, define_names):
        """Return the definition ``line`` with its name markers resolved.

        A name containing the number placeholder gets the placeholder
        replaced with the next running number for that name; otherwise a
        name beginning with the collect marker has the marker removed.
        In both cases the resulting name is appended to
        ``define_names``. ``define_basename_counts`` maps each
        placeholder name to the count used so far and is updated in
        place. Any other line is returned unchanged.
        """
        mo = self._DEFINE_RE.match(line)
        if mo is None:
            # Not recognizable as a definition after all; pass through.
            return line
        (line_begin, defname, line_end) = mo.groups()
        opts = self._opts
        if opts.number_placeholder in defname:
            define_basename_counts[defname] += 1
            number = opts.number_format.format(define_basename_counts[defname])
            defname = defname.replace(opts.number_placeholder, number)
            define_names.append(defname)
        elif defname.startswith(opts.collect_marker):
            defname = defname[len(opts.collect_marker):]
            define_names.append(defname)
        return line_begin + defname + line_end

    def _add_collected(self, line, define_names):
        """Replace the collected-names placeholder in ``line``.

        The placeholder is replaced with the names in ``define_names``
        joined with " | ". The list is then emptied in place, so the
        next placeholder only covers names collected after this one.
        """
        result = line.replace(self._opts.collected_placeholder,
                              ' | '.join(define_names))
        del define_names[:]
        return result


def getopts():
    """Parse the command-line options and arguments.

    Returns a tuple ``(opts, args)`` as returned by
    ``OptionParser.parse_args``, except that ``opts.number_format`` has
    been converted from the number of digits to a ``str.format``
    template producing a zero-padded number of that width (for example
    ``'{0:03d}'``).

    An invalid option value makes the option parser print an error
    message and exit.
    """
    usage = """%prog [options] [Pmatch file] > Pmatch file
Collect and number marked named regular expression definitions in Pmatch files

Adds numbering to specified regular expression names in Pmatch source files and
collects the numbered and other specified names to a disjunction that can be
used as TOP expression. This is useful in particular for expressions generated
by scripts or macros.

By default, @ in the name of a regular expression definition is replaced with
a running number, and a name prefixed with + is also added to the collected
disjunction. @@ is replaced with the disjunction of collected names since the
preceding @@ or the beginning of input. The markers can be changed with
options.

Reads Pmatch input files specified on the command line or standard input if
none specified. Writes to standard output."""
    optparser = OptionParser(usage=usage)
    optparser.add_option(
        '--number-placeholder', default='@',
        help=('replace TEXT in defined names with a running number'
              ' (default: %default)'), metavar='TEXT')
    # Parsed as an integer so that a malformed value is reported as a
    # usage error here instead of failing later when formatting a name.
    optparser.add_option(
        '--number-format', type='int', default=3,
        help='use NUM digits for numbers in defined names (default: %default)',
        metavar='NUM')
    optparser.add_option(
        '--collect-marker', default='+',
        help=('additionally collect defined names prefixed with MARKER'
              ' (default: %default)'),
        metavar='MARKER')
    optparser.add_option(
        '--collected-placeholder', default='@@',
        help=('replace TEXT with a disjunction of the defined names collected'
              ' since the previous TEXT (default: %default)'), metavar='TEXT')
    optparser.add_option(
        '--add-top', action='store_true',
        help=('add an explicit definition of "TOP" with a disjunction of the'
              ' collected names'))
    (opts, args) = optparser.parse_args()
    if opts.number_format < 0:
        optparser.error('option --number-format: NUM must not be negative')
    # Convert the digit count to the format template used for numbering.
    opts.number_format = '{0:0' + str(opts.number_format) + 'd}'
    return (opts, args)


def main():
    """Run the tool: parse the command line and process the input.

    The standard streams are wrapped to read and write UTF-8. The input
    files named on the command line are processed, or standard input if
    none are given.
    """
    # On Python 3 the standard streams are text streams whose underlying
    # byte stream is available as ``.buffer``; the codecs wrappers need
    # the byte stream. On Python 2 there is no ``.buffer`` and the
    # streams themselves are byte streams.
    sys.stdin = codecs.getreader('utf-8')(
        getattr(sys.stdin, 'buffer', sys.stdin))
    sys.stdout = codecs.getwriter('utf-8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    sys.stderr = codecs.getwriter('utf-8')(
        getattr(sys.stderr, 'buffer', sys.stderr))
    (opts, args) = getopts()
    proc = PmatchDefineProcessor(opts)
    proc.process_input(args or sys.stdin)


# Run the tool only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
