#! /usr/bin/env python
# -*- coding: utf-8 -*-

"""
Convert named entity tags between the XML-style tags with attributes
produced in the original Flex+Perl implementation of the Swedish NER
system and the tags produced by the Pmatch implementation, encoding
all information in element names.

Usage: convert-namex-tags.py [options] [input files] > output

Author: Jyrki Niemi (jyrki.niemi@helsinki.fi) 2013
"""

import sys
import codecs
import re

from optparse import OptionParser


class TagConverter(object):

    """
    Convert named-entity tags between two notations, one line at a time.

    The "attributes" notation carries the codes in XML-style attributes,
    e.g. ``<ENAMEX TYPE="PRS" SBT="HUM">...</ENAMEX>``, optionally with
    an ``ANI="..."`` attribute.  The "plaintags" notation packs the same
    codes, title-cased, into the element name, e.g.
    ``<EnamexPrsHum>...</EnamexPrsHum>``.  Alternative analyses are
    separated by ``/`` in the attribute values and written as
    consecutive type+subtype pairs in the plain tag name; an ANI code,
    if any, comes last in the plain tag name.

    The plain tag name is split by position, so every type, subtype and
    ANI code is taken to be exactly three letters long.
    """

    # basestring exists only in Python 2; fall back to str elsewhere.
    try:
        _STRING_TYPES = basestring
    except NameError:
        _STRING_TYPES = str

    # Length of a single code and of a type+subtype pair in a tag name.
    _CODE_LEN = 3
    _PAIR_LEN = 2 * _CODE_LEN

    _EXCEPT_TAG_RE = re.compile(r'</?Except>')

    # Verbose mode: the whitespace inside the patterns is not matched.
    _PLAIN_TAG_RE = re.compile(
        r"""(?xi)
            <(?P<maintag>(?:Ena|Nu|Ti)mex) (?P<subtag>[A-Za-z]+)>
            (?P<content>.*?)
            </(?P=maintag)[A-Za-z]*>""")
    _ATTR_TAG_RE = re.compile(
        r"""(?x)
            <(?P<tag>(?:ENA|NU|TI)MEX)
            \sTYPE="(?P<type>[A-Za-z/]+)"
            \sSBT="(?P<sbt>[A-Za-z/]+)"
            (?:\sANI="(?P<ani>[A-Za-z]+)")?>
            (?P<content>.*?)
            </(?P=tag)>""")

    def __init__(self, opts):
        """
        Initialize the converter.

        `opts` must have the attributes `target` ('plaintags' selects
        attributes -> plain tags; any other value selects plain tags ->
        attributes) and `remove_except_tags` (if true, <Except> and
        </Except> tags are stripped from the output).
        """
        self._opts = opts
        self._convert_fn = (self._attributes_to_plain_tags
                            if self._opts.target == 'plaintags'
                            else self._plain_tags_to_attributes)

    def process_input(self, files):
        """
        Convert `files` and write the result to sys.stdout.

        `files` may be a list (each element is processed recursively),
        a string (a file name, opened as UTF-8) or any other object,
        which is iterated over directly as a sequence of lines.
        """
        if isinstance(files, list):
            for fname in files:
                self.process_input(fname)
        elif isinstance(files, self._STRING_TYPES):
            with codecs.open(files, 'r', encoding='utf-8') as f:
                self._process_single_input(f)
        else:
            self._process_single_input(files)

    def _process_single_input(self, file_):
        """Convert each line of `file_` and write it to sys.stdout."""
        remove_except_tags = self._opts.remove_except_tags
        for line in file_:
            line = self._convert_fn(line)
            if remove_except_tags:
                line = self._remove_except_tags(line)
            sys.stdout.write(line)

    def _remove_except_tags(self, line):
        """Return `line` with all <Except> and </Except> tags removed."""
        return self._EXCEPT_TAG_RE.sub('', line)

    def _plain_tags_to_attributes(self, line):
        """Return `line` with plain tags rewritten as attribute tags."""
        return self._PLAIN_TAG_RE.sub(self._make_tags_attributes, line)

    def _make_tags_attributes(self, matchobj):
        """Build the attribute-style element for one plain-tag match."""
        maintag = matchobj.group('maintag').upper()
        attrs = self._subtag_to_attributes(matchobj.group('subtag'))
        return ('<' + maintag + ' ' + attrs + '>' + matchobj.group('content')
                + '</' + maintag + '>')

    def _subtag_to_attributes(self, subtag):
        """
        Return the attribute string for the code part of a plain tag.

        `subtag` consists of one or more type+subtype pairs, optionally
        followed by an ANI code.  A trailing ANI code is recognized when
        one code is left over after at least one complete pair, which
        also covers ambiguous tags (several pairs) carrying an ANI code.
        """
        ani = None
        if (len(subtag) > self._PAIR_LEN
                and len(subtag) % self._PAIR_LEN == self._CODE_LEN):
            ani = subtag[-self._CODE_LEN:]
            subtag = subtag[:-self._CODE_LEN]
        alts = self._split_by_length(subtag, self._PAIR_LEN)
        types = self._make_ambiguous_type(alts, 0, self._CODE_LEN)
        subtypes = self._make_ambiguous_type(
            alts, self._CODE_LEN, self._PAIR_LEN)
        return self._make_attributes(types, subtypes, ani)

    def _split_by_length(self, s, partlen):
        """
        Split `s` into consecutive pieces of `partlen` characters.

        The last piece is shorter if len(s) is not a multiple of
        `partlen`.
        """
        return [s[start:start + partlen]
                for start in range(0, len(s), partlen)]

    def _make_ambiguous_type(self, altlist, elem_begin, elem_end):
        """Join the slice [elem_begin:elem_end] of each alternative by '/'."""
        return '/'.join([alt[elem_begin:elem_end] for alt in altlist])

    def _make_attributes(self, type_, subtype, ani=None):
        """
        Return the attributes TYPE and SBT, and ANI if `ani` is non-empty.

        All values are converted to upper case.
        """
        return ('TYPE="' + type_.upper() + '" SBT="' + subtype.upper() + '"'
                + ((' ANI="' + ani.upper() + '"') if ani else ''))

    def _attributes_to_plain_tags(self, line):
        """Return `line` with attribute tags rewritten as plain tags."""
        return self._ATTR_TAG_RE.sub(self._make_plain_tags, line)

    def _make_plain_tags(self, matchobj):
        """Build the plain-tag element for one attribute-tag match."""
        tagname = (matchobj.group('tag').title()
                   + self._make_tag_type(matchobj.group('type'),
                                         matchobj.group('sbt'),
                                         matchobj.group('ani')))
        return ('<' + tagname + '>' + matchobj.group('content')
                + '</' + tagname + '>')

    def _make_tag_type(self, type_, subtype, ani):
        """
        Return the code part of a plain tag name.

        The '/'-separated alternatives of `type_` and `subtype` are
        paired up in order (surplus alternatives on either side are
        dropped by zip), title-cased and concatenated, followed by the
        title-cased `ani` if it is non-empty.
        """
        types = self._titlecase_list(type_.split('/'))
        subtypes = self._titlecase_list(subtype.split('/'))
        return (''.join([elem for tuple_ in zip(types, subtypes)
                         for elem in tuple_])
                + (ani.title() if ani else ''))

    def _titlecase_list(self, lst):
        """Return a list with each string in `lst` title-cased."""
        return [elem.title() for elem in lst]


def getopts():
    """
    Parse the command line (sys.argv) with optparse.

    Return the pair (options, positional arguments).  The options
    object has the attributes `target` ('attributes' by default, or
    'plaintags') and `remove_except_tags` (True if the option was
    given, otherwise None).
    """
    usage_text = (
        'Usage: %prog [options] [input files] > output\n'
        'Convert named-entity tags between ones with XML-style attributes'
        ' and ones\n'
        'encoding all information in the element name, as produced by'
        ' Pmatch.')
    target_help = (
        'convert the tags in the input to the output type TYPE, '
        'where TYPE is either "attributes" (XML-style attributes in the '
        'start tag; the default) or "plaintags" (all information encoded '
        'in the element name)')
    # Each entry: (option flags, keyword arguments for add_option).
    option_specs = [
        (('--target',),
         dict(type='choice',
              choices=['attributes', 'plaintags'],
              default='attributes',
              help=target_help,
              metavar='TYPE')),
        (('--remove-except-tags',),
         dict(action='store_true',
              help='remove <Except> and </Except> tags in the input')),
    ]
    parser = OptionParser(usage=usage_text)
    for flags, settings in option_specs:
        parser.add_option(*flags, **settings)
    return parser.parse_args()


def main():
    """
    Run the converter as a command-line program.

    Wrap the standard streams so that they read and write UTF-8, parse
    the command line and convert the named input files, or standard
    input if no file names are given.
    """
    # The codecs readers/writers need byte streams.  In Python 3 the
    # byte stream behind each text stream is its .buffer attribute; the
    # Python 2 streams have no such attribute and are wrapped directly.
    sys.stdin = codecs.getreader('utf-8')(
        getattr(sys.stdin, 'buffer', sys.stdin))
    sys.stdout = codecs.getwriter('utf-8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    sys.stderr = codecs.getwriter('utf-8')(
        getattr(sys.stderr, 'buffer', sys.stderr))
    (opts, args) = getopts()
    converter = TagConverter(opts)
    converter.process_input(args or sys.stdin)


# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
