#! /usr/bin/env python
# -*- coding: utf-8 -*-


"""
List the differences between two sets of recognized names extracted by
extract-tagged-names.py.

Usage: names-diff.py [options] name_file1 name_file2 > output

For more information, run names-diff.py --help

Author: Jyrki Niemi (jyrki.niemi@helsinki.fi) 2013
"""


import sys
import codecs
import re
import errno

from optparse import OptionParser
from collections import defaultdict


class NameFileDiffer(object):

    """
    Compare two name files and classify the differences between them.

    Each input line is expected to contain a line number, an offset,
    an optional stage (beginning with "+" or "-"), a type, a name and
    an optional context, as matched by _nameinfo_re. The two files are
    read in parallel; each pair of names is assigned one of the
    difference types listed in _diff_type_labels. Depending on the
    options, the differences and/or a summary are written to the
    output.

    The options object passed to the constructor must have the
    attributes type_labels, show_context, oneline, details, show_types
    and summary.
    """

    # Text types that _open_file and diff treat as file names. Resolved
    # here so that the class works both with Python 2 (basestring) and
    # Python 3 (str).
    try:
        _string_types = (basestring,)
    except NameError:
        _string_types = (str,)

    # Difference type keys and the labels used for them in the output
    _diff_type_labels = {'missing': 'Missing',
                         'added': 'Added',
                         'type_diff': 'Different type',
                         'subtype_diff': 'Different subtype',
                         'pos_diff': 'Different offset',
                         'stage_diff': 'Different stage',
                         'context_bug': 'Pmatch context bug',
                         'longer': 'Longer',
                         'longer+type_diff': 'Longer, diff type',
                         'longer+subtype_diff': 'Longer, diff subtype',
                         'shorter': 'Shorter',
                         'shorter+type_diff': 'Shorter, diff type',
                         'shorter+subtype_diff': 'Shorter, diff subtype',
                         'equal': 'Equal',
                         'other': 'Other'}

    # The format of a single input line, compiled once instead of once
    # for each input line
    _nameinfo_re = re.compile(
        r'\s*(?P<linenr>\d+)\s+(?P<pos>\d+)'
        r'\s+((?P<stage>[-+][^\t\n]+)\t)?(?P<type>.+?)'
        r'\t(?P<name>[^\t\n]+)(\t(?P<context>.*))?')

    def __init__(self, opts):
        """Initialize the differ and its output formats from opts."""
        self._opts = opts
        # Number of differences found by difference type
        self._stats = defaultdict(int)
        # Number of names counted in file 1 and file 2
        self._namecount = [0, 0]
        self._nameinfo_format = ((u'  ' if self._opts.type_labels else '')
                                 + (u'{linenr:7d} {pos:5d}\t{stage_tab}{type}'
                                    u'\t{name}')
                                 + (u'\t{context}' if self._opts.show_context
                                    else ''))
        # Placeholder for the absent name in one-line output
        self._empty_nameinfo = ('\t\t'
                                + ('\t' if self._opts.show_context else ''))
        self._nameinfo_sep = '\t' if self._opts.oneline else '\n'
        self._nameinfo_end = '\n' if self._opts.oneline else '\n\n'
        self._type_label_format = ('{0:20s}\t' if self._opts.oneline
                                   else '{0}:\n')

    @classmethod
    def get_diff_types(cls):
        """Return a list of the keys of the known difference types."""
        return list(cls._diff_type_labels.keys())

    def diff(self, inf1, inf2, outf):
        """
        Compare the names in inf1 and inf2 and write the result to outf.

        Each argument is either a file name, "-" for standard input
        (inf1, inf2) or standard output (outf), or an already open
        file-like object. The statistics and name counts are reset at
        the start of each call, so the summary covers this call only.

        Only files opened here by name are closed on return. Streams
        supplied by the caller and the standard streams are left open
        (the output is flushed), so that they remain usable afterwards.
        """
        self._stats = defaultdict(int)
        self._namecount = [0, 0]
        owned = []
        try:
            streams = []
            for spec, mode in ((inf1, 'r'), (inf2, 'r'), (outf, 'w')):
                stream = self._open_file(spec, mode)
                if isinstance(spec, self._string_types) and spec != '-':
                    owned.append(stream)
                streams.append(stream)
            (in1, in2, out) = streams
            self._diff(in1, in2, out)
            if self._opts.summary:
                self._output_summary(out)
            flush = getattr(out, 'flush', None)
            if flush is not None:
                flush()
        finally:
            for stream in reversed(owned):
                stream.close()

    def _open_file(self, f, mode='r'):
        """
        Return a stream for f.

        If f is the string "-", return sys.stdout for mode "w" and
        sys.stdin otherwise. If f is any other string, open the file
        of that name as UTF-8 with the given mode. Otherwise return f
        itself.
        """
        if isinstance(f, self._string_types):
            if f == '-':
                return sys.stdout if mode == 'w' else sys.stdin
            else:
                return codecs.open(f, mode, encoding='utf-8')
        else:
            return f

    def _diff(self, inf1, inf2, outf):
        """
        Read inf1 and inf2 in parallel and output their differences.

        After each comparison, only the file whose name was reported
        as missing (file 1) or added (file 2) is advanced; for all
        other difference types both files are advanced.
        """
        advance_lines_map = {'missing': [0], 'added': [1], '*': [0, 1]}
        line = [None, None]
        linenum = [0, 0]
        nameinfo = [None, None]
        inf = [inf1, inf2]

        def read_line(num):
            line[num] = inf[num].readline()
            linenum[num] += 1

        def read_lines(nums):
            # Read the next name from each of the files listed in nums,
            # skipping empty lines, comment lines beginning with "#",
            # invalid lines (with a warning) and names whose stage
            # begins with "-".
            for num in nums:
                while True:
                    read_line(num)
                    while (line[num]
                           and (line[num] == '\n'
                                or line[num].startswith('#'))):
                        read_line(num)
                    nameinfo[num] = self._get_nameinfo(line[num])
                    if line[num] and not nameinfo[num]:
                        # Not all file-like objects have a name
                        sys.stderr.write(
                            u'Warning: invalid input line {0:d} in file {1}:'
                            u'\n{2}'
                            .format(linenum[num],
                                    getattr(inf[num], 'name', '<unknown>'),
                                    line[num]))
                    elif (not line[num]
                          or not nameinfo[num]['stage'].startswith('-')):
                        break
                # An empty string from readline means end of file
                if line[num]:
                    self._namecount[num] += 1

        read_lines([0, 1])
        while line[0] or line[1]:
            diff_type = self._check_diff(*nameinfo)
            advance_lines_indices = advance_lines_map.get(
                diff_type, advance_lines_map['*'])
            self._output_diff(
                outf, diff_type,
                [(ind, nameinfo[ind]) for ind in advance_lines_indices])
            read_lines(advance_lines_indices)

    def _get_nameinfo(self, line):
        """
        Parse an input line to a dictionary of name information.

        The result has the integer values "linenr" and "pos" and the
        string values "type", "stage", "name" and "context" (an empty
        string if absent). Return an empty dictionary if line is empty
        or does not match the expected format.
        """
        if not line:
            return {}
        mo = self._nameinfo_re.match(line)
        if not mo:
            return {}
        return dict([(name, int(mo.group(name))) for name in ['linenr', 'pos']]
                    + [(name, mo.group(name) or '')
                       for name in ['type', 'stage', 'name', 'context']])

    def _check_diff(self, nameinfo1, nameinfo2):
        """
        Return the difference type of nameinfo1 and nameinfo2.

        An empty nameinfo means that the corresponding file has ended.
        The names are compared first by line number, then by name
        (equal or one contained in the other) and finally by offset.
        The count of the resulting type is incremented in the
        statistics.
        """

        def get_type_diff(nameinfo1, nameinfo2, result_prefix=''):
            # The part of a type preceding the first space is the main
            # type; types differing only after that differ in subtype.
            type1 = nameinfo1['type']
            type2 = nameinfo2['type']
            if type1 == type2:
                return ''
            elif type1.split(' ')[0] == type2.split(' ')[0]:
                return result_prefix + 'subtype_diff'
            else:
                return result_prefix + 'type_diff'

        if not nameinfo1:
            diff_type = 'added'
        elif not nameinfo2:
            diff_type = 'missing'
        elif nameinfo1['linenr'] < nameinfo2['linenr']:
            diff_type = 'missing'
        elif nameinfo1['linenr'] > nameinfo2['linenr']:
            diff_type = 'added'
        elif nameinfo1['name'] == nameinfo2['name']:
            typediff = get_type_diff(nameinfo1, nameinfo2)
            if typediff:
                diff_type = typediff
            elif nameinfo1['pos'] != nameinfo2['pos']:
                diff_type = 'pos_diff'
            elif (nameinfo1['stage'] and nameinfo2['stage']
                  and nameinfo1['stage'] != nameinfo2['stage']):
                diff_type = 'stage_diff'
            else:
                diff_type = 'equal'
        elif nameinfo1['name'] in nameinfo2['name']:
            if self._results_from_pmatch_context_bug(nameinfo1['name'],
                                                     nameinfo2['name']):
                diff_type = 'context_bug'
            else:
                diff_type = 'longer' + get_type_diff(nameinfo1, nameinfo2, '+')
        elif nameinfo2['name'] in nameinfo1['name']:
            diff_type = 'shorter' + get_type_diff(nameinfo1, nameinfo2, '+')
        elif nameinfo1['pos'] < nameinfo2['pos']:
            diff_type = 'missing'
        elif nameinfo1['pos'] > nameinfo2['pos']:
            diff_type = 'added'
        else:
            diff_type = 'other'
        self._stats[diff_type] += 1
        return diff_type

    def _results_from_pmatch_context_bug(self, name1, name2):
        """
        Test if name2 is name1 with its first character repeated.

        Return True if name2 consists of name1 preceded by one or more
        extra copies of the first character of name1.
        """
        lendiff = len(name2) - len(name1)
        return (lendiff > 0 and name2 == (name1[0] * lendiff) + name1)

    def _output_diff(self, outf, diff_type, nameinfos):
        """
        Write a single difference of type diff_type to outf.

        nameinfos is a list of pairs (file index, nameinfo) with one
        or two items. Nothing is written if details are not shown or
        if diff_type is not among the difference types to be shown.
        """
        if (not self._opts.details or diff_type not in self._opts.show_types):
            return
        if self._opts.type_labels:
            outf.write(self._type_label_format.format(
                    self._diff_type_labels[diff_type]))
        if self._opts.oneline and nameinfos[0][0] == 1:
            # An added name; the first name is empty
            outf.write(self._empty_nameinfo)
            outf.write(self._nameinfo_sep)
            self._output_nameinfo(outf, nameinfos[0][1])
        else:
            self._output_nameinfo(outf, nameinfos[0][1])
            if len(nameinfos) > 1 or self._opts.oneline:
                outf.write(self._nameinfo_sep)
            if len(nameinfos) > 1:
                self._output_nameinfo(outf, nameinfos[1][1])
            elif self._opts.oneline:
                # A missing name; the second name is empty
                outf.write(self._empty_nameinfo)
        outf.write(self._nameinfo_end)

    def _output_nameinfo(self, outf, nameinfo):
        """
        Write nameinfo to outf, formatted according to the options.

        As a side effect, add the key "stage_tab" to nameinfo: the
        stage followed by a tab, or an empty string if no stage.
        """
        nameinfo['stage_tab'] = (nameinfo['stage'] + '\t'
                                 if nameinfo.get('stage') else '')
        outf.write(self._nameinfo_format.format(**nameinfo))

    def _output_summary(self, outf):
        """
        Write to outf a summary of the differences found.

        The summary contains the number of names in each file, the
        number of differences of each type found (with percentages
        relative to the number of names in file 1) and, unless one of
        the files has no names, precision, recall and F-score computed
        from the number of equal names.
        """
        outf.write(
            u'{0:20s}\t{1:7d}\n'.format(
                'Total names in file 1', self._namecount[0]))
        outf.write(
            u'{0:20s}\t{1:7d}'.format(
                'Total names in file 2', self._namecount[1]))
        if self._namecount[0] > 0:
            outf.write(u'\t{0:+7.2f} %'.format(
                    ((self._namecount[1] - self._namecount[0]) * 100.0
                     / self._namecount[0])))
        outf.write('\n')
        for diff_type in self._stats:
            outf.write(
                u'{0:20s}\t{1:7d}'.format(
                    self._diff_type_labels[diff_type], self._stats[diff_type]))
            if self._namecount[0] > 0:
                outf.write(u'\t{0: 7.2f} %'.format(
                        self._stats[diff_type] * 100.0 / self._namecount[0]))
            outf.write('\n')
        if self._namecount[0] == 0 or self._namecount[1] == 0:
            outf.write('No names in one file; omitting precision and recall'
                       ' as meaningless\n')
        else:
            precision = self._stats['equal'] * 100.0 / self._namecount[1]
            recall = self._stats['equal'] * 100.0 / self._namecount[0]
            pr_sum = precision + recall
            # Without any equal names, both precision and recall are
            # zero; report an F-score of zero instead of dividing by
            # zero.
            f_score = (2 * (precision * recall / pr_sum)) if pr_sum else 0.0
            for label, var in [('Precision', precision),
                               ('Recall', recall),
                               ('F-score', f_score)]:
                outf.write(
                    u'{0:20s}\t\t{1: 7.2f} %\n'.format(
                        label + ' wrt. file 1', var))


def getopts():
    """
    Parse the command-line options and arguments.

    Return a pair (opts, args), where opts contains the option values
    and args the positional arguments (the names of the two name
    files). In the result, opts.show_types is always a set of
    difference type keys: the types listed with --show-types if given,
    otherwise all types, excluding "equal" unless --show-equal is
    specified. --no-details implies --summary.

    Exit with a usage error message if fewer than two name files are
    specified or if --show-types lists an unknown difference type.
    """
    usage = """%prog: [options] name_file1 name_file2 > output

List the differences between the names in name_file1 and name_file2

The input files contain recognized names and their context information as
extracted by extract-tagged-names.py. The differences are classified by type."""
    optparser = OptionParser(usage=usage)
    diff_types = NameFileDiffer.get_diff_types()
    optparser.add_option(
        '--summary', '--totals', action='store_true',
        help='show summary of the differences by type')
    optparser.add_option(
        '--no-details', action='store_false', dest='details', default=True,
        help='do not show the differences themselves')
    optparser.add_option(
        '--oneline', action='store_true',
        help='show each difference on a single line')
    optparser.add_option(
        '--show-equal', action='store_true',
        help='also show names that are the same in the two files')
    optparser.add_option(
        '--show-types', action='append', default=[],
        help=('only show the difference types listed in the comma-separated'
              ' list TYPES; possible types are: '
              # Sorted to make the help text independent of dict order
              + ', '.join(sorted(diff_types))
              + ' (default: show all types except equal)'), metavar='TYPES')
    optparser.add_option(
        '--no-type-labels', action='store_false', dest='type_labels',
        default=True,
        help='omit difference type labels from the output')
    optparser.add_option(
        '--no-context', action='store_false', dest='show_context', default=True,
        help='omit context information in differences')
    (opts, args) = optparser.parse_args()
    if len(args) < 2:
        # Report a usage error here instead of failing later with an
        # IndexError when the file names are accessed
        optparser.error('two name files are required as arguments')
    if not opts.details:
        opts.summary = True
    if opts.show_types:
        split_types = []
        # The option may be repeated and each value may list several
        # types separated by commas
        for type_ in opts.show_types:
            for type1 in re.split(r'\s*,\s*', type_):
                if type1 not in diff_types:
                    # OptionParser.error prints the message with the
                    # usage and exits
                    optparser.error(
                        'invalid difference type "' + type1 + '"')
                split_types.append(type1)
        opts.show_types = set(split_types)
    elif opts.show_equal:
        opts.show_types = set(diff_types)
    else:
        opts.show_types = set(diff_types) - set(['equal'])
    return (opts, args)


def main_main():
    """
    Run the program proper.

    Wrap the standard streams in UTF-8 codecs, parse the command line
    and write to standard output the differences between the two name
    files given as the first two arguments.
    """
    encoding = 'utf-8'
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    make_writer = codecs.getwriter(encoding)
    for stream_name in ('stdout', 'stderr'):
        setattr(sys, stream_name, make_writer(getattr(sys, stream_name)))
    opts, args = getopts()
    name_file1, name_file2 = args[0], args[1]
    NameFileDiffer(opts).diff(name_file1, name_file2, sys.stdout)


def main():
    """
    Run main_main, reporting I/O errors and interrupts concisely.

    An IOError (including a broken pipe) or a keyboard interrupt is
    reported as a single line on standard error instead of a
    traceback. Other exceptions are propagated to the caller.
    """
    try:
        main_main()
    # "except ... as" is valid in both Python 2.6+ and Python 3, unlike
    # the older comma syntax
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.stderr.write('Broken pipe\n')
        else:
            sys.stderr.write(str(e) + '\n')
    except KeyboardInterrupt:
        sys.stderr.write('Interrupted\n')


# Run the command-line program only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
