from sys import stdin, argv, stderr, stdout
from re import findall

def get_lemma(string, convert_type):
    """Extract the lemma from a single analysis string.

    Args:
        string: One analysis. In 'ftb' mode this is a bracketed tag string
            such as ``[WORD_ID=talo][POS=NOUN]``; otherwise it is expected to
            be ``lemma<TAB>label``.
        convert_type: 'ftb' selects the bracketed-tag parsing; any other
            value selects the tab-separated parsing.

    Returns:
        In 'ftb' mode, the lemma parts joined with '#': any text before the
        first '[', the value of every ``WORD_ID`` tag, and any text after the
        last ']' (cut at the first ':'). Otherwise, the text before the first
        tab, or the whole string when it contains no tab.
    """
    if convert_type != 'ftb':
        # split() keeps the whole string when no tab is present, whereas
        # slicing with find() == -1 would silently drop the last character.
        return string.split('\t', 1)[0]

    lemma_parts = []

    # Text in front of the first tag counts as a lemma part.
    if '[' in string and not string.startswith('['):
        lemma_parts.append(string[:string.index('[')])

    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    # The '.' after '=' lets a WORD_ID whose first character is '[' match.
    word_id_strs = findall(r'\[WORD_ID=.[^\[]*', string)
    prefix_len = len('[WORD_ID=')
    # Strip the '[WORD_ID=' prefix and the final character of each match
    # (the closing ']' for a well-formed tag).
    lemma_parts += [word_id_str[prefix_len:-1] for word_id_str in word_id_strs]

    # Text after the last tag (up to the first ':') counts as a lemma part.
    if ']' in string and not string.endswith(']'):
        lemma_parts.append(string.split(']')[-1].split(':')[0])

    return '#'.join(lemma_parts)

def get_label(string, convert_type):
    """Extract the morphological label from a single analysis string.

    In 'ftb' mode the label is built from the tags that follow the last
    ``WORD_ID`` tag: the tags are separated with '|', tags containing
    ``STYLE=`` or ``DRV=`` are dropped, and leading ']' / '|' characters are
    stripped. In any other mode the result is the slice of ``string``
    starting at its first tab character (the tab itself is included).
    """
    if convert_type != 'ftb':
        return string[string.find('\t'):]

    marker = '[WORD_ID='

    # Cut away everything up to and including the last '[WORD_ID='.
    after_marker = string[string.rfind(marker) + len(marker):]

    # Cut away the lemma value and its closing bracket.
    tag_text = after_marker[after_marker.find(']') + 1:]

    # Insert '|' between adjacent tags, then split on it.
    pieces = tag_text.replace('][', ']|[').split('|')

    kept = [piece for piece in pieces
            if 'STYLE=' not in piece and 'DRV=' not in piece]

    return '|'.join(kept).lstrip(']|')

def get_lemmas(analyses, convert_type):
    """Return a list of (label, lemma) tuples, one per analysis, in order."""
    pairs = []
    for analysis in analyses:
        label = get_label(analysis, convert_type)
        lemma = get_lemma(analysis, convert_type)
        pairs.append((label, lemma))
    return pairs

def get_labels(analyses, convert_type):
    """Return the label of every analysis, as a list in input order."""
    labels = []
    for analysis in analyses:
        labels.append(get_label(analysis, convert_type))
    return labels

def filter_ftb_analyses(analyses):
    """Keep only the analyses with the fewest ``[WORD_ID=`` occurrences.

    Args:
        analyses: Iterable of analysis strings.

    Returns:
        A new list, in input order, of the analyses whose number of
        ``[WORD_ID=`` substrings equals the minimum over all analyses.
        An empty input yields an empty list.
    """
    # Materialise once so one-shot iterables work and can be walked twice.
    analyses = list(analyses)
    if not analyses:
        # min() raises ValueError on an empty sequence.
        return []

    word_id_counts = [analysis.count('[WORD_ID=') for analysis in analyses]
    fewest = min(word_id_counts)
    return [analysis
            for analysis, count in zip(analyses, word_id_counts)
            if count == fewest]

def fix_omorfi_analysis(analysis):
    """Move WORD_ID text that leaked between later tags back into the first tag.

    OmorFi sometimes leaks a part of the WORD_ID into between the tags, like:
        ---	[WORD_ID=-][POS=PUNCTUATION]--[STYLE=NONSTANDARD]
    which this function turns into
        [WORD_ID=---][POS=PUNCTUATION][STYLE=NONSTANDARD]

    Strings without a '][' sequence are returned unchanged. If several
    stray fragments occur, only the last one is inserted into the first tag.
    """
    boundary = analysis.find('][')
    if boundary == -1:
        return analysis

    # Everything up to and including the first ']' that is followed by '['.
    first_tag = analysis[:boundary + 1]
    remainder = analysis[boundary + 1:]

    tag_chunks = []
    leaked = ''

    while remainder:
        opening = remainder.find('[')
        closing = remainder.find(']')

        if opening == -1 or closing == -1:
            # No further bracket pair: keep whatever is left verbatim.
            tag_chunks.append(remainder)
            break

        if opening == 0:
            # A tag starts here; copy it through its first ']'.
            cut = closing + 1
            tag_chunks.append(remainder[:cut])
        else:
            # Stray text in front of the next tag: remember it, drop it here.
            cut = opening
            leaked = remainder[:cut]

        remainder = remainder[cut:]

    if leaked:
        # Re-open the first tag, append the stray text, close it again.
        first_tag = first_tag[:-1] + leaked + ']'

    return first_tag + ''.join(tag_chunks)

def _print_ftb_token(wf, analyses, convert_type):
    """Print the output row for one finished word form in 'ftb' mode."""
    if not analyses:
        # No analyses collected: every column after the word form is '_'.
        print('%s\t_\t_\t_\t_' % wf)
        return

    analyses = filter_ftb_analyses(analyses)
    lemmas = get_lemmas(analyses, convert_type)
    # The last column is the repr of the (label, lemma) list, spaces removed.
    lemma_str = str(lemmas).replace(' ', '')
    labels = get_labels(analyses, convert_type)

    feats = '_'
    label_str = '_'
    if labels:
        feats = ' '.join('OMORFI_FEAT:' + label for label in labels)
        label_str = ' '.join(labels)

    print('%s\t%s\t%s\t%s\t%s' % (wf, feats, '_', label_str, lemma_str))


def convert(pname, ifile, convert_type):
    """Read analyser output line by line and print converted rows to stdout.

    Args:
        pname: Program name. Not used by the conversion; kept so existing
            callers keep working.
        ifile: Iterable of input lines. In 'ftb' mode a non-blank line is
            ``wordform<TAB>analysis``; otherwise it is
            ``wordform<TAB>lemma<TAB>label``.
        convert_type: 'ftb' or another mode string (the script passes 'tdt').

    Behaviour:
        * A blank line while a word form is pending ends that word form. In
          'ftb' mode its row is printed and the state is reset; in other
          modes nothing is printed and the state is left untouched.
        * A blank line with no pending word form is echoed as an empty line
          and stdout is flushed (presumably a sentence boundary - confirm
          against the producer of the input).
        * In 'ftb' mode a word form still pending at end of input is printed
          too, so the last token is not lost when the input lacks a final
          blank line.

    Raises:
        ValueError: in non-'ftb' mode, when a non-blank line does not consist
            of exactly three tab-separated fields.
    """
    wf = ''
    analyses = []

    # NOTE: earlier revisions contained (disabled) code that also printed an
    # empty line for an 'OMORFI_VERSION_...' line in ftb mode and for '<END>'
    # lines in tdt mode. That handling is not active.

    for line in ifile:
        line = line.strip()

        if line == '':
            if wf == '':
                print('')
                stdout.flush()
            elif convert_type == 'ftb':
                _print_ftb_token(wf, analyses, convert_type)
                wf, analyses = '', []
            continue

        if convert_type == 'ftb':
            try:
                wf, analysis = line.split('\t')
                wf, analysis = wf.strip(), analysis.strip()
            except ValueError:
                # Not exactly two fields: discard the line and any state.
                analyses = []
                wf, analysis = '', ''

            # 'wf<TAB>wf' and 'wf<TAB>wf+?' are treated as "no analysis".
            if wf == analysis or analysis == wf + '+?':
                analyses = []
            else:
                analyses.append(fix_omorfi_analysis(analysis))
        else:
            wf, lemma, label = line.split('\t')

            if label == '+?':
                analyses = []
            else:
                analyses.append(lemma + '\t' + label)

    # Flush a token that was not terminated by a blank line before EOF.
    if wf != '' and convert_type == 'ftb':
        _print_ftb_token(wf, analyses, convert_type)

if __name__=='__main__':
    # Command-line entry point: an optional single argument selects the
    # conversion type ('ftb' by default); input is read from stdin.

    convert_type = 'ftb'

    if len(argv) == 2:
        convert_type = argv[1]
    elif len(argv) != 1:
        stderr.write('USE: cat indata | %s (ftb|tdt) > outdata\n' % argv[0])
        # SystemExit instead of exit(): exit() is only installed by the
        # 'site' module and is not guaranteed to exist.
        raise SystemExit(1)

    if convert_type not in ('ftb', 'tdt'):
        # Trailing newline added so the shell prompt does not follow the
        # message on the same line.
        stderr.write('Unknown conversion type %s. Should be ftb or tdt.\n' %
                     convert_type)
        raise SystemExit(1)

    convert(argv[0], stdin, convert_type)
