#! /usr/bin/env python3

# - Correct frequent erroneous lemmas
# - Replace hashes marking morpheme boundaries (#) with hyphens in lemma forms whenever necessary
# ( Otherwise remove hashes in lemma forms )

from sys import stdin, stdout, stderr, argv, path
import re, os

# Ending substitutions tried by inf2prefix().
#
# Each entry is a pair (w_end, l_end):
#   w_end -- ending expected in the (lowercased) word form
#   l_end -- ending of a lemma part that is followed by a '#' boundary
# inf2prefix() rewrites "l_end#" to "w_end#" whenever doing so makes the
# lemma a prefix of the word form.  The pairs are tried in list order and
# their effects accumulate, so longer / more specific endings are listed
# before the shorter ones they contain.
subs = [
    ("ntelu", "nnella"),
    ("ntely", "nnellä"),
    ("ltelu", "llella"),
    ("ltely", "llellä"),
    ("rtelu", "rrella"),
    ("rtely", "rrellä"),
    ("ppelu", "pella"),
    ("ppely", "pellä"),
    ("ttelu", "tella"),
    ("ttely", "tellä"),
    ("kkelu", "kella"),
    ("kkely", "kellä"),
    ("tely", "dellä"),
    ("telu", "della"),
    ("kelu", "ella"),
    ("kely", "ellä"),
    ("elu", "ella"),
    ("ely", "ellä"),
    ("ilu", "illa"),
    ("ily", "illä"),
    ("ltaamis", "llata"),
    ("ltäämis", "llätä"),
    ("bbaamis", "bata"),
    ("bbäämis", "bätä"),
    ("ggaamis", "gata"),
    ("ggäämis", "gätä"),
    ("ppaamis", "pata"),
    ("ppäämis", "pätä"),
    ("ttaamis", "tata"),
    ("ttäämis", "tätä"),
    # kk -> k, parallel to the bb/gg/pp/tt rows above.  These two rows used
    # to repeat the "pp" key, which could never produce a matching form.
    ("kkaamis", "kata"),
    ("kkäämis", "kätä"),
    ("paamis", "vata"),
    ("päämis", "vätä"),
    ("toamis", "dota"),
    ("taamis", "data"),
    ("täämis", "dätä"),
    ("koamis", "ota"),
    ("kaamis", "ata"),
    ("käämis", "ätä"),
    ("lkenemis", "ljeta"),
    ("kenemis", "eta"),
    ("enemis", "eta"),
    ("enemis", "etä"),
    ("itsemis", "ita"),
    ("itsemis", "itä"),
    ("amis", "ta"),
    ("ämis", "tä"),
    ("kemis", "hdä"),
    ("mis", "da"),
    ("mis", "dä"),
    ("mis", "a"),
    ("mis", "ä"),
    ("is", "inen"),
    ("s", "nen"),
    ("s", "kset"),
    ("uden", "us"),
    ("yden", "ys"),
    # Pluralized lemmas
    ("kulu", "kulut"),
    ("olo", "olot"),
    ("tila", "tilat"),
    ("kilpailu", "kilpailut"),
    ("kisa", "kisat"),
    ("saksi", "sakset"),
    ("hää", "häät"),
    ("juhla", "juhlat"),
    ("housu", "housut"),
    ("hius", "hiukset"),
    ("markkina", "markkinat"),
    ("päivä", "päivät"),
    ("suhde", "suhteet"),
    ("resurssi", "resurssit"),
    ("voima", "voimat"),
    ("kasvo", "kasvot"),
    ("lasi", "lasit"),
    ("tieto", "tiedot"),
]

# Tab-separated correction table, looked up in path[0] (the first entry of
# sys.path, imported above as `path`).
regex_filename = os.path.join(path[0], 'lemma-errors.tsv')

# One list of tab-separated fields per non-blank line of the table;
# fix_nouns() unpacks every row as (w_patt, l_patt, l_new).
# The `with` block closes the file handle as soon as the table is read, and
# blank rows are skipped so they cannot break that three-way unpacking.
with open(regex_filename, 'r') as regex_file:
    regexes = [
        row.strip().split('\t')
        for row in regex_file.read().strip().split('\n')
        if row.strip()
    ]

def inf2prefix(wform, lemma_new):
    """Rewrite lemma endings before '#' so the lemma matches the word form.

    Every (w_end, l_end) pair in ``subs`` is tried in order.  When replacing
    ``l_end + '#'`` by ``w_end`` turns ``lemma_new`` into a prefix of the
    lowercased ``wform``, the same replacement is applied to ``lemma_new``
    with the '#' boundary kept.  Substitutions accumulate: later pairs see
    the result of earlier ones.

    Returns the (possibly unchanged) lemma string.
    """
    surface = wform.lower()
    for w_end, l_end in subs:
        boundary_end = l_end + '#'
        # Probe without the boundary marker, since the word form has none.
        probe = lemma_new.replace(boundary_end, w_end)
        if surface.startswith(probe):
            lemma_new = lemma_new.replace(boundary_end, w_end + '#')
    return lemma_new

def fix_nouns(wform, lemma_new):
    """Apply the pattern-based corrections from ``regexes`` to a lemma.

    Each row of ``regexes`` is (w_patt, l_patt, l_new).  A row fires when
    ``l_patt`` matches at the very end of ``lemma_new`` and the lowercased
    ``wform`` begins with something matching ``w_patt``; the matched lemma
    ending is then replaced by ``l_new``.  Rows are applied in order and
    each one sees the result of the previous ones.

    Returns the (possibly unchanged) lemma string.
    """
    surface = wform.lower()
    for w_patt, l_patt, l_new in regexes:
        # Anchor the lemma pattern to the end of the string.
        lemma_tail = re.compile(l_patt + '\\Z')
        if lemma_tail.search(lemma_new) is None:
            continue
        # The trailing '.*' lets the word form continue after the pattern.
        form_head = re.compile(w_patt + '.*')
        if form_head.fullmatch(surface):
            lemma_new = lemma_tail.sub(l_new, lemma_new)
    return lemma_new

def correct(wform, lemma, morph, semtag):
    """Correct one token's lemma and print the token as a tab-separated line.

    The lemma is split at its '#' boundary markers and rebuilt part by part.
    After each part the accumulated lemma is compared with the lowercased
    word form: endings are adjusted via inf2prefix() when the lemma is not
    a prefix of the word form, each '#' is turned into a hyphen or removed
    depending on which variant the word form starts with, and any '#' left
    at the end is stripped.  The result is passed through fix_nouns(), and a
    hyphen present in the word form but missing from the lemma is put back.

    Prints ``wform``, the corrected lemma, ``morph`` and ``semtag`` joined
    by tabs; returns None.
    """
    surface = wform.lower()

    # Drop a hyphen that directly follows a boundary, then mark split points.
    lemma = lemma.replace('#-', '#').replace('#', '#|')
    if wform.startswith('-') and not lemma.startswith('-'):
        lemma = '-' + lemma

    # Boundary rewrites, tried in this order after every appended part.
    boundary_rewrites = (('-#', '-'), ('#', '-'), ('-#', ''))

    lemma_new = ''
    for part in lemma.split('|'):
        lemma_new += part

        # Compare without the last character (normally the '#' marker).
        if not surface.startswith(lemma_new[:-1]):
            lemma_new = inf2prefix(wform, lemma_new)

        for marker, replacement in boundary_rewrites:
            candidate = lemma_new.replace(marker, replacement)
            if surface.startswith(candidate):
                lemma_new = candidate

        lemma_new = lemma_new.rstrip('#')

    lemma_new = fix_nouns(wform, lemma_new)

    # Restore hyphens removed by OMorFi and FinnPOS
    if '-' in wform and '-' not in lemma_new:
        head = surface.partition('-')[0]
        if lemma_new.startswith(head):
            lemma_new = head + '-' + lemma_new[len(head):]

    print(wform, lemma_new, morph, semtag, sep='\t')


# Main filter loop: one token per input line, four tab-separated fields
# (word form, lemma, morphology, semantic tag).  An empty line is echoed as
# an empty line and stdout is flushed at that point.
for raw_line in stdin:
    record = raw_line.strip()

    if record != '':
        wform, lemma, morph, semtag = record.split('\t')
        correct(wform, lemma, morph, semtag)
        continue

    print('')
    stdout.flush()
