import nltk
from nltk import WordNetLemmatizer, pos_tag, WordPunctTokenizer, data
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The tag is matched by its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls
    back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN


def lemmatize_word_list(word_list):
    """Return a list with the WordNet lemma of every word in *word_list*.

    The words are POS-tagged with ``pos_tag`` and each tag is converted
    through ``get_wordnet_pos`` before being handed to the lemmatizer.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, tag in pos_tag(word_list):
        wn_pos = get_wordnet_pos(tag)
        lemmas.append(wn_lemmatizer.lemmatize(token, pos=wn_pos))
    return lemmas


def starts_with_verb(word_list):
    """Return True if the first word of *word_list* is tagged as a verb.

    The words are prefixed with the token ``'He'`` before tagging and the
    tag of the original first word (index 1 of the tagged sequence) is
    inspected. Presumably the dummy subject nudges the tagger towards a
    verb reading for a sentence-initial word.

    Args:
        word_list: Sequence of word tokens (list or tuple).

    Returns:
        ``False`` for an empty sequence; otherwise whether the first word
        maps to the WordNet verb category.
    """
    if not word_list:
        return False
    # list(...) lets tuples and other sequences be concatenated safely.
    tagged = pos_tag(['He'] + list(word_list))
    # Index 0 is the injected 'He'; index 1 is the caller's first word.
    _, first_tag = tagged[1]
    return get_wordnet_pos(first_tag) == wordnet.VERB


def word_tokenizer(sentence):
    """Tokenize *sentence* with NLTK's ``WordPunctTokenizer`` and return the tokens."""
    return WordPunctTokenizer().tokenize(sentence)


def split_sentence(paragraph):
    """Split *paragraph* into sentences with NLTK's English Punkt model.

    Args:
        paragraph: Text to segment.

    Returns:
        The sentences produced by the Punkt sentence tokenizer.
    """
    # NOTE(review): the hard-coded 'tokenizers/punkt/english.pickle' path
    # depends on the pickled Punkt model, which newer NLTK releases replace
    # with the pickle-free 'punkt_tab' data. sent_tokenize resolves the
    # English Punkt model for whichever NLTK version is installed.
    return nltk.sent_tokenize(paragraph, language='english')


def download_nltk_data():
    """Download the NLTK ``punkt`` resource via ``nltk.download``."""
    resource_id = 'punkt'
    nltk.download(resource_id)


def overlap_two_seq(word_list, word_list2):
    """Return True if any word of one sequence contains a word of the other.

    A pair counts as overlapping when one word is a substring of the other
    and the contained word is longer than one character, so single
    characters and empty strings never produce a match.
    """
    def _overlaps(first, second):
        # The contained word must have more than one character.
        if first in second and len(first) > 1:
            return True
        return second in first and len(second) > 1

    return any(
        _overlaps(first, second)
        for first in word_list
        for second in word_list2
    )
