import re
import string

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.util import skipgrams

porter = PorterStemmer()


# Create stop words feature list and add extra features
from nltk.corpus import stopwords
# Reduce every NLTK English stop word to the text before its first apostrophe
# (a word such as "don't" becomes "don"), then de-duplicate through a set.
# Going through a set means the resulting list order is not guaranteed.
stops = stopwords.words('english')
x = [word.split("'") for word in stops]
stops = list({pieces[0] for pieces in x})

# Punctuation tokens appended to the stop list as extra entries.
puncts = ['!', ':', '...', '.', '%', '$', "'", '"', ';']
stops += puncts

def tokenize(s):
    """Lower-case *s* and split it into tokens with NLTK's ``TweetTokenizer``.

    Args:
        s: The text to tokenize.

    Returns:
        The token list produced by ``TweetTokenizer().tokenize`` for the
        lower-cased text.
    """
    # The text is lower-cased *before* tokenization, so the tokenizer never
    # sees the original casing.
    return TweetTokenizer().tokenize(s.lower())

def pos_tagger(s):
    """Return the part-of-speech tag for each token in *s*.

    Args:
        s: The tokens handed to ``nltk.pos_tag``.

    Returns:
        A list holding only the tag (second element) of every
        ``(token, tag)`` pair returned by ``nltk.pos_tag``, in the same order.
    """
    return [tag for _token, tag in nltk.pos_tag(s)]

def skip_grams(s):
    """Build hyphen-joined skip-grams from a sequence of strings.

    Used to look for part-of-speech tagging tendencies that hint at sentence
    structure patterns.

    Args:
        s: Sequence of strings (presumably POS tags - confirm against callers).

    Returns:
        A list with one string per item yielded by ``skipgrams(s, 2, 1)``:
        the item's elements joined with ``'-'`` and every ``'$'`` removed.
    """
    # '$' is stripped after joining; presumably this normalises tags such as
    # "PRP$" - TODO confirm.
    return ['-'.join(gram).replace('$', '') for gram in skipgrams(s, 2, 1)]

def stop_words_filter(s, stops):
    """Return the items of *s* that are contained in *stops*, in order.

    Note that this keeps the stop words and drops everything else.

    Args:
        s: Iterable of tokens to scan.
        stops: Container tested with ``in`` for each token.

    Returns:
        A new list with the matching tokens, duplicates preserved.
    """
    kept = []
    for token in s:
        if token in stops:
            kept.append(token)
    return kept

def removing_items(text):
    """Strip URLs from *text* and turn newlines into spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The text after applying, in order: removal of ``http://`` /
        ``https://`` URLs, removal of matches of the ``&#?://`` pattern, and
        replacement of every newline with a single space.
    """
    # (pattern, replacement) pairs, applied one after another.
    # NOTE(review): the second pattern only matches '&', an optional '#', and
    # then '://...'; it does not match HTML entities such as '&amp;' - confirm
    # that this is the intended behaviour.
    substitutions = (
        (r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', ""),
        (r'&#?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', ""),
        ('\n', " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_up_content(content, rem_stop_words=False):
    """Normalise *content* into a space-separated string of alphabetic words.

    Steps, in order: clean the raw text with ``removing_items``, tokenize it
    with ``word_tokenize``, lower-case each token, delete every
    ``string.punctuation`` character from it, keep only tokens for which
    ``str.isalpha()`` is true, and optionally drop stop words.

    Args:
        content: The raw text to clean.
        rem_stop_words: When true, words found in the module-level ``stops``
            list are removed from the result.

    Returns:
        The remaining words joined by single spaces (``''`` if none remain).
    """
    content = removing_items(content)

    # Translation table that deletes every ASCII punctuation character.
    punct_table = str.maketrans('', '', string.punctuation)
    stripped = (
        token.lower().translate(punct_table)
        for token in word_tokenize(content)
    )
    # ''.isalpha() is False, so tokens that consisted only of punctuation are
    # dropped here together with anything containing digits or other symbols.
    words = [word for word in stripped if word.isalpha()]

    if rem_stop_words:
        # Build the lookup set once per call instead of scanning the list for
        # every word.
        stop_set = set(stops)
        words = [word for word in words if word not in stop_set]
    return ' '.join(words)
