# -*- coding: utf-8 -*-

#Created on Sun Apr 19 23:41:39 2020 @author: Windows 10

import nltk
import numpy as np
import random
import string
import os

import urllib.request
import re

import sys
import unicodedata
from sklearn.preprocessing import StandardScaler
import  re
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from collections import Counter
import unduhack
# Currency symbol → ISO-4217 code lookup; the keys also drive CURRENCY_REGEX.
CURRENCIES = {'$': 'USD', 'zł': 'PLN', '£': 'GBP', '¥': 'JPY', '฿': 'THB',
              '₡': 'CRC', '₦': 'NGN', '₩': 'KRW', '₪': 'ILS', '₫': 'VND',
              '€': 'EUR', '₱': 'PHP', '₲': 'PYG', '₴': 'UAH', '₹': 'INR'}

# Pre-compiled patterns for scrubbing non-text artefacts (emails, phone
# numbers, numbers, currency symbols, line breaks, URLs) out of raw text.
# All are compiled once at import time.
EMAIL_REGEX = re.compile(
        r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-]@(?:\w-?)*?\w+(\.([a-z]{2,})){1,3}(?:$|(?=\b))",
        flags=re.IGNORECASE | re.UNICODE)
# Phone numbers: optional "+1" country code, optional area code, optional extension.
PHONE_REGEX = re.compile(r'(?:^|(?<=[^\w)]))(\+?1[ .-]?)?(\(?\d{3}\)?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?'
                         r'|[#x-])\s?\d{2,6})?(?:$|(?=\W))')
# Numbers, including both "1,234.56" and "1 234,56" separator styles.
NUMBERS_REGEX = re.compile(r'(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)'
                           r'|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))')
# One or more consecutive currency symbols from the CURRENCIES keys.
CURRENCY_REGEX = re.compile('({})+'.format('|'.join(re.escape(c) for c in CURRENCIES)))
LINEBREAK_REGEX = re.compile(r'((\r\n)|[\n\v])+')
# Any whitespace run whose first character is not a newline.
NONBREAKING_SPACE_REGEX = re.compile(r'(?!\n)\s+')
URL_REGEX = re.compile(r"(?:^|(?<![\w/.]))"
                       # protocol identifier
                       # r"(?:(?:https?|ftp)://)"  <-- alt?
                       r"(?:(?:https?://|ftp://|www\d{0,3}\.))"
                       # user:pass authentication
                       r"(?:\S+(?::\S*)?@)?"
                       r"(?:"
                       # IP address exclusion
                       # private & local networks
                       r"(?!(?:10|127)(?:\.\d{1,3}){3})"
                       r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
                       r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
                       # IP address dotted notation octets
                       # excludes loopback network 0.0.0.0
                       # excludes reserved space >= 224.0.0.0
                       # excludes network & broadcast addresses
                       # (first & last IP address of each class)
                       r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
                       r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
                       r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
                       r"|"
                       # host name
                       r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
                       # domain name
                       r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
                       # TLD identifier
                       r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
                       r")"
                       # port number
                       r"(?::\d{2,5})?"
                       # resource path
                       r"(?:/\S*)?"
                       r"(?:$|(?![\w?!+&/]))",
                       flags=re.UNICODE | re.IGNORECASE)  # source: https://gist.github.com/dperini/729294
# Shortened URLs (e.g. bit.ly style): optional scheme, domain, "/", short hash.
SHORT_URL_REGEX = re.compile(r"(?:^|(?<![\w/.]))"
                             # optional scheme
                             r"(?:(?:https?://)?)"
                             # domain
                             r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
                             r"/"
                             # hash
                             r"[^\s.,?!'\"|+]{2,12}"
                             r"(?:$|(?![\w?!+&/]))",
                             flags=re.IGNORECASE)

# str.translate table mapping every Unicode punctuation code point
# (category "P*") to a space.  Built once; scanning all sys.maxunicode
# code points makes this noticeably slow at import time.
PUNCTUATION_TRANSLATE_UNICODE = dict.fromkeys((i for i in range(sys.maxunicode)
                                               if unicodedata.category(chr(i)).startswith('P')), u' ')


def normalize_whitespace(text: str):
    """Collapse runs of whitespace in ``text``.

    Runs of line breaks become a single newline, any other whitespace run
    becomes a single space, and leading/trailing whitespace is stripped.

    Args:
        text (str): raw ``urdu`` text
    Returns:
        str: returns a ``str`` object containing normalized text.
    """
    collapsed_newlines = LINEBREAK_REGEX.sub('\n', text)
    collapsed_spaces = NONBREAKING_SPACE_REGEX.sub(' ', collapsed_newlines)
    return collapsed_spaces.strip()

def remove_punctuation(text: str, marks=None) -> str:
    """Replace punctuation in ``text`` with spaces.

    Args:
        text (str): raw text
        marks (str): optional subset of characters to remove, e.g.
            ``marks=',;:'`` removes commas, semi-colons, and colons only.
            When omitted, every Unicode punctuation character is removed.
    Returns:
        str: returns a ``str`` object containing normalized text.
    Note:
        Without ``marks`` the :meth:`str.translate` path is taken, which
        is roughly 5-10x faster than the regex path.
    """
    if not marks:
        return text.translate(PUNCTUATION_TRANSLATE_UNICODE)
    pattern = '[{}]+'.format(re.escape(marks))
    return re.sub(pattern, ' ', text, flags=re.UNICODE)

def remove_accents(text: str):
    """Strip combining marks (accents/diacritics) from ``text``.

    Only decomposed combining code points are dropped; a precomposed
    single-codepoint accented letter passes through unchanged —
    NOTE(review): normalize to NFD first if those must be handled too.

    Args:
        text (str): raw urdu text
    Returns:
        str
    """
    kept = [ch for ch in text if not unicodedata.combining(ch)]
    return ''.join(kept)
from typing import Dict
# Canonical Urdu character → list of Arabic presentation-form (and plain
# Arabic) variants that should be rewritten to it.  Keys with an empty
# list are already canonical; the final '' key deletes the tatweel 'ـ'.
CORRECT_URDU_CHARACTERS: Dict = {'آ': ['ﺁ', 'ﺂ'],
                                 'أ': ['ﺃ'],
                                 'ا': ['ﺍ', 'ﺎ', ],
                                 'ب': ['ﺏ', 'ﺐ', 'ﺑ', 'ﺒ'],
                                 'پ': ['ﭖ', 'ﭘ', 'ﭙ', ],
                                 'ت': ['ﺕ', 'ﺖ', 'ﺗ', 'ﺘ'],
                                 'ٹ': ['ﭦ', 'ﭧ', 'ﭨ', 'ﭩ'],
                                 'ث': ['ﺛ', 'ﺜ', 'ﺚ'],
                                 'ج': ['ﺝ', 'ﺞ', 'ﺟ', 'ﺠ'],
                                 'ح': ['ﺡ', 'ﺣ', 'ﺤ', 'ﺢ'],
                                 'خ': ['ﺧ', 'ﺨ', 'ﺦ'],
                                 'د': ['ﺩ', 'ﺪ'],
                                 'ذ': ['ﺬ', 'ﺫ'],
                                 'ر': ['ﺭ', 'ﺮ'],
                                 'ز': ['ﺯ', 'ﺰ', ],
                                 'س': ['ﺱ', 'ﺲ', 'ﺳ', 'ﺴ', ],
                                 'ش': ['ﺵ', 'ﺶ', 'ﺷ', 'ﺸ'],
                                 'ص': ['ﺹ', 'ﺺ', 'ﺻ', 'ﺼ', ],
                                 'ض': ['ﺽ', 'ﺾ', 'ﺿ', 'ﻀ'],
                                 'ط': ['ﻃ', 'ﻄ'],
                                 'ظ': ['ﻅ', 'ﻇ', 'ﻈ'],
                                 'ع': ['ﻉ', 'ﻊ', 'ﻋ', 'ﻌ', ],
                                 'غ': ['ﻍ', 'ﻏ', 'ﻐ', ],
                                 'ف': ['ﻑ', 'ﻒ', 'ﻓ', 'ﻔ', ],
                                 'ق': ['ﻕ', 'ﻖ', 'ﻗ', 'ﻘ', ],
                                 'ل': ['ﻝ', 'ﻞ', 'ﻟ', 'ﻠ', ],
                                 'م': ['ﻡ', 'ﻢ', 'ﻣ', 'ﻤ', ],
                                 'ن': ['ﻥ', 'ﻦ', 'ﻧ', 'ﻨ', ],
                                 'چ': ['ﭺ', 'ﭻ', 'ﭼ', 'ﭽ'],
                                 'ڈ': ['ﮈ', 'ﮉ'],
                                 'ڑ': ['ﮍ', 'ﮌ'],
                                 'ژ': ['ﮋ', ],
                                 'ک': ['ﮎ', 'ﮏ', 'ﮐ', 'ﮑ', 'ﻛ', 'ك'],
                                 'گ': ['ﮒ', 'ﮓ', 'ﮔ', 'ﮕ'],
                                 'ں': ['ﮞ', 'ﮟ'],
                                 'و': ['ﻮ', 'ﻭ', 'ﻮ', ],
                                 'ؤ': ['ﺅ'],
                                 'ھ': ['ﮪ', 'ﮬ', 'ﮭ', 'ﻬ', 'ﻫ', 'ﮫ'],
                                 'ہ': ['ﻩ', 'ﮦ', 'ﻪ', 'ﮧ', 'ﮩ', 'ﮨ', 'ه', ],
                                 'ۂ': [],
                                 'ۃ': ['ة'],
                                 'ء': ['ﺀ'],
                                 'ی': ['ﯼ', 'ى', 'ﯽ', 'ﻰ', 'ﻱ', 'ﻲ', 'ﯾ', 'ﯿ', 'ي'],
                                 'ئ': ['ﺋ', 'ﺌ', ],
                                 'ے': ['ﮮ', 'ﮯ', 'ﻳ', 'ﻴ', ],
                                 'ۓ': [],
                                 '۰': ['٠'],
                                 '۱': ['١'],
                                 '۲': ['٢'],
                                 '۳': ['٣'],
                                 '۴': ['٤'],
                                 '۵': ['٥'],
                                 '۶': ['٦'],
                                 '۷': ['٧'],
                                 '۸': ['٨'],
                                 '۹': ['٩'],
                                 '۔': [],
                                 '؟': [],
                                 '٫': [],
                                 '،': [],
                                 'لا': ['ﻻ', 'ﻼ'],
                                 '': ['ـ']

                                 }

# Flatten the mapping into a str.translate table: ord(variant) → canonical
# character.  NOTE(review): the 'لا' key is a two-character string; ord()
# on its variants still works because each variant glyph is one code point.
_TRANSLATOR = {}
for key, value in CORRECT_URDU_CHARACTERS.items():
    _TRANSLATOR.update(dict.fromkeys(map(ord, value), key))

def normalize_characters(text: str) -> str:
    """Rewrite Arabic presentation-form variants onto canonical Urdu characters.

    Uses the module-level ``_TRANSLATOR`` table (built from
    ``CORRECT_URDU_CHARACTERS``) so that every known variant glyph is
    replaced by its proper character in the Urdu Unicode range (0600-06FF),
    fixing the combine/join character issue in raw text.

    Args:
        text (str): raw ``urdu`` text
    Returns:
        str: returns a ``str`` object containing normalized text.
    """
    normalized = text.translate(_TRANSLATOR)
    return normalized



# Combine/join sequences → single character replacements used by
# normalize_combine_characters().  Keys and values render identically;
# presumably the keys are decomposed (base + combining mark) sequences
# and the values the precomposed code points — verify at codepoint level.
COMBINE_URDU_CHARACTERS: Dict[str, str] = {"آ": "آ",
                                           "أ": "أ",
                                           "ۓ": "ۓ",
                                           }


def normalize_combine_characters(text: str) -> str:
    """Replace combine|join ``urdu`` sequences with their single-character form.

    Applies every mapping in ``COMBINE_URDU_CHARACTERS`` in turn.

    Args:
        text (str): raw ``urdu`` text
    Returns:
        str: returns a ``str`` object containing normalized text.
    """
    for combined, single in COMBINE_URDU_CHARACTERS.items():
        text = text.replace(combined, single)
    return text

# Urdu stop-word list (function words, honorifics, numerals, day/month
# names, etc.), whitespace-split into a frozenset for O(1) membership
# tests in the cleaning loops below.
stop_words = frozenset("""
آ آئی آئیں آئے آتا آتی آتے آداب آدھ آدھا آدھی آدھے آس
 آمدید آنا آنسہ آنی آنے آپ آگے آہ آہا آیا اب ابھی ابے
 اتوار ارب اربویں ارے اس اسکا اسکی اسکے اسی اسے اف افوہ الاول البتہ
 الثانی الحرام السلام الف المکرم ان اندر انکا انکی انکے انہوں انہی انہیں
 اوئے اور اوپر اوہو اپ اپنا اپنوں اپنی اپنے اپنےآپ اکبر اکثر اگر اگرچہ
 اگست اہاہا ایسا ایسی ایسے ایک بائیں بار بارے بالکل باوجود باہر بج بجے
 بخیر برسات بشرطیکہ بعض بغیر بلکہ بن بنا بناؤ بند بڑی بھر بھریں
 بھی بہار بہت بہتر بیگم تاکہ تاہم تب تجھ تجھی تجھے ترا تری
 تلک تم تمام تمہارا تمہاروں تمہاری تمہارے تمہیں تو تک تھا تھی تھیں تھے
 تہائی تیرا تیری تیرے تین جا جاؤ جائیں جائے جاتا جاتی جاتے جانی جانے
 جب جبکہ جدھر جس جسے جن جناب جنہوں جنہیں جو جہاں جی جیسا
 جیسوں جیسی جیسے جیٹھ حالانکہ حالاں حصہ حضرت خاطر خالی خدا خزاں خواہ خوب
 خود دائیں درمیان دریں دو دوران دوسرا دوسروں دوسری دوشنبہ دوں دکھائیں دگنا دی
 دیئے دیا دیتا دیتی دیتے دیر دینا دینی دینے دیکھو دیں دیے دے ذریعے
 رکھا رکھتا رکھتی رکھتے رکھنا رکھنی رکھنے رکھو رکھی رکھے رہ رہا رہتا
 رہتی رہتے رہنا رہنی رہنے رہو رہی رہیں رہے ساتھ سامنے ساڑھے سب سبھی
 سراسر سلام سمیت سوا سوائے سکا سکتا سکتے سہ سہی سی سے شام شاید
 شکریہ صاحب صاحبہ صرف ضرور طرح طرف طور علاوہ عین فروری فقط فلاں
 فی قبل قطا لائی لائے لاتا لاتی لاتے لانا لانی لایا لو لوجی لوگوں
 لگ لگا لگتا لگتی لگی لگیں لگے لہذا لی لیا لیتا لیتی لیتے لیکن
 لیں لیے لے ماسوا مت مجھ مجھی مجھے محترم محترمی محض مرا مرحبا
 مری مرے مزید مس مسز مسٹر مطابق مطلق مل منٹ منٹوں مکرمی مگر
 مگھر مہربانی میرا میروں میری میرے میں نا نزدیک نما نو نومبر نہ نہیں
 نیز نیچے نے و وار واسطے واقعی والا والوں والی والے واہ وجہ ورنہ
 وعلیکم وغیرہ ولے وگرنہ وہ وہاں وہی وہیں ویسا ویسے ویں پاس
 پایا پر پس پلیز پون پونا پونی پونے پھاگن پھر پہ پہر پہلا پہلی
 پہلے پیر پیچھے چاہئے چاہتے چاہیئے چاہے چلا چلو چلیں چلے چناچہ چند چونکہ
 چوگنی چکی چکیں چکے چہارشنبہ چیت ڈالنی ڈالنے ڈالے کئے کا کاتک کاش کب
 کبھی کدھر کر کرتا کرتی کرتے کرم کرنا کرنے کرو کریں کرے کس
 کسی کسے کل کم کن کنہیں کو کوئی کون کونسا کونسے کچھ کہ کہا
 کہاں کہہ کہی کہیں کہے کی کیا کیسا کیسے کیونکر کیونکہ کیوں کیے کے
 گئی گئے گا گرما گرمی گنا گو گویا گھنٹا گھنٹوں گھنٹے گی گیا
 ہائیں ہائے ہاڑ ہاں ہر ہرچند ہرگز ہزار ہفتہ ہم ہمارا ہماری ہمارے ہمی
 ہمیں ہو ہوئی ہوئیں ہوئے ہوا ہوبہو ہوتا ہوتی ہوتیں ہوتے ہونا ہونگے ہونی
 ہونے ہوں ہی ہیلو ہیں ہے یا یات یعنی یک یہ یہاں یہی یہیں
""".split())

#import section
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
import pandas as pd
import numpy as np
import io
import re, string, unicodedata
import nltk
#import contractions
#import inflect
import sys

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

# functions sections

def remove_str(string):
    """Delete every ASCII letter and digit from ``string``.

    Args:
        string (str): input text.  NOTE(review): the parameter name shadows
            the stdlib ``string`` module; kept unchanged for callers.
    Returns:
        str: ``string`` with all [a-zA-Z0-9] characters removed.
    """
    # Single character class instead of the original three-way alternation
    # ([a-z]|[A-Z]|[0-9]) — same matches, simpler pattern.
    pattern = re.compile(r"[A-Za-z0-9]")
    return pattern.sub('', string)
    
    
   

def remove_url(string):
    """Delete URLs (http(s)://, www., or bare domain/path forms) from ``string``.

    Args:
        string (str): input text.
    Returns:
        str: text with every matched URL removed (surrounding spaces remain).
    """
    url_pattern = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'
    return re.sub(url_pattern, '', string)

def remove_emoji(string):
    """Strip emoji and pictographic symbols from ``string``.

    Covers the common emoji blocks (emoticons, pictographs, transport,
    flags) plus a few individual code points.

    Args:
        string (str): input text.
    Returns:
        str: text with every matched emoji run removed.
    """
    pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001F4AA-\U0001F923"
        "\U0001F928-\U0001F92A"
        "\U0001F97A"
        "\U0001F92B"
        "]+",
        flags=re.UNICODE,
    )
    return pattern.sub('', string)

def remove_special(string):
    """Delete special punctuation characters from ``string``.

    Removes each of: ? . ! ; : # ؟ ‘ ’ ( ) @ - _ / | * " ، $

    NOTE(review): the original alternation contained a dead ``\\...``
    branch (shadowed by the earlier ``\\.`` alternative) and removed
    '|' and '*' only when adjacent as the two-char sequence "|*"; a
    character class removes each character individually, which is the
    evident intent.

    Args:
        string (str): input text.
    Returns:
        str: text with the listed characters removed.
    """
    return re.sub(r'[?.!;:#؟‘’()@\-_/|*"،$]', '', string)

# --- Data loading -----------------------------------------------------------
# Accumulator frames for the cleaned corpus built by the loops below.
d=[]
real_data22=pd.DataFrame()

d1=[]
real_data=pd.DataFrame()
real_data2=pd.DataFrame()
# NOTE(review): ``data`` is re-assigned four times — only the last read
# (datalastclean12.csv) survives.  All paths are non-raw Windows strings;
# none currently contain a real escape sequence (\n, \t, ...), but raw
# strings (r"...") would be safer.
data=pd.read_csv("D:\Final\datalast1.csv")#,encoding="iso 8859-1")
data=pd.read_csv("D:\Final\datasetlast.csv")#,encoding="iso 8859-1")
data=pd.read_csv("d:\Finaldata\Final\datalastfinal1.csv")#,encoding="iso 8859-1")

data=pd.read_csv("d:\Finaldata\Final\datalastclean12.csv")


data22=pd.read_csv("E:\latest\drytv\drytv.csv")

data34=pd.read_csv("D:\Finaldata\Final\datanew\chartrigram_charbigram.csv")

data35=pd.read_csv("F:\Finaldata\Final\datanew\chartrigram_charbigram_charunigram.csv")

data36=pd.read_csv("D:\Final\last\dagofwords.csv")

data37=pd.read_csv("D:\Final\last\dfidf202.csv")


#data36=pd.read_csv("D:\Finaldata\Final\last\dag_of_72features_wraper.csv")

#data37=pd.read_csv("D:\Finaldata\Final\last\dfidf_67features_wraper.csv")

# NOTE(review): ``data39`` is read twice (word2vec.csv is immediately
# overwritten by the fasttext file), and ``data42`` likewise below.
data38=pd.read_csv("d:\Final\last\word2vec71features.csv")
data39=pd.read_csv("d:\Final\last\word2vec.csv")

data39=pd.read_csv("F:\Finaldata\Final\last\dasttext_with30features.csv")

data40=pd.read_csv("d:\Final\last\combination\dag_of_word_with_tfidf_combined.csv")

data41=pd.read_csv("d:\Final\last\combination\word2vec_bagofwords.csv")

data42=pd.read_csv("F:\Finaldata\Final\last\combination\dasttext_bagofwords.csv")

data42=pd.read_csv("d:\Final\last\combination\word2vec_tfidf.csv")

data44=pd.read_csv("F:\Finaldata\Final\last\combination\dasttext_tfidf.csv")

data45=pd.read_csv("F:\Finaldata\Final\last\combination\word2vec_fasttext_combined.csv")

data43=pd.read_csv("d:\Final\last\combination\word2vec_bag_ofword_tfidf_combined.csv")

# Feature matrices: every column except the last; Y is the label column.
X = data.iloc[:, :-1].values
Y = data['label']

X1=data36.iloc[:, :-1].values
X2=data37.iloc[:, :-1].values
X3=data38.iloc[:, :-1].values

# NOTE(review): X4 duplicates X3 (both from data38) — confirm intent.
X4=data38.iloc[:, :-1].values


X5=data39.iloc[:, :-1].values

X6=data40.iloc[:, :-1].values

X7=data41.iloc[:, :-1].values

X8=data42.iloc[:, :-1].values

X9=data43.iloc[:, :-1].values


X10=data44.iloc[:, :-1].values

X11=data45.iloc[:, :-1].values

# NOTE(review): ``data46`` is never defined — this line raises NameError
# as written (presumably ``data45`` or a missing read_csv was intended).
X12=data46.iloc[:, :-1].values


#data22=pd.read_csv("F:\Finaldata\datanew\sample8.csv")
#print(data22['message'][0])

import tensorflow_hub as hub
import tensorflow as tf
# Downloads the ELMo v2 module from TF-Hub (network access at import time).
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

# Per-document accumulators shared by the cleaning loops below.
corpus=[]
para1=[]
para2=[]
para3=[]
wordfreq = {}
wordfreq1={}
filtered_token=[]
filtered_token1=[]

filtered_token2=[]
filtered_token3=[]
sentence=[]
sentence1=[]

sentence2=[]
sentence3=[]
normalize_corpus2=[]
normalize_corpus1=[]
normalize_corpus3=[]

words1=[]

print(len(data))

# Cleaning pass 1: tokenize each message, drop stop words and empty
# tokens, and accumulate the cleaned text alongside its label.
for i1 in range(0, len(data)):
    txt1 = data['message'][i1]

    para3.append(txt1)

    sentence_tokens4 = nltk.tokenize.wordpunct_tokenize(txt1)
    words1.append(sentence_tokens4)

    filtered_tokens2 = [token1 for token1 in sentence_tokens4
                        if token1 not in stop_words and token1 != '']

    sentence2 = ' '.join(filtered_tokens2)
    normalize_corpus1.append(sentence2)
    # BUG FIX: the label lookup used ``i`` (defined only by a later loop);
    # this loop's index is ``i1``.
    real_data = real_data.append({'Label': data['label'][i1], 'Message': sentence2},
                                 ignore_index=True)


# NOTE(review): this writes ``real_data2``, which is still empty here —
# ``real_data`` is the frame populated above; confirm which was intended.
real_data2.to_csv("E:\latest\pti\ptifinalclean1.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
    
words=[]

no_tokens=[]
txt=data['message'][0]
print(len(data))

# Junk tokens (separator runs, leet-spelled strings, placeholders) to drop
# in addition to stop words.  Set membership replaces the original long
# ``or`` chain; the contents are unchanged.
_JUNK_TOKENS = {'==========', '=', '==', '===', '+++', '>>>', '++=', '~~~',
                'NAME', '=====', 'íӀօѵҽ', '>>>>', 'Ñāāì', '×=×', '×=×=',
                '```', '|==', ' '}

# Cleaning pass 2: tokenize, drop stop words, empty tokens and junk
# tokens, then store each cleaned message with its label.
for i in range(0, len(data)):
    txt = data['message'][i]

    sentence_tokens = nltk.tokenize.wordpunct_tokenize(txt)
    words.append(sentence_tokens)
    filtered_tokens = [token for token in sentence_tokens
                       if token not in stop_words and token != '']
    token12 = []
    for token in filtered_tokens:
        if token in _JUNK_TOKENS:
            continue
        token12.append(token)
        no_tokens.append(token)

    sentence = ' '.join(token12)

    normalize_corpus1.append(sentence)
    real_data = real_data.append({'Label': data['label'][i], 'Message': sentence},
                                 ignore_index=True)

real_data.to_csv("d:\Finaldata\Final\datalastcleanlast.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")


# NOTE(review): ``wordpunct_tokenize`` expects a string, but
# ``normalize_corpus1`` is a list of sentences — this raises TypeError as
# written (probably ``' '.join(normalize_corpus1)`` was intended).  It
# also clobbers the token list built by the loop above.
no_tokens = nltk.tokenize.wordpunct_tokenize(normalize_corpus1)

from sklearn.model_selection import train_test_split
# 70/30 split of the cleaned messages against the original labels.
train_corpus,test_corpus,train_label_names,test_label_names = train_test_split(np.array(real_data['Message']),
np.array(data['label']),test_size=0.3, random_state=42)
train_corpus.shape, test_corpus.shape
#train_corpus1,test_corpus1,train_label_names1,test_label_names1 = train_test_split(np.array(real_data2['Message']),
#np.array(real_data22['Label']),test_size=0.0001, random_state=42)

from collections import Counter
# Per-label counts in each split.
trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

print("Shape of training data = ", train_corpus.shape)
data.sample(10)

# NOTE(review): ``data2`` is not defined anywhere in this file — NameError
# unless it is created elsewhere before this runs.
data2['message'] = data2['message'] + " ."
data2['message'].head()

# Dump the corpus in 6-message chunks as UTF-8 files for ELMo training.
# ``exist_ok=True`` replaces the racy exists()/makedirs() pair.
os.makedirs("d:/Finaldata/train1", exist_ok=True)

# NOTE(review): ``data2`` is not defined in this file — confirm it is
# created elsewhere before this loop runs.
for i in range(0, data2.shape[0], 6):
    text = u"\n".join(data2['message'][i:i+6].tolist())
    encoded_unicode = text.encode("utf8")
    # ``with`` guarantees the handle is closed even if the write fails.
    with open("d:/Finaldata/train1/" + str(i) + ".encoded_unicode", "wb") as fp:
        fp.write(encoded_unicode)

# NOTE(review): this re-loads ELMo (v3, replacing the v2 module loaded
# earlier) and ``tokenized_corpus`` is never defined in this file — the
# print below raises NameError as written.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)
print(len(tokenized_corpus))


# Build the ELMo vocabulary file: special tokens first, then every corpus
# token in descending frequency order.
texts = " ".join(data2['message'].tolist())
words = texts.split(" ")
print("Number of tokens in Training data = ", len(words))
dictionary = Counter(words)
print("Size of Vocab", len(dictionary))
sorted_vocab = ["<S>", "</S>", "<UNK>"]
sorted_vocab.extend([pair[0] for pair in dictionary.most_common()])

text = "\n".join(sorted_vocab)
encoded_unicode = text.encode("utf8")
# ``with`` guarantees the vocab file is closed even if the write fails.
with open("d:/Finaldata/vocab.txt", "wb") as fp:
    fp.write(encoded_unicode)

# Per-label train/test counts sorted by frequency (display only).
((pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
columns=['Target Label', 'Train Count', 'Test Count']).sort_values(by=['Train Count', 'Test Count'],
        ascending=False)))


                                                                   
# --- Repeated split/count blocks --------------------------------------------
# Each section below splits one feature matrix against the labels, then
# tabulates per-label train/test counts.  They differ only in the matrix
# and test_size; this duplication could be folded into one helper.
from sklearn.model_selection import train_test_split
# Split #1: feature matrix X1, 40/60 train/test.
train_corpus1,test_corpus1,train_label_names1,test_label_names1 = train_test_split(np.array(X1),
np.array(real_data['Label']),test_size=0.6, random_state=42)
print(train_corpus1.shape, test_corpus1.shape)
#train_corpus1,test_corpus1,train_label_names1,test_label_names1 = train_test_split(np.array(real_data22['Message']),
#np.array(real_data22['Label']),test_size=0.0001, random_state=42)

from collections import Counter
trd = dict(Counter(train_label_names1))
tsd = dict(Counter(test_label_names1))

((pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
columns=['Target Label', 'Train Count', 'Test Count']).sort_values(by=['Train Count', 'Test Count'],
        ascending=False)))


# Split #2: feature matrix X2, 50/50 train/test.
from sklearn.model_selection import train_test_split
train_corpus2,test_corpus2,train_label_names2,test_label_names2 = train_test_split(np.array(X2),
np.array(real_data['Label']),test_size=0.5, random_state=42)
train_corpus2.shape, test_corpus2.shape
#train_corpus1,test_corpus1,train_label_names1,test_label_names1 = train_test_split(np.array(real_data22['Message']),
#np.array(real_data22['Label']),test_size=0.0001, random_state=42)

from collections import Counter
trd = dict(Counter(train_label_names2))
tsd = dict(Counter(test_label_names2))

((pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
columns=['Target Label', 'Train Count', 'Test Count']).sort_values(by=['Train Count', 'Test Count'],
        ascending=False)))


# Split #3: feature matrix X3, 50/50 train/test.
from sklearn.model_selection import train_test_split
train_corpus3,test_corpus3,train_label_names3,test_label_names3 = train_test_split(np.array(X3),
np.array(real_data['Label']),test_size=0.5, random_state=42)
print(train_corpus3.shape, test_corpus3.shape)
#train_corpus1,test_corpus1,train_label_names1,test_label_names1 = train_test_split(np.array(real_data22['Message']),
#np.array(real_data22['Label']),test_size=0.0001, random_state=42)

from collections import Counter
trd = dict(Counter(train_label_names3))
tsd = dict(Counter(test_label_names3))

((pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
columns=['Target Label', 'Train Count', 'Test Count']).sort_values(by=['Train Count', 'Test Count'],
        ascending=False)))

# Split #4: NOTE(review): this reuses X3 (presumably X4 was intended) and
# the print below shows the *_3 shapes — looks like a copy/paste slip.
from sklearn.model_selection import train_test_split
train_corpus4,test_corpus4,train_label_names4,test_label_names4 = train_test_split(np.array(X3),
np.array(real_data['Label']),test_size=0.7, random_state=42)
print(train_corpus3.shape, test_corpus3.shape)
#train_corpus1,test_corpus1,train_label_names1,test_label_names1 = train_test_split(np.array(real_data22['Message']),
#np.array(real_data22['Label']),test_size=0.0001, random_state=42)

from collections import Counter
trd = dict(Counter(train_label_names4))
tsd = dict(Counter(test_label_names4))

((pd.DataFrame([[key, trd[key], tsd[key]] for key in trd],
columns=['Target Label', 'Train Count', 'Test Count']).sort_values(by=['Train Count', 'Test Count'],
        ascending=False)))
                                                                   
# bag of words model implementation

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing  # needed for MinMaxScaler below

cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features1 = cv.fit_transform(train_corpus1)
#cv_train_features=cv_train_features.toarray()

# BUG FIX: test data must be transformed with the vocabulary fitted on the
# training corpus; re-fitting on the test corpus produced an incompatible
# feature space.
cv_test_features1 = cv.transform(test_corpus1)
#cv_test_features=cv_test_features.toarray()

# Re-fit on the full cleaned corpus for the cross-validation experiments.
cv_matrix = cv.fit_transform(normalize_corpus1)
cv_matrix = cv_matrix.toarray()

# Scale counts into [0, 1].
min_max_scaler = preprocessing.MinMaxScaler()
X_train_minmax = min_max_scaler.fit_transform(cv_matrix)

# NOTE(review): ``get_feature_names`` was removed in scikit-learn 1.2;
# use ``get_feature_names_out()`` on newer versions.
vocab = cv.get_feature_names()
df1=pd.DataFrame(cv_matrix,columns=vocab)
df1.to_csv('d:/Finaldata/latest/final_bof.csv', sep=',', index=False,encoding="utf-32")
#df1.to_csv("F:\Finaldata\Final\data\gofword.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
# transform test articles into features
cv_test_features = cv.transform(test_corpus)

print('BOW model:> Train features shape:', cv_matrix.shape)
# BUG FIX: ``cv_train_features`` was never defined; the fitted train
# matrix is ``cv_train_features1``.
print('BOW model:> Train features shape:', cv_train_features1.shape)

print(' Test features shape:', cv_test_features.shape)

# Naïve Bayes Classifier
# BUG FIX: this whole section was indented one level with no enclosing
# block — a hard IndentationError — and has been dedented to top level.
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1)
# NOTE(review): ``cv_train_features`` is never defined (the BOW section
# creates ``cv_train_features1``) — confirm which matrix was intended.
mnb.fit(cv_train_features, train_label_names)
# Re-fit on the full corpus matrix; this overwrites the fit above.
mnb.fit(cv_matrix,real_data['Label'])

mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=10)
# 10-fold CV on the full corpus for each metric (overwrites the line above).
mnb_bow_cv_scores=cross_val_score(mnb,cv_matrix,real_data['Label'],cv=10,scoring='accuracy')
mnb_bow_cv_scores1=cross_val_score(mnb,cv_matrix,real_data['Label'],cv=10,scoring='precision_macro')
mnb_bow_cv_scores2=cross_val_score(mnb,cv_matrix,real_data['Label'],cv=10,scoring='recall_macro')
mnb_bow_cv_scores3=cross_val_score(mnb,cv_matrix,real_data['Label'],cv=10,scoring='f1_macro')
mnb_bow_cv_scores4=cross_val_score(mnb,cv_matrix,real_data['Label'],cv=10,scoring='roc_auc')

mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
mnb_bow_cv_mean_score1 = np.mean(mnb_bow_cv_scores1)
mnb_bow_cv_mean_score2 = np.mean(mnb_bow_cv_scores2)
mnb_bow_cv_mean_score3 = np.mean(mnb_bow_cv_scores3)
mnb_bow_cv_mean_score4 = np.mean(mnb_bow_cv_scores4)

print('CV Accuracy (10-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)

print('CV Precision (10-fold):', mnb_bow_cv_scores1)
print('Mean CV Precision:', mnb_bow_cv_mean_score1)

print('CV Recall (10-fold):', mnb_bow_cv_scores2)
print('Mean CV Recall:', mnb_bow_cv_mean_score2)

print('CV f1-score (10-fold):', mnb_bow_cv_scores3)
print('Mean f1-score:', mnb_bow_cv_mean_score3)

print('CV auc (10-fold):', mnb_bow_cv_scores4)
print('Mean auc:', mnb_bow_cv_mean_score4)

# Held-out accuracy on the transformed test features.
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)
sent_pred=mnb.predict(cv_test_features)

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(test_label_names,sent_pred)


# Manual metric computation from the 2x2 confusion matrix.
# Assumes sklearn's sorted label ordering: row/col 0 = negative class,
# row/col 1 = positive class.
tp=cm[1,1]
fp=cm[0,1]
tn=cm[0,0]
fn=cm[1,0]
print(tp,fp,tn,fn)
precision=tp/(tp+fp)
recall=tp/(tp+fn)
accuracy=(tp+tn)/(tp+tn+fp+fn)
f_measure=(2*recall*precision)/(recall+precision)
print(precision,recall,accuracy,f_measure)

mnb_pred=mnb.predict(cv_test_features)

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report

# ROC curve per class.
# NOTE(review): both loop iterations compute the identical curve from the
# same hard predictions; per-class curves normally use predict_proba
# scores, and hard 0/1 labels yield only a 3-point ROC.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    
    fpr[i], tpr[i], _ = roc_curve(test_label_names,mnb_pred)
    roc_auc[i] = auc(fpr[i], tpr[i])

#print(roc_auc_score(test_label_names,mnb_pred))
plt.figure()
plt.plot(fpr[1], tpr[1])
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()

naive_bag_roc=roc_curve(test_label_names,mnb_pred)
print(naive_bag_roc)
df4=pd.DataFrame(naive_bag_roc)
target_names=['Not offensive','offensive']
print(classification_report(test_label_names,mnb_pred, target_names=target_names))

# Persist the per-class report as CSV.
report = classification_report(test_label_names, mnb_pred, target_names=target_names,output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\gofwordnaivereport.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
# Removed: a large block of commented-out matplotlib helpers (heatmap
# rendering of a classification report, adapted from
# https://stackoverflow.com/a/31689645/395857) that was held in a
# discarded triple-quoted string literal — a no-op at runtime.  Restore
# from version control if it is ever needed again.
#adaboost implementation
from sklearn.ensemble import AdaBoostClassifier

# Import Support Vector Classifier
from sklearn.svm import SVC
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# NOTE(review): svc is created but never passed to AdaBoost below, so the
# default base estimator is used; kept for parity with the original script.
svc = SVC(probability=True, kernel='linear')

# AdaBoost classifier fitted on the bag-of-words matrix cv_matrix / labels Y
# (both defined earlier in this script).
abc = AdaBoostClassifier(n_estimators=100, random_state=0)
abc.fit(cv_matrix, Y)

# Per-metric 10-fold cross-validation.
# Fixed: these statements were spuriously indented in the original, which made
# the module raise IndentationError before anything ran.
abc_bow_cv_scores = cross_val_score(abc, cv_matrix, Y, cv=10, scoring='accuracy')
abc_bow_cv_scores1 = cross_val_score(abc, cv_matrix, Y, cv=10, scoring='precision_macro')
abc_bow_cv_scores2 = cross_val_score(abc, cv_matrix, Y, cv=10, scoring='recall_macro')
abc_bow_cv_scores3 = cross_val_score(abc, cv_matrix, Y, cv=10, scoring='f1_macro')
abc_bow_cv_scores4 = cross_val_score(abc, cv_matrix, Y, cv=10, scoring='roc_auc')

abc_bow_cv_mean_score = np.mean(abc_bow_cv_scores)
abc_bow_cv_mean_score1 = np.mean(abc_bow_cv_scores1)
abc_bow_cv_mean_score2 = np.mean(abc_bow_cv_scores2)
abc_bow_cv_mean_score3 = np.mean(abc_bow_cv_scores3)
abc_bow_cv_mean_score4 = np.mean(abc_bow_cv_scores4)

print('CV Accuracy (10-fold):', abc_bow_cv_scores)
print('Mean CV Accuracy:', abc_bow_cv_mean_score)

print('CV Precision (10-fold):', abc_bow_cv_scores1)
print('Mean CV Precision:', abc_bow_cv_mean_score1)

print('CV Recall (10-fold):', abc_bow_cv_scores2)
print('Mean CV Recall:', abc_bow_cv_mean_score2)

print('CV f1-score (10-fold):', abc_bow_cv_scores3)
print('Mean f1-score:', abc_bow_cv_mean_score3)

print('CV auc (10-fold):', abc_bow_cv_scores4)
print('Mean auc:', abc_bow_cv_mean_score4)

# Logistic Regression
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from statistics import mean, stdev
from sklearn import preprocessing

lr = LogisticRegression(penalty='l2', max_iter=2000, C=1, random_state=42, solver='lbfgs')

# Fit + plain 10-fold CV accuracy on the bag-of-words features.
#mnb.fit(cv_matrix,real_data['Label'])
lr.fit(cv_matrix, Y)
lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (10-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

# Stratified 10-fold on the X4 feature set with min-max scaling.
# Fixed: this region was spuriously indented (IndentationError) and the
# summary prints used yet another indent level.
lr.fit(X4, real_data['Label'])
#lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
x = X4
y = real_data['Label']
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(X4, real_data['Label']):
    x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    lr.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(lr.score(x_test_fold, y_test_fold))

print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

# Per-metric 10-fold CV on the bag-of-words features.
#skf = StratifiedKFold(lr,X,real_data['Label'],cv=10, shuffle=True, random_state=1)
lr_bow_cv_scores = cross_val_score(lr, cv_matrix, Y, cv=10, scoring='accuracy')
lr_bow_cv_scores1 = cross_val_score(lr, cv_matrix, Y, cv=10, scoring='precision_macro')
lr_bow_cv_scores2 = cross_val_score(lr, cv_matrix, Y, cv=10, scoring='recall_macro')
lr_bow_cv_scores3 = cross_val_score(lr, cv_matrix, Y, cv=10, scoring='f1_macro')
lr_bow_cv_scores4 = cross_val_score(lr, cv_matrix, Y, cv=10, scoring='roc_auc')

lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
lr_bow_cv_mean_score1 = np.mean(lr_bow_cv_scores1)
lr_bow_cv_mean_score2 = np.mean(lr_bow_cv_scores2)
lr_bow_cv_mean_score3 = np.mean(lr_bow_cv_scores3)
lr_bow_cv_mean_score4 = np.mean(lr_bow_cv_scores4)

print('CV Accuracy (10-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)

print('CV Precision (10-fold):', lr_bow_cv_scores1)
print('Mean CV Precision:', lr_bow_cv_mean_score1)

print('CV Recall (10-fold):', lr_bow_cv_scores2)
print('Mean CV Recall:', lr_bow_cv_mean_score2)

print('CV f1-score (10-fold):', lr_bow_cv_scores3)
print('Mean f1-score:', lr_bow_cv_mean_score3)

print('CV auc (10-fold):', lr_bow_cv_scores4)
print('Mean auc:', lr_bow_cv_mean_score4)

lr_pred = lr.predict(cv_test_features)

from sklearn.metrics import classification_report

target_names = ['Not offensive', 'offensive']
print(classification_report(test_label_names, lr_pred, target_names=target_names))

report = classification_report(test_label_names, lr_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\Final\data\gofwordlogisticeport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

from sklearn.metrics import confusion_matrix
# Fixed: the original assigned the confusion matrix to `lr`, clobbering the
# fitted LogisticRegression object that later sections still call .fit() on.
lr_cm = confusion_matrix(test_label_names, lr_pred)

tp = lr_cm[1, 1]
fp = lr_cm[0, 1]
tn = lr_cm[0, 0]
fn = lr_cm[1, 0]
print(tp, fp, tn, fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+tn+fp+fn)
f_measure = (2*recall*precision)/(recall+precision)
print(precision, recall, accuracy, f_measure)

# Support Vector Machines
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(cv_matrix, Y)
svm_bow_cv_scores = cross_val_score(svm, cv_matrix, Y, cv=10)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (10-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

# Stratified 10-fold for the SVM.
# Fixed: the region mixed 4- and 5-space indentation (IndentationError).
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_accu_stratified = []

for train_index, test_index in skf.split(X, real_data['Label']):
    x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]
    svm.fit(x_train_fold, y_train_fold)
    # Fixed: the original appended lr.score(...) here, so the "SVM" fold
    # accuracies were actually logistic-regression accuracies.
    lst_accu_stratified.append(svm.score(x_test_fold, y_test_fold))

print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

# Per-metric 10-fold CV (fixed: mis-indented region).
svm.fit(X2, real_data['Label'])
#lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=10, scoring='accuracy')
svm_bow_cv_scores1 = cross_val_score(svm, cv_train_features, train_label_names, cv=10, scoring='precision_macro')
svm_bow_cv_scores2 = cross_val_score(svm, cv_train_features, train_label_names, cv=10, scoring='recall_macro')
svm_bow_cv_scores3 = cross_val_score(svm, cv_train_features, train_label_names, cv=10, scoring='f1_macro')
svm_bow_cv_scores4 = cross_val_score(svm, cv_train_features, train_label_names, cv=10, scoring='roc_auc')

svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
svm_bow_cv_mean_score1 = np.mean(svm_bow_cv_scores1)
svm_bow_cv_mean_score2 = np.mean(svm_bow_cv_scores2)
svm_bow_cv_mean_score3 = np.mean(svm_bow_cv_scores3)
svm_bow_cv_mean_score4 = np.mean(svm_bow_cv_scores4)

print('CV Accuracy (10-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)

print('CV Precision (10-fold):', svm_bow_cv_scores1)
print('Mean CV Precision:', svm_bow_cv_mean_score1)

print('CV Recall (10-fold):', svm_bow_cv_scores2)
print('Mean CV Recall:', svm_bow_cv_mean_score2)

print('CV f1-score (10-fold):', svm_bow_cv_scores3)
print('Mean f1-score:', svm_bow_cv_mean_score3)

print('CV auc (10-fold):', svm_bow_cv_scores4)
print('Mean auc:', svm_bow_cv_mean_score4)

svm_pred = svm.predict(cv_test_features)

from sklearn.metrics import confusion_matrix
sm = confusion_matrix(test_label_names, svm_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+tn+fp+fn)
f_measure = (2*recall*precision)/(recall+precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, svm_pred, target_names=target_names))

report = classification_report(test_label_names, svm_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\gofwordsvmeport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Multilayer perceptron
# Fixed: the original used MLPRegressor with scoring='accuracy', which raises
# because accuracy is undefined for continuous regression output; the labels Y
# are class labels (every other model here is a classifier), so a classifier
# is what was intended.
from sklearn.neural_network import MLPClassifier
regr = MLPClassifier(random_state=1, max_iter=500)
regr.fit(cv_matrix, Y)
regr_bow_cv_scores = cross_val_score(regr, cv_matrix, Y, cv=10, scoring='accuracy')
regr_bow_cv_mean_score = np.mean(regr_bow_cv_scores)

print('CV Accuracy (10-fold):', regr_bow_cv_scores)
print('Mean CV Accuracy:', regr_bow_cv_mean_score)

# SVM with Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=50, random_state=42)
svm_sgd.fit(cv_matrix, Y)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_matrix, Y, cv=10)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (10-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

# Per-metric 10-fold CV.
# Fixed: this region was spuriously indented (IndentationError).
svm_sgd.fit(X2, real_data['Label'])
#lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
svmgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=10, scoring='accuracy')
svmgd_bow_cv_scores1 = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=10, scoring='precision_macro')
svmgd_bow_cv_scores2 = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=10, scoring='recall_macro')
svmgd_bow_cv_scores3 = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=10, scoring='f1_macro')
svmgd_bow_cv_scores4 = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=10, scoring='roc_auc')

svmgd_bow_cv_mean_score = np.mean(svmgd_bow_cv_scores)
svmgd_bow_cv_mean_score1 = np.mean(svmgd_bow_cv_scores1)
svmgd_bow_cv_mean_score2 = np.mean(svmgd_bow_cv_scores2)
svmgd_bow_cv_mean_score3 = np.mean(svmgd_bow_cv_scores3)
svmgd_bow_cv_mean_score4 = np.mean(svmgd_bow_cv_scores4)

print('CV Accuracy (10-fold):', svmgd_bow_cv_scores)
print('Mean CV Accuracy:', svmgd_bow_cv_mean_score)

print('CV Precision (10-fold):', svmgd_bow_cv_scores1)
print('Mean CV Precision:', svmgd_bow_cv_mean_score1)

print('CV Recall (10-fold):', svmgd_bow_cv_scores2)
print('Mean CV Recall:', svmgd_bow_cv_mean_score2)

print('CV f1-score (10-fold):', svmgd_bow_cv_scores3)
print('Mean f1-score:', svmgd_bow_cv_mean_score3)

print('CV auc (10-fold):', svmgd_bow_cv_scores4)
print('Mean auc:', svmgd_bow_cv_mean_score4)

svm_sgd_pred = svm_sgd.predict(cv_test_features)

print(classification_report(test_label_names, svm_sgd_pred, target_names=target_names))

report = classification_report(test_label_names, svm_sgd_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\gofwordstochosticport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")


from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(random_state=1, max_iter=500)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
svm = LinearSVC(penalty='l2', C=1, random_state=42)
model1 = LogisticRegression()
model2 = LinearSVC(penalty='l2', C=1, random_state=42)
model3 = SGDClassifier()
model4 = RandomForestClassifier()
# NOTE(review): this hard-voting ensemble is immediately overwritten by the
# single-estimator soft-voting one below — kept as in the original; confirm
# which ensemble is actually intended.
model = VotingClassifier(estimators=[('lr', model1), ('svm', model2), ('svm_sgd', model3)], voting='hard')

model = VotingClassifier(estimators=[('lr', model1)], voting='soft')

model.fit(X9, Y)

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=50, random_state=42)
from sklearn.ensemble import VotingClassifier

# create the sub models
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
#model2 = DecisionTreeClassifier()
model2 = SVC()
estimators.append(('svm', model2))
model3 = SGDClassifier()
estimators.append(('svm_sgd', model3))

seed = 7
kfold = model_selection.KFold(n_splits=10)

model = VotingClassifier(estimators)
# Fixed: the original evaluated an undefined name `ensemble`; the voting
# classifier built above is `model`.
results = model_selection.cross_val_score(model, cv_matrix, Y, cv=kfold)
print(results.mean())

def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Repeatedly fits an OLS model for each remaining candidate column and adds
    the one whose coefficient has the smallest p-value, stopping as soon as no
    remaining candidate is significant at `significance_level`.

    Returns the list of selected column names, in selection order.
    """
    candidates = data.columns.tolist()
    selected = []
    while candidates:
        pool = list(set(candidates) - set(selected))
        pvals = pd.Series(index=pool)
        for col in pool:
            fitted = sm.OLS(target, sm.add_constant(data[selected + [col]])).fit()
            pvals[col] = fitted.pvalues[col]
        # Stop when even the best remaining candidate is not significant
        # (an empty pool yields NaN, which also fails the test and breaks).
        if pvals.min() < significance_level:
            selected.append(pvals.idxmin())
        else:
            break
    return selected

forward_selection(cv_matrix, Y)

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score

from mlxtend.feature_selection import SequentialFeatureSelector

# Backward elimination (forward=False) down to 72 features; cv=0 means the
# selector scores on the training data without cross-validation.
feature_selector = SequentialFeatureSelector(model,
           k_features=72,
           forward=False,
           verbose=2,
           scoring='accuracy',
           cv=0)
features = feature_selector.fit(avg_wv_train_features1, Y)

# Fixed: the original line ended with a stray ']' (SyntaxError).
filtered_features = list(features.k_feature_idx_)

filtered_features = features.k_feature_idx_
filtered_features12 = features.k_feature_idx_

# Column indices to drop, presumably chosen after inspecting the selector
# output — TODO confirm against the selection run.
list34 = [0,1,4,9,11,13,15,16,19,20,21,24,25,31,36,41,60,61,65,73,75,82,83,86,87,88,93,94]

f = np.delete(avg_wv_train_features1, list34, axis=1)

# Keep only the selected columns.
# Fixed: the original wrapped the index tuple in an extra bracket pair (adding
# a spurious axis) and a second attempt used the invalid syntax
# `avg_wv_train_features1.[:, ...]`; an unfinished `filter_features5=`
# assignment inside a debug loop has been removed as well.
filter_features4 = avg_wv_train_features1[:, list(features.k_feature_idx_)]

# NOTE(review): .columns only exists if avg_wv_train_features1 is a DataFrame;
# the np.delete call above treats it as an ndarray — confirm its actual type.
filtered_features1 = avg_wv_train_features1.columns[list(features.k_feature_idx_)]
#avg_wv_train_features1
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Sequential Forward Selection (sfs).
# NOTE(review): sfs is configured but never fitted — the ensemble `model` is
# fitted directly below, as in the original script.
sfs = SFS(model,
          k_features=72,
          forward=True,
          floating=False,
          scoring='r2',
          cv=0)
model.fit(avg_wv_train_features1, Y)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

scores_acc = cross_val_score(model, avg_wv_train_features1, Y, scoring='accuracy', cv=kfold, n_jobs=1, error_score='raise')
scores_acc_mean = np.mean(scores_acc)
print('CV Accuracy (10-fold):', scores_acc)
print('Mean CV Accuracy:', scores_acc_mean)

scores_acc1 = cross_val_score(model, X9, Y, scoring='precision_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean1 = np.mean(scores_acc1)
print('CV Precision (10-fold):', scores_acc1)
print('Mean CV Precision:', scores_acc_mean1)

scores_acc2 = cross_val_score(model, X9, Y, scoring='recall_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean2 = np.mean(scores_acc2)
print('CV Recall (10-fold):', scores_acc2)
print('Mean CV Recall:', scores_acc_mean2)

scores_acc3 = cross_val_score(model, X9, Y, scoring='f1_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean3 = np.mean(scores_acc3)
print('CV F-score (10-fold):', scores_acc3)
print('Mean CV F-score:', scores_acc_mean3)

scores_acc4 = cross_val_score(model, X9, Y, scoring='roc_auc', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean4 = np.mean(scores_acc4)
print('CV Roc value (10-fold):', scores_acc4)
print('Mean CV Roc:', scores_acc_mean4)

# NOTE(review): exact duplicate of the forward_selection defined earlier in
# this file; kept so later references resolve identically.
def forward_selection(data, target, significance_level=0.05):
    initial_features = data.columns.tolist()
    best_features = []
    while (len(initial_features)>0):
        remaining_features = list(set(initial_features)-set(best_features))
        new_pval = pd.Series(index=remaining_features)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features+[new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if(min_p_value<significance_level):
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

rfc_w2c_test_score = model.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_w2c_test_score)

# Per-metric CV for the ensemble on the word2vec features.
# Fixed: this region was mis-indented and cross_val_score was called without a
# feature matrix (only the labels were passed). The w2c naming suggests the
# averaged word2vec features — TODO confirm avg_wv_train_features1 is the
# intended X.
rfc_w2c_cv_scores = cross_val_score(model, avg_wv_train_features1, real_data['Label'], cv=10, scoring='accuracy')
rfc_w2c_cv_scores1 = cross_val_score(model, avg_wv_train_features1, real_data['Label'], cv=10, scoring='precision_macro')
rfc_w2c_cv_scores2 = cross_val_score(model, avg_wv_train_features1, real_data['Label'], cv=10, scoring='recall_macro')
rfc_w2c_cv_scores3 = cross_val_score(model, avg_wv_train_features1, real_data['Label'], cv=10, scoring='f1_macro')
rfc_w2c_cv_scores4 = cross_val_score(model, avg_wv_train_features1, real_data['Label'], cv=10, scoring='roc_auc')

rfc_w2c_cv_mean_score = np.mean(rfc_w2c_cv_scores)
rfc_w2c_cv_mean_score1 = np.mean(rfc_w2c_cv_scores1)
rfc_w2c_cv_mean_score2 = np.mean(rfc_w2c_cv_scores2)
rfc_w2c_cv_mean_score3 = np.mean(rfc_w2c_cv_scores3)
rfc_w2c_cv_mean_score4 = np.mean(rfc_w2c_cv_scores4)

print('CV Accuracy (10-fold):', rfc_w2c_cv_scores)
print('Mean CV Accuracy:', rfc_w2c_cv_mean_score)

print('CV Precision (10-fold):', rfc_w2c_cv_scores1)
print('Mean CV Precision:', rfc_w2c_cv_mean_score1)

print('CV Recall (10-fold):', rfc_w2c_cv_scores2)
print('Mean CV Recall:', rfc_w2c_cv_mean_score2)

print('CV f1-score (10-fold):', rfc_w2c_cv_scores3)
print('Mean f1-score:', rfc_w2c_cv_mean_score3)

print('CV auc (10-fold):', rfc_w2c_cv_scores4)
print('Mean auc:', rfc_w2c_cv_mean_score4)


def Stacking(model, train, y, test, n_fold):
    """Produce out-of-fold stacking predictions.

    For each stratified fold, fit `model` on the training part, predict the
    validation part (collected into train_pred) and the full test set
    (collected into test_pred).

    Returns (test_pred as an (n_fold * len(test), 1) column, train_pred).

    Fix: the original pre-allocated test_pred with
    np.empty((test.shape[0], 1)) and then appended to it, so uninitialised
    garbage values were returned as the first test.shape[0] rows; both
    accumulators now start empty.
    """
    folds = StratifiedKFold(n_splits=n_fold, random_state=None)
    test_pred = np.empty((0, 1), float)
    train_pred = np.empty((0, 1), float)
    for train_indices, val_indices in folds.split(train, y.values):
        x_train, x_val = train.iloc[train_indices], train.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]

        model.fit(X=x_train, y=y_train)
        # np.append without axis flattens, so both accumulators grow as 1-D.
        train_pred = np.append(train_pred, model.predict(x_val))
        test_pred = np.append(test_pred, model.predict(test))
    return test_pred.reshape(-1, 1), train_pred

model1 = LogisticRegression()
model2 = LinearSVC()
model3 = SGDClassifier()

#model1 = tree.DecisionTreeClassifier(random_state=1)

# Out-of-fold stacking predictions for the logistic model.
test_pred1, train_pred1 = Stacking(model=model1, n_fold=10, train=cv_train_features, test=cv_test_features, y=train_label_names)

train_pred1 = pd.DataFrame(train_pred1)
test_pred1 = pd.DataFrame(test_pred1)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=10)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (10-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

# Per-metric 10-fold CV (fixed: this region was spuriously indented).
rfc.fit(X2, real_data['Label'])
#lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=10, scoring='accuracy')
rfc_bow_cv_scores1 = cross_val_score(rfc, cv_train_features, train_label_names, cv=10, scoring='precision_macro')
rfc_bow_cv_scores2 = cross_val_score(rfc, cv_train_features, train_label_names, cv=10, scoring='recall_macro')
rfc_bow_cv_scores3 = cross_val_score(rfc, cv_train_features, train_label_names, cv=10, scoring='f1_macro')
rfc_bow_cv_scores4 = cross_val_score(rfc, cv_train_features, train_label_names, cv=10, scoring='roc_auc')

rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
rfc_bow_cv_mean_score1 = np.mean(rfc_bow_cv_scores1)
rfc_bow_cv_mean_score2 = np.mean(rfc_bow_cv_scores2)
rfc_bow_cv_mean_score3 = np.mean(rfc_bow_cv_scores3)
rfc_bow_cv_mean_score4 = np.mean(rfc_bow_cv_scores4)

print('CV Accuracy (10-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)

print('CV Precision (10-fold):', rfc_bow_cv_scores1)
print('Mean CV Precision:', rfc_bow_cv_mean_score1)

print('CV Recall (10-fold):', rfc_bow_cv_scores2)
print('Mean CV Recall:', rfc_bow_cv_mean_score2)

print('CV f1-score (10-fold):', rfc_bow_cv_scores3)
print('Mean f1-score:', rfc_bow_cv_mean_score3)

print('CV auc (10-fold):', rfc_bow_cv_scores4)
print('Mean auc:', rfc_bow_cv_mean_score4)

rfc_pred = rfc.predict(cv_test_features)

from sklearn.metrics import confusion_matrix
# Fixed: the original passed the model object `rfc` to confusion_matrix
# instead of its predictions.
sm = confusion_matrix(test_label_names, rfc_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/(tp+tn+fp+fn)
f_measure = (2*recall*precision)/(recall+precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, rfc_pred, target_names=target_names))

report = classification_report(test_label_names, rfc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\gofwordrandomforestport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")


# Gradient Boosting Machines
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=10)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (10-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

gbc_pred = gbc.predict(cv_test_features)

print(classification_report(test_label_names, gbc_pred, target_names=target_names))

# Fixed: the saved report was built from svm_sgd_pred (copy-paste), so the
# gradient-boosting CSV actually contained the SGD results.
report = classification_report(test_label_names, gbc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\gofwordgradientport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Bagging ensemble on the bag-of-words features.
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(n_estimators=10, random_state=42)
bagging.fit(cv_train_features, train_label_names)

# 10-fold CV accuracy on the training features.
bagging_bow_cv_scores = cross_val_score(bagging, cv_train_features, train_label_names, cv=10)
bagging_bow_cv_mean_score = np.mean(bagging_bow_cv_scores)
print('CV Accuracy (10-fold):', bagging_bow_cv_scores)
print('Mean CV Accuracy:', bagging_bow_cv_mean_score)

# Held-out accuracy and full per-class report.
bagging_bow_test_score = bagging.score(cv_test_features, test_label_names)
print('Test Accuracy:', bagging_bow_test_score)

bagging_pred = bagging.predict(cv_test_features)
print(classification_report(test_label_names, bagging_pred, target_names=target_names))


#Tf IDF implementation

from sklearn.feature_extraction.text import TfidfVectorizer
# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

# NOTE(review): tv is re-fitted here on normalize_corpus1, so its vocabulary
# no longer matches tv_train_features computed above; the transform of
# test_corpus below therefore uses the second vocabulary — confirm which
# corpus is intended before relying on tv_train_features/tv_test_features
# together.
tv_matrix = tv.fit_transform(normalize_corpus1)
tv_matrix = tv_matrix.toarray()
vocab = tv.get_feature_names()
df8=pd.DataFrame(np.round(tv_matrix,2), columns=vocab)

# Dump the rounded TF-IDF table as plain text (appended on every run).
tfile = open('test.txt', 'a')
tfile.write(df8.to_string())
tfile.close()

# Persist the same table as CSV/TXT in several locations/formats.
df8.to_csv("F:\Finaldata\Final\data1\last_IDF3.csv", na_rep="NAN!",encoding="utf-32")

df8.to_csv('F:/Finaldata/Final/data1/final_idf12.txt', sep=',', index=False,encoding="utf-32")
#df1=pd.DataFrame(cv_matrix,columns=vocab)
df8.to_csv("F:\Finaldata\Final\data\last_IDF3.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
tv_test_features = tv.transform(test_corpus)
print('TFIDF model:> Train features shape:', tv_matrix.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape)
print(' Test features shape:', tv_test_features.shape)


# Naïve Bayes on the TF-IDF features.
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)

# 10-fold CV accuracy plus held-out accuracy.
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=10)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (10-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)

mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

mnb_pred = mnb.predict(tv_test_features)

# Manual metrics from the 2x2 confusion matrix
# (layout is [[tn, fp], [fn, tp]], so ravel() yields tn, fp, fn, tp).
from sklearn.metrics import confusion_matrix
sm = confusion_matrix(test_label_names, mnb_pred)
tn, fp, fn, tp = sm.ravel()
print(tp, fp, tn, fn)

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

# Full per-class report, printed and saved to CSV.
print(classification_report(test_label_names, mnb_pred, target_names=target_names))

report = classification_report(test_label_names, mnb_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\datanaivefeport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

#bagging on the TF-IDF features
from sklearn.ensemble import BaggingClassifier
bagging = BaggingClassifier(n_estimators=10, random_state=42)
bagging.fit(tv_train_features, train_label_names)
bagging_bow_cv_scores = cross_val_score(bagging, tv_train_features, train_label_names, cv=10)
bagging_bow_cv_mean_score = np.mean(bagging_bow_cv_scores)
print('CV Accuracy (10-fold):', bagging_bow_cv_scores)
print('Mean CV Accuracy:', bagging_bow_cv_mean_score)
# Fixed: the model is trained on TF-IDF features, so it must be scored on the
# TF-IDF test features — the original passed the count-vectorizer test
# features (cv_test_features), a different feature space.
bagging_bow_test_score = bagging.score(tv_test_features, test_label_names)
print('Test Accuracy:', bagging_bow_test_score)
bagging_pred = bagging.predict(tv_test_features)

print(classification_report(test_label_names, bagging_pred, target_names=target_names))

# Logistic Regression
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42,solver='lbfgs')
lr.fit(tv_matrix, Y)
lr_tfidf_cv_scores = cross_val_score(lr, tv_matrix, Y, cv=10)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (10-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

lr_pred=lr.predict(tv_test_features)

    lr.fit(tv_matrix,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    lr_tfidf_cv_scores=cross_val_score(lr,tv_matrix, Y,cv=10,scoring='accuracy')
    lr_tfidf_cv_scores1=cross_val_score(lr,tv_matrix, Y,cv=10,scoring='precision_macro')
    lr_tfidf_cv_scores2=cross_val_score(lr,tv_matrix, Y,cv=10,scoring='recall_macro')
    lr_tfidf_cv_scores3=cross_val_score(lr,tv_matrix, Y,cv=10,scoring='f1_macro')
    lr_tfidf_cv_scores4=cross_val_score(lr,tv_matrix, Y,cv=10,scoring='roc_auc')
    
    
    
    
    lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
    lr_tfidf_cv_mean_score1 = np.mean(lr_tfidf_cv_scores1)
    lr_tfidf_cv_mean_score2 = np.mean(lr_tfidf_cv_scores2)
    lr_tfidf_cv_mean_score3 = np.mean(lr_tfidf_cv_scores3)
    lr_tfidf_cv_mean_score4 = np.mean(lr_tfidf_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', lr_tfidf_cv_scores)
    print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
    
    print('CV Precision (10-fold):', lr_tfidf_cv_scores1)
    print('Mean CV Precision:', lr_tfidf_cv_mean_score1)
    
    print('CV Recall (10-fold):', lr_tfidf_cv_scores2)
    print('Mean CV Recall:', lr_tfidf_cv_mean_score2)
    
    print('CV f1-score (10-fold):', lr_tfidf_cv_scores3)
    print('Mean f1-score:', lr_tfidf_cv_mean_score3)
    
    print('CV auc (10-fold):', lr_tfidf_cv_scores4)
    print('Mean auc:', lr_tfidf_cv_mean_score4)


from sklearn.metrics import confusion_matrix

# Confusion matrix for the logistic-regression TF-IDF predictions.
# Indexing below assumes binary labels: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, lr_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, lr_pred, target_names=target_names))

report = classification_report(test_label_names, lr_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string -- \d etc. are invalid escape sequences in a plain literal;
# the resulting path value is unchanged.
df2.to_csv(r"F:\Finaldata\mydataset\data\datatflogisticport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Linear SVM on the TF-IDF features: fit, 10-fold CV, held-out accuracy.
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_matrix, Y)

svm_tfidf_cv_scores = cross_val_score(svm, tv_matrix, Y, cv=10)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (10-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)

svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

    svm.fit(X12,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    svm_tfidf_cv_scores=cross_val_score(svm,tv_train_features, train_label_names,cv=10,scoring='accuracy')
    svm_tfidf_cv_scores1=cross_val_score(svm,tv_train_features, train_label_names,cv=10,scoring='precision_macro')
    svm_tfidf_cv_scores2=cross_val_score(svm,tv_train_features, train_label_names,cv=10,scoring='recall_macro')
    svm_tfidf_cv_scores3=cross_val_score(svm,tv_train_features, train_label_names,cv=10,scoring='f1_macro')
    svm_tfidf_cv_scores4=cross_val_score(svm,tv_train_features, train_label_names,cv=10,scoring='roc_auc')
    
    
    
    
    svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
    svm_tfidf_cv_mean_score1 = np.mean(svm_tfidf_cv_scores1)
    svm_tfidf_cv_mean_score2 = np.mean(svm_tfidf_cv_scores2)
    svm_tfidf_cv_mean_score3 = np.mean(svm_tfidf_cv_scores3)
    svm_tfidf_cv_mean_score4 = np.mean(svm_tfidf_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', svm_tfidf_cv_scores)
    print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
    
    print('CV Precision (10-fold):', svm_tfidf_cv_scores1)
    print('Mean CV Precision:', svm_tfidf_cv_mean_score1)
    
    print('CV Recall (10-fold):', svm_tfidf_cv_scores2)
    print('Mean CV Recall:', svm_tfidf_cv_mean_score2)
    
    print('CV f1-score (10-fold):', svm_tfidf_cv_scores3)
    print('Mean f1-score:', svm_tfidf_cv_mean_score3)
    
    print('CV auc (10-fold):', svm_tfidf_cv_scores4)
    print('Mean auc:', svm_tfidf_cv_mean_score4)

# Held-out predictions for the TF-IDF linear SVM.
svm_pred = svm.predict(tv_test_features)

from sklearn.metrics import confusion_matrix
# Binary confusion matrix: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, svm_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, svm_pred, target_names=target_names))

report = classification_report(test_label_names, svm_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal (invalid
# escapes like \d otherwise raise a SyntaxWarning); path value unchanged.
df2.to_csv(r"F:\Finaldata\mydataset\data\datatsvmreport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
# Linear SVM trained with Stochastic Gradient Descent on TF-IDF features.
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=100, random_state=42)
svm_sgd.fit(tv_matrix, Y)

svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_matrix, Y, cv=10)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (10-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)

svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

    svm_sgd.fit(X3,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    svmgd_tfidf_cv_scores=cross_val_score(svm_sgd,tv_train_features, train_label_names,cv=10,scoring='accuracy')
    svmgd_tfidf_cv_scores1=cross_val_score(svm_sgd,tv_train_features, train_label_names,cv=10,scoring='precision_macro')
    svmgd_tfidf_cv_scores2=cross_val_score(svm_sgd,tv_train_features, train_label_names,cv=10,scoring='recall_macro')
    svmgd_tfidf_cv_scores3=cross_val_score(svm_sgd,tv_train_features, train_label_names,cv=10,scoring='f1_macro')
    svmgd_tfidf_cv_scores4=cross_val_score(svm_sgd,tv_train_features, train_label_names,cv=10,scoring='roc_auc')
    
    
    
    
    svmgd_tfidf_cv_mean_score = np.mean(svmgd_tfidf_cv_scores)
    svmgd_tfidf_cv_mean_score1 = np.mean(svmgd_tfidf_cv_scores1)
    svmgd_tfidf_cv_mean_score2 = np.mean(svmgd_tfidf_cv_scores2)
    svmgd_tfidf_cv_mean_score3 = np.mean(svmgd_tfidf_cv_scores3)
    svmgd_tfidf_cv_mean_score4 = np.mean(svmgd_tfidf_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', svmgd_tfidf_cv_scores)
    print('Mean CV Accuracy:', svmgd_tfidf_cv_mean_score)
    
    print('CV Precision (10-fold):', svmgd_tfidf_cv_scores1)
    print('Mean CV Precision:', svmgd_tfidf_cv_mean_score1)
    
    print('CV Recall (10-fold):', svmgd_tfidf_cv_scores2)
    print('Mean CV Recall:', svmgd_tfidf_cv_mean_score2)
    
    print('CV f1-score (10-fold):', svmgd_tfidf_cv_scores3)
    print('Mean f1-score:', svmgd_tfidf_cv_mean_score3)
    
    print('CV auc (10-fold):', svmgd_tfidf_cv_scores4)
    print('Mean auc:', svmgd_tfidf_cv_mean_score4)


# Predictions and report for the SGD-trained SVM (TF-IDF features).
sgd_pred = svm_sgd.predict(tv_test_features)

print(classification_report(test_label_names, sgd_pred, target_names=target_names))

report = classification_report(test_label_names, sgd_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\datatsvm_sgdreport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

# Hard-voting ensemble of the three linear models, fit on TF-IDF features.
model1 = LogisticRegression()
model2 = LinearSVC()
model3 = SGDClassifier()
estimators = [('lr', model1), ('svc', model2), ('svm_sgd', model3)]
model = VotingClassifier(estimators=estimators, voting='hard')

model.fit(tv_matrix, Y)

def forward_selection(data, target, model, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate feature columns.
    target : array-like
        Regression target.
    model : module-like
        A statsmodels-style API providing ``OLS`` and ``add_constant``
        (e.g. ``statsmodels.api``).
    significance_level : float
        Stop when the best remaining feature's p-value exceeds this.

    Returns
    -------
    list
        Column names, in the order they were selected.
    """
    initial_features = data.columns.tolist()
    best_features = []
    # Stop once every column has been selected (avoids an empty-Series min).
    while len(best_features) < len(initial_features):
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            # FIX: use `model.add_constant` -- the original called the
            # module-level `sm`, which in this script is a confusion matrix.
            fitted = model.OLS(
                target, model.add_constant(data[best_features + [new_column]])
            ).fit()
            # FIX: read p-values from the fitted model (`fitted`), not the
            # `model` module as the original did.
            new_pval[new_column] = fitted.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            break
    return best_features

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Sequential Forward Selection (sfs).
# NOTE(review): cv=0 disables cross-validation inside the selector, and the
# selector is never fit in this section -- confirm it is used later.
sfs = SFS(model,
          k_features=72,
          forward=True,
          floating=False,
          scoring='r2',
          cv=0)

# FIX: removed the dead `new_feature = pd.DataFrame()` pre-assignment; the
# value was immediately overwritten (forward_selection returns a list).
# NOTE(review): forward_selection expects a statsmodels-like module as its
# third argument, but `model` here is a VotingClassifier -- confirm intent.
new_feature = forward_selection(df1, Y, model)



# Repeated stratified 10-fold CV (10 repeats) of the hard-voting ensemble on
# the TF-IDF features, over five scoring metrics.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

scores_acc = cross_val_score(model, tv_matrix, Y, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean = np.mean(scores_acc)
print('CV Accuracy (10-fold):', scores_acc)
print('Mean CV Accuracy:', scores_acc_mean)

scores_acc1 = cross_val_score(model, tv_matrix, Y, scoring='precision_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean1 = np.mean(scores_acc1)
print('CV Precision (10-fold):', scores_acc1)
print('Mean CV Precision:', scores_acc_mean1)

scores_acc2 = cross_val_score(model, tv_matrix, Y, scoring='recall_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean2 = np.mean(scores_acc2)
print('CV Recall (10-fold):', scores_acc2)
print('Mean CV Recall:', scores_acc_mean2)

scores_acc3 = cross_val_score(model, tv_matrix, Y, scoring='f1_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean3 = np.mean(scores_acc3)
print('CV F-score (10-fold):', scores_acc3)
print('Mean CV F-score:', scores_acc_mean3)

scores_acc4 = cross_val_score(model, tv_matrix, Y, scoring='roc_auc', cv=cv, n_jobs=1, error_score='raise')
# BUG FIX: the mean was previously taken over scores_acc1 (precision),
# not the AUC scores.
scores_acc_mean4 = np.mean(scores_acc4)
print('CV Roc value (10-fold):', scores_acc4)
print('Mean CV Roc:', scores_acc_mean4)



# Random Forest on TF-IDF features: fit, 10-fold CV, held-out accuracy.
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(tv_train_features, train_label_names)

rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=10)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (10-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)

rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

# Held-out predictions for the confusion-matrix section below.
rfc_pred = rfc.predict(tv_test_features)


    rfc.fit(X3,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    rfc_tfidf_cv_scores=cross_val_score(rfc,tv_train_features, train_label_names,cv=10,scoring='accuracy')
    rfc_tfidf_cv_scores1=cross_val_score(rfc,tv_train_features, train_label_names,cv=10,scoring='precision_macro')
    rfc_tfidf_cv_scores2=cross_val_score(rfc,tv_train_features, train_label_names,cv=10,scoring='recall_macro')
    rfc_tfidf_cv_scores3=cross_val_score(rfc,tv_train_features, train_label_names,cv=10,scoring='f1_macro')
    rfc_tfidf_cv_scores4=cross_val_score(rfc,tv_train_features, train_label_names,cv=10,scoring='roc_auc')
    
    
    
    
    rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
    rfc_tfidf_cv_mean_score1 = np.mean(rfc_tfidf_cv_scores1)
    rfc_tfidf_cv_mean_score2 = np.mean(rfc_tfidf_cv_scores2)
    rfc_tfidf_cv_mean_score3 = np.mean(rfc_tfidf_cv_scores3)
    rfc_tfidf_cv_mean_score4 = np.mean(rfc_tfidf_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', rfc_tfidf_cv_scores)
    print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
    
    print('CV Precision (10-fold):', rfc_tfidf_cv_scores1)
    print('Mean CV Precision:', rfc_tfidf_cv_mean_score1)
    
    print('CV Recall (10-fold):', rfc_tfidf_cv_scores2)
    print('Mean CV Recall:', rfc_tfidf_cv_mean_score2)
    
    print('CV f1-score (10-fold):', rfc_tfidf_cv_scores3)
    print('Mean f1-score:', rfc_tfidf_cv_mean_score3)
    
    print('CV auc (10-fold):', rfc_tfidf_cv_scores4)
    print('Mean auc:', rfc_tfidf_cv_mean_score4)

from sklearn.metrics import confusion_matrix

# Confusion matrix for the TF-IDF random-forest predictions.
# Indexing assumes binary labels: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, rfc_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, rfc_pred, target_names=target_names))

report = classification_report(test_label_names, rfc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\datafrfcreport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Gradient Boosting on TF-IDF features: fit, 10-fold CV, held-out score.
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)

gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=10)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (10-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)

gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

gbc_pred = gbc.predict(tv_test_features)

print(classification_report(test_label_names, gbc_pred, target_names=target_names))

report = classification_report(test_label_names, gbc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\datafgbcreport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# n-gram implementation: character-trigram count features.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

bv = CountVectorizer(analyzer='char', ngram_range=(3, 3), min_df=0.0, max_df=1.0)
bv_train_features = bv.fit_transform(train_corpus)

# Dense char-trigram matrix over the full normalized corpus.
bv_matrix = bv.fit_transform(normalize_corpus1)
bv_matrix = bv_matrix.toarray()
vocab = bv.get_feature_names()
df3 = pd.DataFrame(bv_matrix, columns=vocab)
print('N-gram model:> Train features shape:', bv_matrix.shape)

# NOTE(review): bv_matrix1 and bv_matrix2 refit on the same corpus with the
# same settings, so they are identical copies of bv_matrix; confirm whether
# different corpora or n-gram ranges were intended before the hstack below.
bv_matrix1 = bv.fit_transform(normalize_corpus1)
bv_matrix1 = bv_matrix1.toarray()
print('N-gram model:> Train features shape:', bv_matrix1.shape)

bv_matrix2 = bv.fit_transform(normalize_corpus1)
bv_matrix2 = bv_matrix2.toarray()
print('N-gram model:> Train features shape:', bv_matrix2.shape)

bv_matrix3 = np.hstack((bv_matrix1, bv_matrix2))
print('N-gram model:> Train features shape:', bv_matrix3.shape)

bv_matrix4 = np.hstack((bv_matrix, bv_matrix1, bv_matrix2))
print('N-gram model:> Train features shape:', bv_matrix4.shape)

# FIX: raw string keeps the Windows path's backslashes literal.
df3.to_csv(r"F:\Finaldata\Final\datanew\char_bigram223.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Transform test articles into features with the (last-fitted) vectorizer.
bv_test_features = bv.transform(test_corpus)

print('N-gram model:> Train features shape:', bv_matrix.shape)

print('N-gram model:> Train features shape:', bv_train_features.shape,
      ' Test features shape:', bv_test_features.shape)

# Naïve Bayes classifier on the char-trigram counts.
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=1)
mnb.fit(bv_train_features, train_label_names)

mnb_ngram_scores = cross_val_score(mnb, bv_train_features, train_label_names, cv=10)
mnb_ngram_mean_score = np.mean(mnb_ngram_scores)
print('CV Accuracy (10-fold):', mnb_ngram_scores)
print('Mean CV Accuracy:', mnb_ngram_mean_score)

mnb_ngram_test_score = mnb.score(bv_test_features, test_label_names)
print('Test Accuracy:', mnb_ngram_test_score)

# Held-out predictions for the n-gram Naive Bayes model.
mnb_pred = mnb.predict(bv_test_features)

from sklearn.metrics import confusion_matrix
# Binary confusion matrix: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, mnb_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, mnb_pred, target_names=target_names))

report = classification_report(test_label_names, mnb_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\gramnaiveport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Bagging ensemble on the char-trigram counts.
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(n_estimators=10, random_state=42)
bagging.fit(bv_train_features, train_label_names)

bagging_bow_cv_scores = cross_val_score(bagging, bv_train_features, train_label_names, cv=10)
bagging_bow_cv_mean_score = np.mean(bagging_bow_cv_scores)
print('CV Accuracy (10-fold):', bagging_bow_cv_scores)
print('Mean CV Accuracy:', bagging_bow_cv_mean_score)

bagging_bow_test_score = bagging.score(bv_test_features, test_label_names)
print('Test Accuracy:', bagging_bow_test_score)

bagging_pred = bagging.predict(bv_test_features)
print(classification_report(test_label_names, bagging_pred, target_names=target_names))

# Logistic Regression on the char-trigram counts.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', C=1, max_iter=500, random_state=42)
lr.fit(bv_matrix, Y)

lr_ngram_cv_scores = cross_val_score(lr, bv_matrix, Y, cv=10)
lr_ngram_cv_mean_score = np.mean(lr_ngram_cv_scores)
print('CV Accuracy (10-fold):', lr_ngram_cv_scores)
print('Mean CV Accuracy:', lr_ngram_cv_mean_score)

lr_ngram_test_score = lr.score(bv_test_features, test_label_names)
print('Test Accuracy:', lr_ngram_test_score)

lr_pred = lr.predict(bv_test_features)


    lr.fit(bv_matrix,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    lr_ngram_cv_scores=cross_val_score(lr,bv_matrix, Y,cv=10,scoring='accuracy')
    lr_ngram_cv_scores1=cross_val_score(lr,bv_matrix, Y,cv=10,scoring='precision_macro')
    lr_ngram_cv_scores2=cross_val_score(lr,bv_matrix, Y,cv=10,scoring='recall_macro')
    lr_ngram_cv_scores3=cross_val_score(lr,bv_matrix, Y,cv=10,scoring='f1_macro')
    lr_ngram_cv_scores4=cross_val_score(lr,bv_matrix, Y,cv=10,scoring='roc_auc')
    
    
    
    
    lr_ngram_cv_mean_score = np.mean(lr_ngram_cv_scores)
    lr_ngram_cv_mean_score1 = np.mean(lr_ngram_cv_scores1)
    lr_ngram_cv_mean_score2 = np.mean(lr_ngram_cv_scores2)
    lr_ngram_cv_mean_score3 = np.mean(lr_ngram_cv_scores3)
    lr_ngram_cv_mean_score4 = np.mean(lr_ngram_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', lr_ngram_cv_scores)
    print('Mean CV Accuracy:', lr_ngram_cv_mean_score)
    
    print('CV Precision (10-fold):', lr_ngram_cv_scores1)
    print('Mean CV Precision:', lr_ngram_cv_mean_score1)
    
    print('CV Recall (10-fold):', lr_ngram_cv_scores2)
    print('Mean CV Recall:', lr_ngram_cv_mean_score2)
    
    print('CV f1-score (10-fold):', lr_ngram_cv_scores3)
    print('Mean f1-score:', lr_ngram_cv_mean_score3)
    
    print('CV auc (10-fold):', lr_ngram_cv_scores4)
    print('Mean auc:', lr_ngram_cv_mean_score4)

from sklearn.metrics import confusion_matrix

# Confusion matrix for the n-gram logistic-regression predictions.
# Indexing assumes binary labels: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, lr_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, lr_pred, target_names=target_names))

report = classification_report(test_label_names, lr_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\gramnlogiseport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
# Linear SVM on the char-trigram counts.
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(bv_matrix, Y)

svm_ngram_cv_scores = cross_val_score(svm, bv_matrix, Y, cv=10)
svm_ngram_cv_mean_score = np.mean(svm_ngram_cv_scores)
print('CV Accuracy (10-fold):', svm_ngram_cv_scores)
print('Mean CV Accuracy:', svm_ngram_cv_mean_score)

svm_ngram_test_score = svm.score(bv_test_features, test_label_names)
print('Test Accuracy:', svm_ngram_test_score)

# Mutual information of each stacked n-gram feature with the label.
from sklearn.feature_selection import mutual_info_classif

fs = mutual_info_classif(bv_matrix4, real_data['Label'])



    svm.fit(X3,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    svm_ngram_cv_scores=cross_val_score(svm,bv_train_features, train_label_names,cv=10,scoring='accuracy')
    svm_ngram_cv_scores1=cross_val_score(svm,bv_train_features, train_label_names,cv=10,scoring='precision_macro')
    svm_ngram_cv_scores2=cross_val_score(svm,bv_train_features, train_label_names,cv=10,scoring='recall_macro')
    svm_ngram_cv_scores3=cross_val_score(svm,bv_train_features, train_label_names,cv=10,scoring='f1_macro')
    svm_ngram_cv_scores4=cross_val_score(svm,bv_train_features, train_label_names,cv=10,scoring='roc_auc')
    
    
    
    
    svm_ngram_cv_mean_score = np.mean(svm_ngram_cv_scores)
    svm_ngram_cv_mean_score1 = np.mean(svm_ngram_cv_scores1)
    svm_ngram_cv_mean_score2 = np.mean(svm_ngram_cv_scores2)
    svm_ngram_cv_mean_score3 = np.mean(svm_ngram_cv_scores3)
    svm_ngram_cv_mean_score4 = np.mean(svm_ngram_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', svm_ngram_cv_scores)
    print('Mean CV Accuracy:', svm_ngram_cv_mean_score)
    
    print('CV Precision (10-fold):', svm_ngram_cv_scores1)
    print('Mean CV Precision:', svm_ngram_cv_mean_score1)
    
    print('CV Recall (10-fold):', svm_ngram_cv_scores2)
    print('Mean CV Recall:', svm_ngram_cv_mean_score2)
    
    print('CV f1-score (10-fold):', svm_ngram_cv_scores3)
    print('Mean f1-score:', svm_ngram_cv_mean_score3)
    
    print('CV auc (10-fold):', svm_ngram_cv_scores4)
    print('Mean auc:', svm_ngram_cv_mean_score4)

# Held-out predictions for the n-gram linear SVM.
svm_pred = svm.predict(bv_test_features)

from sklearn.metrics import confusion_matrix
# Binary confusion matrix: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, svm_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, svm_pred, target_names=target_names))

report = classification_report(test_label_names, svm_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\gramnsvmreport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
# Linear SVM trained with Stochastic Gradient Descent on the n-gram counts.
from sklearn.linear_model import SGDClassifier

svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=50, random_state=42)
svm_sgd.fit(bv_matrix, Y)

svmsgd_ngram_cv_scores = cross_val_score(svm_sgd, bv_matrix, Y, cv=10)
svmsgd_ngram_cv_mean_score = np.mean(svmsgd_ngram_cv_scores)
print('CV Accuracy (10-fold):', svmsgd_ngram_cv_scores)
print('Mean CV Accuracy:', svmsgd_ngram_cv_mean_score)

svmsgd_ngram_test_score = svm_sgd.score(bv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_ngram_test_score)

sgd_pred = svm_sgd.predict(bv_test_features)

    svm_sgd.fit(bv_matrix,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    svmgd_ngram_cv_scores=cross_val_score(svm_sgd,bv_train_features, train_label_names,cv=10,scoring='accuracy')
    svmgd_ngram_cv_scores1=cross_val_score(svm_sgd,bv_train_features, train_label_names,cv=10,scoring='precision_macro')
    svmgd_ngram_cv_scores2=cross_val_score(svm_sgd,bv_train_features, train_label_names,cv=10,scoring='recall_macro')
    svmgd_ngram_cv_scores3=cross_val_score(svm_sgd,bv_train_features, train_label_names,cv=10,scoring='f1_macro')
    svmgd_ngram_cv_scores4=cross_val_score(svm_sgd,bv_train_features, train_label_names,cv=10,scoring='roc_auc')
    
    
    
    
    svmgd_ngram_cv_mean_score = np.mean(svmgd_ngram_cv_scores)
    svmgd_ngram_cv_mean_score1 = np.mean(svmgd_ngram_cv_scores1)
    svmgd_ngram_cv_mean_score2 = np.mean(svmgd_ngram_cv_scores2)
    svmgd_ngram_cv_mean_score3 = np.mean(svmgd_ngram_cv_scores3)
    svmgd_ngram_cv_mean_score4 = np.mean(svmgd_ngram_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', svmgd_ngram_cv_scores)
    print('Mean CV Accuracy:', svmgd_ngram_cv_mean_score)
    
    print('CV Precision (10-fold):', svmgd_ngram_cv_scores1)
    print('Mean CV Precision:', svmgd_ngram_cv_mean_score1)
    
    print('CV Recall (10-fold):', svmgd_ngram_cv_scores2)
    print('Mean CV Recall:', svmgd_ngram_cv_mean_score2)
    
    print('CV f1-score (10-fold):', svmgd_ngram_cv_scores3)
    print('Mean f1-score:', svmgd_ngram_cv_mean_score3)
    
    print('CV auc (10-fold):', svmgd_ngram_cv_scores4)
    print('Mean auc:', svmgd_ngram_cv_mean_score4)


# Classification report for the SGD-trained n-gram SVM.
print(classification_report(test_label_names, sgd_pred, target_names=target_names))

report = classification_report(test_label_names, sgd_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\gramnsgdeport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")


from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

# Hard-voting ensemble of LR / LinearSVC / SGD on the stacked n-gram matrix.
model1 = LogisticRegression()
model2 = LinearSVC()
model3 = SGDClassifier()
model = VotingClassifier(estimators=[('lr', model1), ('svc', model2), ('svm_sgd', model3)], voting='hard')

model.fit(bv_matrix4, Y)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

scores_acc = cross_val_score(model, bv_matrix4, Y, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean = np.mean(scores_acc)
print('CV Accuracy (10-fold):', scores_acc)
print('Mean CV Accuracy:', scores_acc_mean)

scores_acc1 = cross_val_score(model, bv_matrix4, Y, scoring='precision_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean1 = np.mean(scores_acc1)
print('CV Precision (10-fold):', scores_acc1)
print('Mean CV Precision:', scores_acc_mean1)

scores_acc2 = cross_val_score(model, bv_matrix4, Y, scoring='recall_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean2 = np.mean(scores_acc2)
print('CV Recall (10-fold):', scores_acc2)
print('Mean CV Recall:', scores_acc_mean2)

scores_acc3 = cross_val_score(model, bv_matrix4, Y, scoring='f1_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean3 = np.mean(scores_acc3)
print('CV F-score (10-fold):', scores_acc3)
print('Mean CV F-score:', scores_acc_mean3)

# BUG FIX: the AUC run previously scored tv_matrix (TF-IDF) instead of the
# n-gram features used everywhere else in this section.
scores_acc4 = cross_val_score(model, bv_matrix4, Y, scoring='roc_auc', cv=cv, n_jobs=1, error_score='raise')
# BUG FIX: the mean was taken over scores_acc1 (precision), not the AUC scores.
scores_acc_mean4 = np.mean(scores_acc4)
print('CV Roc value (10-fold):', scores_acc4)
print('Mean CV Roc:', scores_acc_mean4)
# Random Forest on the char-trigram counts.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(bv_train_features, train_label_names)

rfc_ngram_cv_scores = cross_val_score(rfc, bv_train_features, train_label_names, cv=10)
rfc_ngram_cv_mean_score = np.mean(rfc_ngram_cv_scores)
print('CV Accuracy (10-fold):', rfc_ngram_cv_scores)
print('Mean CV Accuracy:', rfc_ngram_cv_mean_score)

rfc_ngram_test_score = rfc.score(bv_test_features, test_label_names)
print('Test Accuracy:', rfc_ngram_test_score)

    rfc.fit(bv_matrix,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    rfc_ngram_cv_scores=cross_val_score(rfc,bv_train_features, train_label_names,cv=10,scoring='accuracy')
    rfc_ngram_cv_scores1=cross_val_score(rfc,bv_train_features, train_label_names,cv=10,scoring='precision_macro')
    rfc_ngram_cv_scores2=cross_val_score(rfc,bv_train_features, train_label_names,cv=10,scoring='recall_macro')
    rfc_ngram_cv_scores3=cross_val_score(rfc,bv_train_features, train_label_names,cv=10,scoring='f1_macro')
    rfc_ngram_cv_scores4=cross_val_score(rfc,bv_train_features, train_label_names,cv=10,scoring='roc_auc')
    
    
    
    
    rfc_ngram_cv_mean_score = np.mean(rfc_ngram_cv_scores)
    rfc_ngram_cv_mean_score1 = np.mean(rfc_ngram_cv_scores1)
    rfc_ngram_cv_mean_score2 = np.mean(rfc_ngram_cv_scores2)
    rfc_ngram_cv_mean_score3 = np.mean(rfc_ngram_cv_scores3)
    rfc_ngram_cv_mean_score4 = np.mean(rfc_ngram_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', rfc_ngram_cv_scores)
    print('Mean CV Accuracy:', rfc_ngram_cv_mean_score)
    
    print('CV Precision (10-fold):', rfc_ngram_cv_scores1)
    print('Mean CV Precision:', rfc_ngram_cv_mean_score1)
    
    print('CV Recall (10-fold):', rfc_ngram_cv_scores2)
    print('Mean CV Recall:', rfc_ngram_cv_mean_score2)
    
    print('CV f1-score (10-fold):', rfc_ngram_cv_scores3)
    print('Mean f1-score:', rfc_ngram_cv_mean_score3)
    
    print('CV auc (10-fold):', rfc_ngram_cv_scores4)
    print('Mean auc:', rfc_ngram_cv_mean_score4)
# Held-out predictions for the n-gram random forest.
rfc_pred = rfc.predict(bv_test_features)

from sklearn.metrics import confusion_matrix
# Binary confusion matrix: row/col 0 = negative, 1 = positive.
sm = confusion_matrix(test_label_names, rfc_pred)

tp = sm[1, 1]
fp = sm[0, 1]
tn = sm[0, 0]
fn = sm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, rfc_pred, target_names=target_names))

report = classification_report(test_label_names, rfc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\gramnrandomforeport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

# Hard-voting ensemble of LR / LinearSVC / SGD on the plain n-gram matrix.
model1 = LogisticRegression()
model2 = LinearSVC()
model3 = SGDClassifier()
model = VotingClassifier(estimators=[('lr', model1), ('svc', model2), ('svm_sgd', model3)], voting='hard')

model.fit(bv_matrix, Y)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

scores_acc = cross_val_score(model, bv_matrix, Y, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean = np.mean(scores_acc)
print('CV Accuracy (10-fold):', scores_acc)
print('Mean CV Accuracy:', scores_acc_mean)

scores_acc1 = cross_val_score(model, bv_matrix, Y, scoring='precision_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean1 = np.mean(scores_acc1)
print('CV Precision (10-fold):', scores_acc1)
print('Mean CV Precision:', scores_acc_mean1)

scores_acc2 = cross_val_score(model, bv_matrix, Y, scoring='recall_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean2 = np.mean(scores_acc2)
print('CV Recall (10-fold):', scores_acc2)
print('Mean CV Recall:', scores_acc_mean2)

scores_acc3 = cross_val_score(model, bv_matrix, Y, scoring='f1_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean3 = np.mean(scores_acc3)
print('CV F-score (10-fold):', scores_acc3)
print('Mean CV F-score:', scores_acc_mean3)

scores_acc4 = cross_val_score(model, bv_matrix, Y, scoring='roc_auc', cv=cv, n_jobs=1, error_score='raise')
# BUG FIX: the mean was taken over scores_acc1 (precision), not the AUC scores.
scores_acc_mean4 = np.mean(scores_acc4)
print('CV Roc value (10-fold):', scores_acc4)
print('Mean CV Roc:', scores_acc_mean4)


# Gradient Boosting on the n-gram counts: fit, 10-fold CV, held-out score.
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(bv_train_features, train_label_names)

gbc_ngram_cv_scores = cross_val_score(gbc, bv_train_features, train_label_names, cv=10)
gbc_ngram_cv_mean_score = np.mean(gbc_ngram_cv_scores)
print('CV Accuracy (10-fold):', gbc_ngram_cv_scores)
print('Mean CV Accuracy:', gbc_ngram_cv_mean_score)

gbc_ngram_test_score = gbc.score(bv_test_features, test_label_names)
print('Test Accuracy:', gbc_ngram_test_score)

gbc_pred = gbc.predict(bv_test_features)

print(classification_report(test_label_names, gbc_pred, target_names=target_names))

report = classification_report(test_label_names, gbc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# FIX: raw string keeps the Windows path's backslashes literal.
df2.to_csv(r"F:\Finaldata\mydataset\data\gramngbcreport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")



# Summary table: mean-CV and test scores per model and feature set.
# NOTE(review): the trailing .T transposes the frame so the metric names
# become the index before appending -- confirm that matches real_data1's
# layout.  NOTE(review): DataFrame.append is removed in pandas >= 2.0;
# migrate to pd.concat when upgrading.
real_data1 = real_data1.append(pd.DataFrame(
    [['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score,
      mnb_tfidf_cv_mean_score, mnb_tfidf_test_score, mnb_ngram_mean_score, mnb_ngram_test_score, 0, 0],
     ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, lr_tfidf_cv_mean_score, lr_tfidf_test_score,
      lr_ngram_cv_mean_score, lr_ngram_test_score, lr_w2c_cv_mean_score, lr_w2c_test_score],
     ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score,
      svm_tfidf_cv_mean_score, svm_tfidf_test_score, svm_ngram_cv_mean_score, svm_ngram_test_score, svm_w2c_cv_mean_score, svm_w2c_test_score],
     ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svm_bow_test_score, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score, svmsgd_ngram_cv_mean_score, svmsgd_ngram_test_score,
      svmsgd_w2c_cv_mean_sore, svmsgd_w2c_test_score],
     ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score,
      rfc_tfidf_cv_mean_score, rfc_tfidf_test_score, rfc_ngram_cv_mean_score, rfc_ngram_test_score, rfc_w2c_cv_mean_score, rfc_w2c_test_score],
     ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, gbc_tfidf_cv_mean_score, gbc_tfidf_test_score, gbc_ngram_cv_mean_score, gbc_ngram_test_score,
      gbc_w2c_cv_mean_score, gbc_w2c_test_score]],
    columns=['Model', 'CV Score (Bag of Words)', 'Test Score (Bag of Words)',
             'CV Score (TF-IDF)', 'Test Score (TF-IDF)', 'CV Score (Word-ngram)', 'Test Score (Word-ngram)',
             'CV Score (Word 2 Vec)', 'Test Score(Word2Vec)']).T)
# FIX: raw string keeps the Windows path's backslashes literal.
real_data1.to_csv(r"F:\Finaldata\mydata88.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")


# Accumulators for token statistics.  Several of these are only filled by
# the commented-out experiments below and stay empty in this run.
wordfreq = {}
wordfreq1={}
filtered_token=[]
sentence=[]
t_tokens=[]
real_data2=pd.DataFrame()

#data2=pd.read_csv("D:\Finaldata\mydataset\consolidatedlast.csv")#,encoding="iso 8859-1")
# Cleaned corpus; assumed to have a text column 'message' — TODO confirm schema.
data2=pd.read_csv("D:\Final\latest\consolidatedfinalclean1.csv")#,encoding="iso 8859-1")
print(data2['message'][0])

# Corpus holders: normalize_corpus receives the raw messages in the loop below.
normalize_corpus=[]
normalize_corpus2=[]
vocab1=[]
vocab = Counter()
# Copy every raw message into the modelling corpus.  All of the token
# filtering / frequency counting that used to live in this loop was
# commented out, so only the raw text survives; the dead code has been
# removed for readability.
for i in range(0, len(data2)):
    txt = data2['message'][i]
    normalize_corpus.append(txt)

# NOTE(review): real_data2 is never populated above, so this writes an
# empty CSV — confirm intent.  Raw string avoids the invalid "\..."
# escape sequences the original literal relied on.
real_data2.to_csv(r"F:\Finaldata\Final\latest\cosolidatedfinalclean.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
from collections import Counter
# Recount token frequencies.  NOTE(review): the loop meant to fill vocab1
# above is commented out, so this Counter is empty as written — confirm.
vocab = Counter(vocab1)
print(len(vocab))


# Keep only tokens seen at least min_occurane times.
min_occurane = 2
tokens = [k for k, c in vocab.items() if c >= min_occurane]
# UTF-8 byte form of each surviving token.
tokens1 = []
for t in tokens:
    t1 = t.encode('utf-8')
    tokens1.append(t1)


from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=208509)


# Removed: `tokens1 = tokens.encode('utf-8')` — lists have no .encode(),
# so the line raised AttributeError and clobbered tokens1 built above.
print(len(tokens))

print(vocab.most_common(50))

def save_list(lines, filename):
	"""Write *lines* to *filename*, one entry per line, UTF-8 encoded.

	Uses a context manager so the file handle is released even if the
	write raises (the original explicit open/close leaked on error).
	"""
	with open(filename, 'w', encoding='utf8') as file:
		file.write('\n'.join(lines))
 
# save tokens to a vocabulary file
save_list(tokens, 'd:/vocab.txt')

# Removed: `vocab = vocab.split()` / `vocab = set(vocab)` — at this point
# vocab is a collections.Counter, which has no .split(), so those lines
# raised AttributeError.  The split/set step is performed correctly once
# load_doc() re-reads the saved file further down.

def load_doc(filename):
	"""Read and return the entire UTF-8 text of *filename*.

	Context manager guarantees the handle is closed on error (the
	original used explicit open/close and leaked on exceptions).
	"""
	with open(filename, 'r', encoding='utf8') as file:
		return file.read()
 
# Reload the saved vocabulary file and keep it as a set of unique tokens.
vocab_filename = 'd:/vocab.txt'
vocab = set(load_doc(vocab_filename).split())





import matplotlib.pyplot as plt

# NOTE(review): wordfreq1 is never populated (its fill loop is commented
# out earlier), so this bar chart renders empty — confirm intent.
plt.bar(list(wordfreq1.keys()), wordfreq1.values(), color='g')
plt.show()


# frequencies of the words
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

# NOTE(review): FreqDist is built over whole documents (normalize_corpus
# holds messages, not tokens), so each "word" here is an entire message —
# verify this is intended.
frequency_dist = nltk.FreqDist(normalize_corpus)
frequency_dist
sorted_frequency_dist =sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)
sorted_frequency_dist
# Keep only entries longer than three characters, then plot the top 20.
large_words = dict([(k,v) for k,v in frequency_dist.items() if
len(k)>3])
frequency_dist = nltk.FreqDist(large_words)
frequency_dist.plot(20,cumulative=False)

    
#print(normalize_corpus)   
import gensim
from gensim.models import word2vec
# Word-punct tokenizer used for every corpus variant in this file.
wpt = nltk.WordPunctTokenizer()
tokenized_corpus = [wpt.tokenize(document) for document in normalize_corpus]
print(len(tokenized_corpus))
# Set values for various parameters
feature_size = 100 # Word vector dimensionality
window_context = 10 # Context window size
min_word_count = 1 # Minimum word count
sample = 1e-3  # downsampling threshold for frequent words

from gensim.models.fasttext import FastText
ft_num_features = 200
# sg decides whether to use the skip-gram model (1) or CBOW (0).
# Migrated to the gensim-4 keyword names (size->vector_size, iter->epochs)
# for consistency: the Word2Vec builds elsewhere in this file already use
# the gensim-4 `vector_size` API, and mixing the two APIs fails at runtime.
ft_model = FastText(tokenized_corpus, vector_size=ft_num_features, window=100,
                    min_count=2, sample=1e-3, sg=1, epochs=5, workers=10)

# gensim 4 renamed wv.index2word -> wv.index_to_key.
words1 = ft_model.wv.index_to_key
#print(len(words))
wvs1 = ft_model.wv[words1]

# NOTE(review): normalize_corpus1 and document_vectorizer are defined
# later in this file; run top-to-bottom this section raises NameError —
# confirm the intended (notebook-cell) execution order.
normalize22= [wpt.tokenize(text) for text in normalize_corpus1] 

# Averaged FastText document vectors for the full corpus and both splits.
final_model1= document_vectorizer(corpus=normalize22,
model=ft_model, num_features=ft_num_features)

avg_ft_train_features = document_vectorizer(corpus=tokenized_train,
model=ft_model, num_features=ft_num_features)
avg_ft_test_features = document_vectorizer(corpus=tokenized_test,
model=ft_model, num_features=ft_num_features)

print('FastText model:> Train features shape:', avg_ft_train_features.shape,
' Test features shape:', avg_ft_test_features.shape)

d4=pd.DataFrame(final_model1)

d4.to_excel("F:\Finaldata\Final\data1\datatext200.xls", index=False,float_format='%.2f', na_rep="NAN!",encoding="utf-32")
#d3.append(avg_wv_test_features)
d4.to_csv("F:\Finaldata\Final\latest\datatext200.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
 #mnb.fit(cv_matrix,real_data['Label'])
# Trained and cross-validated on the full-corpus FastText vectors...
lr.fit(final_model1,real_data['Label'])
#lr.fit(avg_ft_train_features, train_label_names)
lr_w2c_cv_scores = cross_val_score(lr,final_model1 , real_data['Label'], cv=10)
lr_w2c_cv_mean_score = np.mean(lr_w2c_cv_scores)
print('CV Accuracy (10-fold):', lr_w2c_cv_scores)
print('Mean CV Accuracy:', lr_w2c_cv_mean_score)
# ...but scored on the held-out split — NOTE(review): confirm the
# train/test feature spaces and label sets actually correspond.
lr_w2c_test_score = lr.score(avg_ft_test_features, test_label_names)
print('Test Accuracy:', lr_w2c_test_score)


# Skip the gensim-3 names entirely: Word2Vec here is called with the
# gensim-4 `vector_size` keyword, so the gensim-4 wv API applies below.
w2v_model = word2vec.Word2Vec(tokenized_corpus, vector_size=feature_size,
window=window_context, min_count = min_word_count, sample=sample,workers=10)
from sklearn.manifold import TSNE
words = w2v_model.wv.index_to_key

#print(len(words))
wvs = w2v_model.wv[words]

# Collect (token, vector) rows for export.  .T on a 1-D vector is a
# no-op, kept from the original for identical output.
k = list(words)
l = [wvs[j].T for j in range(len(wvs))]

for i in range(0, len(words)):
    print(wvs[i])
d1 = pd.DataFrame(k)
d2 = pd.DataFrame(l)
#d1=pd.DataFrame(words,wvs)
# Raw strings: the original literals relied on invalid "\..." escapes.
d1.to_csv(r"F:\Finaldata\Final\latest\word300.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
d2.to_csv(r"F:\Finaldata\Final\latest\dimension300.csv")

print(wvs)
# Fixed: wv.index2word does not exist in gensim 4 (AttributeError);
# index_to_key is the renamed attribute.
print(len(w2v_model.wv.index_to_key))
def document_vectorizer(corpus, model, num_features):
    """Average the word vectors of each tokenized document in *corpus*.

    Tokens absent from the model vocabulary are skipped; a document with
    no in-vocabulary tokens maps to the zero vector of length
    *num_features*.
    """
    known = set(model.wv.index_to_key)

    def _doc_vector(doc_tokens):
        # Sum embeddings of in-vocabulary tokens, then divide by the count.
        total = np.zeros((num_features,), dtype="float64")
        hits = 0.
        for tok in doc_tokens:
            if tok in known:
                hits += 1.
                total = np.add(total, model.wv[tok])
        return np.divide(total, hits) if hits else total

    return np.array([_doc_vector(doc) for doc in corpus])

# Tokenize every corpus variant with the shared word-punct tokenizer.
tokenized_corpus = [wpt.tokenize(document) for document in normalize_corpus]
    
tokenized_train = [wpt.tokenize(text) for text in train_corpus]

# NOTE(review): normalize_corpus1 / normalize_corpus3 are not defined
# earlier in this file — confirm the intended notebook-cell order.
tokenized_train1 = [wpt.tokenize(text) for text in normalize_corpus1]

tokenized_test =  [wpt.tokenize(text) for text in test_corpus]

normalize= [wpt.tokenize(text) for text in normalize_corpus] 
normalize1= [wpt.tokenize(text) for text in normalize_corpus3] 
import gensim
# build word2vec model
w2v_num_features = 100


feature_size = 100 # Word vector dimensionality
window_context = 10 # Context window size
min_word_count = 1 # Minimum word count
sample = 1e-3

# Skip-gram (sg=1) word2vec using the gensim-4 keyword names.
w2v_model = gensim.models.Word2Vec(tokenized_corpus, vector_size=feature_size,
window=window_context, min_count=1, sample=sample, sg=1, workers=10)

# Averaged word2vec document vectors for the full corpus and the splits.
final_model= document_vectorizer(corpus=normalize,
model=w2v_model, num_features=w2v_num_features)


avg_wv_train_features = document_vectorizer(corpus=tokenized_train,
model=w2v_model, num_features=w2v_num_features)

avg_wv_train_features1 = document_vectorizer(corpus=tokenized_train1,
model=w2v_model, num_features=w2v_num_features)

avg_wv_test_features = document_vectorizer(corpus=tokenized_test,
model=w2v_model, num_features=w2v_num_features)

d3=pd.DataFrame(final_model)
#d3.append(avg_wv_test_features)
# Raw strings: the originals relied on invalid "\..." escape sequences.
d3.to_excel(r"F:\Finaldata\Final\data1\wrod2vec3.xls", index=False, float_format='%.2f', na_rep="NAN!", encoding="utf-32")
d3.to_csv(r"F:\Finaldata\Final\latest\word2vec201.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

# Fixed: the second value printed was the TRAIN shape again; report the
# test split's shape as the label says.
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
      ' Test features shape:', avg_wv_test_features.shape)

print('Word2Vec model:> Train features shape:', final_model.shape)



from sklearn.ensemble import AdaBoostClassifier

# Import Support Vector Classifier
from sklearn.svm import SVC
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Linear-kernel SVC (declared here; not used by the AdaBoost run below).
svc = SVC(probability=True, kernel='linear')

# Create adaboost classifier object and fit on the averaged word vectors.
abc = AdaBoostClassifier(n_estimators=100, random_state=0)

abc.fit(avg_wv_train_features1, Y)

# 10-fold CV under several scorers.  The original lines were indented one
# level with no enclosing block — an IndentationError at module scope —
# and are dedented here so the script parses.
abc_bow_cv_scores = cross_val_score(abc, avg_wv_train_features1, Y, cv=10, scoring='accuracy')
abc_bow_cv_scores1 = cross_val_score(abc, avg_wv_train_features1, Y, cv=10, scoring='precision_macro')
abc_bow_cv_scores2 = cross_val_score(abc, avg_wv_train_features1, Y, cv=10, scoring='recall_macro')
abc_bow_cv_scores3 = cross_val_score(abc, avg_wv_train_features1, Y, cv=10, scoring='f1_macro')
abc_bow_cv_scores4 = cross_val_score(abc, avg_wv_train_features1, Y, cv=10, scoring='roc_auc')

abc_bow_cv_mean_score = np.mean(abc_bow_cv_scores)
abc_bow_cv_mean_score1 = np.mean(abc_bow_cv_scores1)
abc_bow_cv_mean_score2 = np.mean(abc_bow_cv_scores2)
abc_bow_cv_mean_score3 = np.mean(abc_bow_cv_scores3)
abc_bow_cv_mean_score4 = np.mean(abc_bow_cv_scores4)

print('CV Accuracy (10-fold):', abc_bow_cv_scores)
print('Mean CV Accuracy:', abc_bow_cv_mean_score)

print('CV Precision (10-fold):', abc_bow_cv_scores1)
print('Mean CV Precision:', abc_bow_cv_mean_score1)

print('CV Recall (10-fold):', abc_bow_cv_scores2)
print('Mean CV Recall:', abc_bow_cv_mean_score2)

print('CV f1-score (10-fold):', abc_bow_cv_scores3)
print('Mean f1-score:', abc_bow_cv_mean_score3)

print('CV auc (10-fold):', abc_bow_cv_scores4)
print('Mean auc:', abc_bow_cv_mean_score4)


# Naïve Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1)
# NOTE(review): MultinomialNB expects non-negative features; averaged
# word2vec vectors can contain negatives, so this fit may raise — confirm
# against an actual run.
mnb.fit(avg_wv_train_features, train_label_names)
mnb_w2v_bv_scores = cross_val_score(mnb, avg_wv_train_features, train_label_names, cv=10)

mnb_w2v_bv_mean_score = np.mean(mnb_w2v_bv_scores)
print('CV Accuracy (10-fold):', mnb_w2v_bv_scores)
print('Mean CV Accuracy:', mnb_w2v_bv_mean_score)
mnb_w2v_test_score = mnb.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', mnb_w2v_test_score)

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
# Trained on avg_wv_train_features1/real_data labels but scored on
# avg_wv_test_features/test_label_names — NOTE(review): confirm the two
# pairs come from the same split.
lr.fit(avg_wv_train_features1,real_data['Label'] )
lr_w2c_cv_scores = cross_val_score(lr, avg_wv_train_features1, real_data['Label'], cv=10)
lr_w2c_cv_mean_score = np.mean(lr_w2c_cv_scores)
print('CV Accuracy (10-fold):', lr_w2c_cv_scores)
print('Mean CV Accuracy:', lr_w2c_cv_mean_score)
lr_w2c_test_score = lr.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', lr_w2c_test_score)

# Predictions on the TRAIN features — consumed by the confusion-matrix
# section below; NOTE(review): confirm train (not test) is intended here.
lr_pred=lr.predict(avg_wv_train_features)

    lr.fit(avg_wv_train_features1,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    lr_w2c_cv_scores=cross_val_score(lr,avg_wv_train_features1,real_data['Label'],cv=10,scoring='accuracy')
    lr_w2c_cv_scores1=cross_val_score(lr,avg_wv_train_features1,real_data['Label'],cv=10,scoring='precision_macro')
    lr_w2c_cv_scores2=cross_val_score(lr,avg_wv_train_features1,real_data['Label'],cv=10,scoring='recall_macro')
    lr_w2c_cv_scores3=cross_val_score(lr,avg_wv_train_features1,real_data['Label'],cv=10,scoring='f1_macro')
    lr_w2c_cv_scores4=cross_val_score(lr,avg_wv_train_features1,real_data['Label'],cv=10,scoring='roc_auc')
    
    
    
    
    lr_w2c_cv_mean_score = np.mean(lr_w2c_cv_scores)
    lr_w2c_cv_mean_score1 = np.mean(lr_w2c_cv_scores1)
    lr_w2c_cv_mean_score2 = np.mean(lr_w2c_cv_scores2)
    lr_w2c_cv_mean_score3 = np.mean(lr_w2c_cv_scores3)
    lr_w2c_cv_mean_score4 = np.mean(lr_w2c_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', lr_w2c_cv_scores)
    print('Mean CV Accuracy:', lr_w2c_cv_mean_score)
    
    print('CV Precision (10-fold):', lr_w2c_cv_scores1)
    print('Mean CV Precision:', lr_w2c_cv_mean_score1)
    
    print('CV Recall (10-fold):', lr_w2c_cv_scores2)
    print('Mean CV Recall:', lr_w2c_cv_mean_score2)
    
    print('CV f1-score (10-fold):', lr_w2c_cv_scores3)
    print('Mean f1-score:', lr_w2c_cv_mean_score3)
    
    print('CV auc (10-fold):', lr_w2c_cv_scores4)
    print('Mean auc:', lr_w2c_cv_mean_score4)



from sklearn.metrics import confusion_matrix

# Confusion matrix for the logistic-regression predictions.
sm = confusion_matrix(test_label_names, lr_pred)

# Binary layout assumed: [[tn, fp], [fn, tp]].
tn, fp, fn, tp = sm.ravel()
print(tp, fp, tn, fn)

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)


# Export predictions alongside the source messages, then report metrics.
print(data22['label'])
print(lr_pred)
submission = pd.DataFrame({'message':data22['message'],'label':lr_pred})
target_names=['Not offensive','offensive']
submission.to_csv("E:\latest\drytv\drytvlabeleddata.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
print(classification_report(data22['label'],lr_pred, target_names=target_names))

print(train_label_names1)
report = classification_report(list(data22['label']), lr_pred, target_names=target_names,output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\datanew\word2veclogisticreport7th100.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(data22['label'], lr_pred)

# Manual binary metrics; layout assumed [[tn, fp], [fn, tp]].
tp=cm[1,1]
fp=cm[0,1]
tn=cm[0,0]
fn=cm[1,0]
print(tp,fp,tn,fn)
precision=tp/(tp+fp)
recall=tp/(tp+fn)
accuracy=(tp+tn)/(tp+tn+fp+fn)
f_measure=(2*recall*precision)/(recall+precision)
print(precision,recall,accuracy,f_measure)

from sklearn.ensemble import GradientBoostingClassifier
g = GradientBoostingClassifier(learning_rate=0.01, random_state=1)

g.fit(avg_wv_train_features1, Y)
# NOTE(review): results stored under svmsgd_* names — kept so the later
# summary table still finds them, but the model here is gradient boosting.
svmsgd_w2c_cv_scores = cross_val_score(g, avg_wv_train_features1, Y, cv=10)
svmsgd_w2c_cv_mean_score = np.mean(svmsgd_w2c_cv_scores)
print('CV Accuracy (10-fold):', svmsgd_w2c_cv_scores)
print('Mean CV Accuracy:', svmsgd_w2c_cv_mean_score)
# Fixed: scored svm_sgd (not defined until later in the file, and a
# different model) instead of the classifier fitted just above.
svmsgd_w2c_test_score = g.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_w2c_test_score)
# NOTE(review): `model`, x_train/y_train, x_test/y_test are not defined at
# this point — these two lines look like a stray paste; confirm.
model.fit(x_train, y_train)
model.score(x_test,y_test)

# Support Vector Machines
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(avg_wv_train_features1, real_data['Label'])
svm_w2c_cv_scores = cross_val_score(svm, avg_wv_train_features1, real_data['Label'], cv=10)
svm_w2c_cv_mean_score = np.mean(svm_w2c_cv_scores)
print('CV Accuracy (10-fold):', svm_w2c_cv_scores)
print('Mean CV Accuracy:', svm_w2c_cv_mean_score)
svm_w2c_test_score = svm.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', svm_w2c_test_score)

# Test-split predictions consumed by the confusion matrix further down.
svm_pred=svm.predict(avg_wv_test_features)

    svm.fit(X5,Y)
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    svm_w2c_cv_scores=cross_val_score(svm,X5,Y,cv=10,scoring='accuracy')
    svm_w2c_cv_scores1=cross_val_score(svm,X5,Y,cv=10,scoring='precision_macro')
    svm_w2c_cv_scores2=cross_val_score(svm,X5,Y,cv=10,scoring='recall_macro')
    svm_w2c_cv_scores3=cross_val_score(svm,X5,Y,cv=10,scoring='f1_macro')
    svm_w2c_cv_scores4=cross_val_score(svm,X5,Y,cv=10,scoring='roc_auc')
    
    
    
    
    svm_w2c_cv_mean_score = np.mean(svm_w2c_cv_scores)
    svm_w2c_cv_mean_score1 = np.mean(svm_w2c_cv_scores1)
    svm_w2c_cv_mean_score2 = np.mean(svm_w2c_cv_scores2)
    svm_w2c_cv_mean_score3 = np.mean(svm_w2c_cv_scores3)
    svm_w2c_cv_mean_score4 = np.mean(svm_w2c_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', svm_w2c_cv_scores)
    print('Mean CV Accuracy:', svm_w2c_cv_mean_score)
    
    print('CV Precision (10-fold):', svm_w2c_cv_scores1)
    print('Mean CV Precision:', svm_w2c_cv_mean_score1)
    
    print('CV Recall (10-fold):', svm_w2c_cv_scores2)
    print('Mean CV Recall:', svm_w2c_cv_mean_score2)
    
    print('CV f1-score (10-fold):', svm_w2c_cv_scores3)
    print('Mean f1-score:', svm_w2c_cv_mean_score3)
    
    print('CV auc (10-fold):', svm_w2c_cv_scores4)
    print('Mean auc:', svm_w2c_cv_mean_score4)


from sklearn.metrics import confusion_matrix

# Confusion matrix for the linear-SVM predictions; binary layout
# assumed: [[tn, fp], [fn, tp]].
sm = confusion_matrix(test_label_names, svm_pred)

tn, fp, fn, tp = sm.ravel()
print(tp, fp, tn, fn)

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)


print(classification_report(test_label_names, svm_pred, target_names=target_names))

report = classification_report(test_label_names, svm_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\word2vecsvmreport.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
# SVM with Stochastic Gradient Descent

from sklearn.linear_model import SGDClassifier
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=100, random_state=42)
svm_sgd.fit(avg_wv_train_features1, Y)
svmsgd_w2c_cv_scores = cross_val_score(svm_sgd, avg_wv_train_features1, Y, cv=10)
svmsgd_w2c_cv_mean_score = np.mean(svmsgd_w2c_cv_scores)
print('CV Accuracy (10-fold):', svmsgd_w2c_cv_scores)
print('Mean CV Accuracy:', svmsgd_w2c_cv_mean_score)
# NOTE(review): trained on Y but scored on test_label_names — confirm
# the label sets correspond to the same task/split.
svmsgd_w2c_test_score = svm_sgd.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_w2c_test_score)

# Test-split predictions for the report below.
sgd_pred=svm_sgd.predict(avg_wv_test_features)

    svm_sgd.fit(X5,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    svmsgd_w2c_cv_scores=cross_val_score(svm_sgd,X5,real_data['Label'],cv=10,scoring='accuracy')
    svmsgd_w2c_cv_scores1=cross_val_score(svm_sgd,X5,real_data['Label'],cv=10,scoring='precision_macro')
    svmsgd_w2c_cv_scores2=cross_val_score(svm_sgd,X5,real_data['Label'],cv=10,scoring='recall_macro')
    svmsgd_w2c_cv_scores3=cross_val_score(svm_sgd,X5,real_data['Label'],cv=10,scoring='f1_macro')
    svmsgd_w2c_cv_scores4=cross_val_score(svm_sgd,X5,real_data['Label'],cv=10,scoring='roc_auc')
    
    
    
    
    svmsgd_w2c_cv_mean_score = np.mean(svmsgd_w2c_cv_scores)
    svmsgd_w2c_cv_mean_score1 = np.mean(svmsgd_w2c_cv_scores1)
    svmsgd_w2c_cv_mean_score2 = np.mean(svmsgd_w2c_cv_scores2)
    svmsgd_w2c_cv_mean_score3 = np.mean(svmsgd_w2c_cv_scores3)
    svmsgd_w2c_cv_mean_score4 = np.mean(svmsgd_w2c_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', svmsgd_w2c_cv_scores)
    print('Mean CV Accuracy:', svmsgd_w2c_cv_mean_score)
    
    print('CV Precision (10-fold):', svmsgd_w2c_cv_scores1)
    print('Mean CV Precision:', svmsgd_w2c_cv_mean_score1)
    
    print('CV Recall (10-fold):', svmsgd_w2c_cv_scores2)
    print('Mean CV Recall:', svmsgd_w2c_cv_mean_score2)
    
    print('CV f1-score (10-fold):', svmsgd_w2c_cv_scores3)
    print('Mean f1-score:', svmsgd_w2c_cv_mean_score3)
    
    print('CV auc (10-fold):', svmsgd_w2c_cv_scores4)
    print('Mean auc:', svmsgd_w2c_cv_mean_score4)


# SGD-SVM classification report export.
print(classification_report(test_label_names,sgd_pred, target_names=target_names))

report = classification_report(test_label_names, sgd_pred, target_names=target_names,output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\word2vecsgdreport.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")
# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(avg_wv_train_features, train_label_names)
rfc_w2c_cv_scores = cross_val_score(rfc, avg_wv_train_features, train_label_names, cv=10)
rfc_w2c_cv_mean_score = np.mean(rfc_w2c_cv_scores)
print('CV Accuracy (10-fold):', rfc_w2c_cv_scores)
print('Mean CV Accuracy:', rfc_w2c_cv_mean_score)
rfc_w2c_test_score = rfc.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', rfc_w2c_test_score)

# Test-split predictions consumed by the confusion matrix further down.
rfc_pred=rfc.predict(avg_wv_test_features)

    rfc.fit(X5,real_data['Label'])
    #lr_bow_cv_scores = cross_val_score(lr, cv_matrix, real_data['Label'], cv=10)
    rfc_w2c_cv_scores=cross_val_score(rfc,X5,real_data['Label'],cv=10,scoring='accuracy')
    rfc_w2c_cv_scores1=cross_val_score(rfc,X5,real_data['Label'],cv=10,scoring='precision_macro')
    rfc_w2c_cv_scores2=cross_val_score(rfc,X5,real_data['Label'],cv=10,scoring='recall_macro')
    rfc_w2c_cv_scores3=cross_val_score(rfc,X5,real_data['Label'],cv=10,scoring='f1_macro')
    rfc_w2c_cv_scores4=cross_val_score(rfc,X5,real_data['Label'],cv=10,scoring='roc_auc')
    
    
    
    
    rfc_w2c_cv_mean_score = np.mean(rfc_w2c_cv_scores)
    rfc_w2c_cv_mean_score1 = np.mean(rfc_w2c_cv_scores1)
    rfc_w2c_cv_mean_score2 = np.mean(rfc_w2c_cv_scores2)
    rfc_w2c_cv_mean_score3 = np.mean(rfc_w2c_cv_scores3)
    rfc_w2c_cv_mean_score4 = np.mean(rfc_w2c_cv_scores4)
    
    
    
    print('CV Accuracy (10-fold):', rfc_w2c_cv_scores)
    print('Mean CV Accuracy:', rfc_w2c_cv_mean_score)
    
    print('CV Precision (10-fold):', rfc_w2c_cv_scores1)
    print('Mean CV Precision:', rfc_w2c_cv_mean_score1)
    
    print('CV Recall (10-fold):', rfc_w2c_cv_scores2)
    print('Mean CV Recall:', rfc_w2c_cv_mean_score2)
    
    print('CV f1-score (10-fold):', rfc_w2c_cv_scores3)
    print('Mean f1-score:', rfc_w2c_cv_mean_score3)
    
    print('CV auc (10-fold):', rfc_w2c_cv_scores4)
    print('Mean auc:', rfc_w2c_cv_mean_score4)
from sklearn.metrics import confusion_matrix

# Confusion matrix for the random-forest predictions; binary layout
# assumed: [[tn, fp], [fn, tp]].
sm = confusion_matrix(test_label_names, rfc_pred)

tn, fp, fn, tp = sm.ravel()
print(tp, fp, tn, fn)

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

print(classification_report(test_label_names, rfc_pred, target_names=target_names))

report = classification_report(test_label_names, rfc_pred, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\word2vecrfcreport.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")


from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
# Hard-voting ensemble of LR, linear SVM and SGD-SVM.
model1 = LogisticRegression()
model2 = LinearSVC()
model3 = SGDClassifier()
model = VotingClassifier(estimators=[('lr', model1), ('svc', model2), ('svm_sgd', model3)], voting='hard')


model.fit(avg_wv_train_features1, Y)

# 10 folds x 10 repeats, stratified.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=1)

scores_acc = cross_val_score(model, avg_wv_train_features1, Y, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean = np.mean(scores_acc)
print('CV Accuracy (10-fold):', scores_acc)
print('Mean CV Accuracy:', scores_acc_mean)

scores_acc1 = cross_val_score(model, avg_wv_train_features1, Y, scoring='precision_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean1 = np.mean(scores_acc1)
print('CV Precision (10-fold):', scores_acc1)
print('Mean CV Precision:', scores_acc_mean1)


scores_acc2 = cross_val_score(model, avg_wv_train_features1, Y, scoring='recall_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean2 = np.mean(scores_acc2)
print('CV Recall (10-fold):', scores_acc2)
print('Mean CV Recall:', scores_acc_mean2)


scores_acc3 = cross_val_score(model, avg_wv_train_features1, Y, scoring='f1_macro', cv=cv, n_jobs=1, error_score='raise')
scores_acc_mean3 = np.mean(scores_acc3)
print('CV F-score (10-fold):', scores_acc3)
print('Mean CV F-score:', scores_acc_mean3)

scores_acc4 = cross_val_score(model, avg_wv_train_features1, Y, scoring='roc_auc', cv=cv, n_jobs=1, error_score='raise')
# Fixed: the mean was taken over scores_acc1 (precision) instead of the
# ROC-AUC scores just computed.
scores_acc_mean4 = np.mean(scores_acc4)
print('CV Roc value (10-fold):', scores_acc4)
print('Mean CV Roc:', scores_acc_mean4)

# Gradient Boosting Machines
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
# NOTE(review): fitted on avg_wv_train_features1/Y but cross-validated on
# avg_wv_train_features/train_label_names — confirm which pair is intended.
gbc.fit(avg_wv_train_features1, Y)
gbc_w2c_cv_scores = cross_val_score(gbc, avg_wv_train_features, train_label_names, cv=10)
gbc_w2c_cv_mean_score = np.mean(gbc_w2c_cv_scores)
print('CV Accuracy (10-fold):', gbc_w2c_cv_scores)
print('Mean CV Accuracy:', gbc_w2c_cv_mean_score)
gbc_w2c_test_score = gbc.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', gbc_w2c_test_score)

gbc_pred=gbc.predict(avg_wv_test_features)

print(classification_report(test_label_names,gbc_pred, target_names=target_names))

report = classification_report(test_label_names, gbc_pred, target_names=target_names,output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\word2vecgradiantreport.csv",float_format='%.2f', na_rep="NAN!",encoding="utf-32")


# Final summary table: one row per model with CV/test scores for BoW,
# TF-IDF, word n-gram and word2vec features (Naive Bayes has no w2v run,
# hence the trailing 0, 0).
real_data1 = real_data1.append(pd.DataFrame(
    [['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score,
      mnb_tfidf_cv_mean_score, mnb_tfidf_test_score, mnb_ngram_mean_score, mnb_ngram_test_score, 0, 0],
     ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, lr_tfidf_cv_mean_score, lr_tfidf_test_score,
      lr_ngram_cv_mean_score, lr_ngram_test_score, lr_w2c_cv_mean_score, lr_w2c_test_score],
     ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score,
      svm_tfidf_cv_mean_score, svm_tfidf_test_score, svm_ngram_cv_mean_score, svm_ngram_test_score, svm_w2c_cv_mean_score, svm_w2c_test_score],
     ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svm_bow_test_score, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score, svmsgd_ngram_cv_mean_score, svmsgd_ngram_test_score,
      svmsgd_w2c_cv_mean_score, svmsgd_w2c_test_score],
     ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score,
      rfc_tfidf_cv_mean_score, rfc_tfidf_test_score, rfc_ngram_cv_mean_score, rfc_ngram_test_score, rfc_w2c_cv_mean_score, rfc_w2c_test_score],
     ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, gbc_tfidf_cv_mean_score, gbc_tfidf_test_score, gbc_ngram_cv_mean_score, gbc_ngram_test_score,
      gbc_w2c_cv_mean_score, gbc_w2c_test_score]],
    columns=['Model', 'CV Score (Bag of Words)', 'Test Score (Bag of Words)',
             'CV Score (TF-IDF)', 'Test Score (TF-IDF)', 'CV Score (Word-ngram)', 'Test Score (Word-ngram)',
             'CV Score (Word 2 Vec)', 'Test Score(Word2Vec)']).T)
# Raw string: the original "F:\..." literal relied on invalid escape sequences.
real_data1.to_csv(r"F:\Finaldata\mydataset\data\datareport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")

from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', C=1, random_state=42)

from sklearn.ensemble import VotingClassifier
# Hard-voting ensemble of LR, linear SVM and gradient boosting.
model1 = LogisticRegression(random_state=1)
model2 = LinearSVC(random_state=1)
model3 = GradientBoostingClassifier(random_state=1)
model = VotingClassifier(estimators=[('lr', model1), ('svc', model2), ('gbc', model3)], voting='hard')
model.fit(avg_wv_train_features1, Y)
rfc_w2c_test_score = model.score(avg_wv_test_features, test_label_names)
print('Test Accuracy:', rfc_w2c_test_score)

# Fixed two defects: these lines were indented with no enclosing block
# (IndentationError at module scope), and cross_val_score was handed Y
# (the labels) as the feature matrix — the features are
# avg_wv_train_features1, matching every other CV call in this file.
rfc_w2c_cv_scores=cross_val_score(model,avg_wv_train_features1,real_data['Label'],cv=10,scoring='accuracy')
rfc_w2c_cv_scores1=cross_val_score(model,avg_wv_train_features1,real_data['Label'],cv=10,scoring='precision_macro')
rfc_w2c_cv_scores2=cross_val_score(model,avg_wv_train_features1,real_data['Label'],cv=10,scoring='recall_macro')
rfc_w2c_cv_scores3=cross_val_score(model,avg_wv_train_features1,real_data['Label'],cv=10,scoring='f1_macro')
rfc_w2c_cv_scores4=cross_val_score(model,avg_wv_train_features1,real_data['Label'],cv=10,scoring='roc_auc')

rfc_w2c_cv_mean_score = np.mean(rfc_w2c_cv_scores)
rfc_w2c_cv_mean_score1 = np.mean(rfc_w2c_cv_scores1)
rfc_w2c_cv_mean_score2 = np.mean(rfc_w2c_cv_scores2)
rfc_w2c_cv_mean_score3 = np.mean(rfc_w2c_cv_scores3)
rfc_w2c_cv_mean_score4 = np.mean(rfc_w2c_cv_scores4)

print('CV Accuracy (10-fold):', rfc_w2c_cv_scores)
print('Mean CV Accuracy:', rfc_w2c_cv_mean_score)

print('CV Precision (10-fold):', rfc_w2c_cv_scores1)
print('Mean CV Precision:', rfc_w2c_cv_mean_score1)

print('CV Recall (10-fold):', rfc_w2c_cv_scores2)
print('Mean CV Recall:', rfc_w2c_cv_mean_score2)

print('CV f1-score (10-fold):', rfc_w2c_cv_scores3)
print('Mean f1-score:', rfc_w2c_cv_mean_score3)

print('CV auc (10-fold):', rfc_w2c_cv_scores4)
print('Mean auc:', rfc_w2c_cv_mean_score4)


# NOTE(review): x_test/y_test are not defined in this file — looks like a
# stray line; confirm before running.
model.score(x_test,y_test)

#w2v_model = word2vec.Word2Vec(tokenized_corpus, size=feature_size,window=window_context, min_count = min_word_count, sample=sample, iter=100,workers=10)

from sklearn.manifold import TSNE
# Fixed: gensim 4 renamed wv.index2word -> wv.index_to_key (the Word2Vec
# builds in this file use the gensim-4 `vector_size` API).
words = w2v_model.wv.index_to_key
wvs = w2v_model.wv[words]
# 2-D embedding of the word vectors for plotting.
tsne = TSNE(n_components=2, random_state=0, n_iter=5000, perplexity=2)
np.set_printoptions(suppress=True)
T = tsne.fit_transform(wvs)
labels = words

    
# generate word2vec word embeddings

# build word2vec model
w2v_num_features = 1000
# Migrated to gensim-4 keyword names (size->vector_size, iter->epochs) to
# match the other Word2Vec calls in this file; the gensim-3 names raise
# TypeError under gensim 4.
w2v_model = gensim.models.Word2Vec(tokenized_train, vector_size=w2v_num_features,
                                   window=100, min_count=2, sample=1e-3, sg=1, epochs=5, workers=10)
# generate document level embeddings
# remember we only use train dataset vocabulary embeddings
# so that test dataset truly remains an unseen dataset
# generate averaged word vector features from word2vec model
avg_wv_train_features = document_vectorizer(corpus=tokenized_train,
model=w2v_model, num_features=w2v_num_features)


# Convolutional neural network implementation

from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout


# Hyper-parameters for the CNN text model below.
MAX_SEQUENCE_LENGTH = 17  # tokens kept per message
MAX_VOCAB_SIZE = 200000  # cap on tokenizer vocabulary
EMBEDDING_DIM = 200  # embedding vector size
VALIDATION_SPLIT = 0.2  # fraction held out for validation
BATCH_SIZE = 128
EPOCHS = 10

import numpy as np
import keras
from keras.utils import to_categorical
from keras.layers import TimeDistributed, Conv3D, Input, Flatten, Dense
from keras.applications.inception_v3 import InceptionV3
from random import randint
from keras.models import Model
tokenizer = Tokenizer()
# fit the tokenizer on the documents
# NOTE(review): fitting twice merges both vocabularies into one word_index;
# tokens12 is only assigned further down this script — confirm run order.

tokenizer.fit_on_texts(normalize_corpus1)
tokenizer.fit_on_texts(tokens12)



# Integer-id sequences for the normalized corpus.
encoded_docs = tokenizer.texts_to_sequences(normalize_corpus1)

# +1 because Keras word indices start at 1 (0 is reserved for padding).
vocab_size = len(tokenizer.word_index) + 1
word_2_index = tokenizer.word_index



# Spot-check: a sample token and its assigned integer id.
print(tokens12[500])
print(word_2_index[tokens12[500]])

# Build fixed-length windows for next-word prediction: each sample is
# input_seq_length consecutive token ids; the target is the following token.
input_sequence = []
output_words = []
input_seq_length = 100
n_words = len(tokens12)
print(n_words)
for i in range(0, n_words - input_seq_length, 1):
    in_seq = tokens12[i:i + input_seq_length]
    out_seq = tokens12[i + input_seq_length]
    input_sequence.append([word_2_index[word] for word in in_seq])
    output_words.append(word_2_index[out_seq])

print(input_sequence[0])

# Reshape to (samples, timesteps, 1) and scale the ids into [0, 1].
X = np.reshape(input_sequence, (len(input_sequence), input_seq_length, 1))
X = X / float(vocab_size)
# One-hot encode the target ids. (The original called to_categorical twice
# in a row; the redundant duplicate call was removed.)
y = to_categorical(output_words)

print("X shape:", X.shape)
print("y shape:", y.shape)

# Pad the document-level encoded sequences to the longest document.
max_length = max([len(s.split()) for s in normalize_corpus1])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

# Tokens that are separator/formatting noise rather than real words; they are
# printed for audit and excluded from the working token list.
_JUNK_TOKENS = {
    '==========', '=', '==', '===', '+++', '>>>', '++=', '~~~', 'NAME',
    '=====', 'íӀօѵҽ', '>>>>', 'Ñāāì', '×=×', '×=×=', '```', '|==',
}
tokens12 = []
for token in tokens:
    if token in _JUNK_TOKENS:
        print(token)
    else:
        tokens12.append(token)
# NOTE(review): a second loop here (`if token != '==' or token != ...`) had
# an empty body — a SyntaxError — and its or-of-inequalities condition was
# always true; the dead, broken loop has been removed.
        
# encode training data set as a (docs x vocab) term-frequency matrix
Xtrain = tokenizer.texts_to_matrix(normalize_corpus1, mode='freq')
print(Xtrain.shape)
# NOTE(review): `Y` is not defined anywhere in this chunk — confirm where
# the labels come from before running.
ytrain=Y
vocab_size = len(tokenizer.word_index) + 1


# Fit on the training corpus only, then encode both splits with that vocabulary.
tokenizer.fit_on_texts(train_corpus)
#Transforms each text in texts to a sequence of integers.
train_sequences = tokenizer.texts_to_sequences(train_corpus)
test_sequences = tokenizer.texts_to_sequences(test_corpus)
word_index = tokenizer.word_index
# fixed typo in the message ("ength" -> "Length")
print("Length of word Index:", len(word_index))
print("First 5 elements in the word_index dictionary:", dict(list(word_index.items())[0: 5]) )
print("First comment text in training set:\n", train_sequences[0])

#Pad tokenized sequences
trainvalid_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print("Shape of padded sequence list:\n", trainvalid_data.shape)
print("First comment text in training set — 0 for padding — only last 50 sequences as the rest are paddings:\n", trainvalid_data[0][-50:])

# Three-stage 1-D convnet over word embeddings.
cnn_model = Sequential()
cnn_model.add(Embedding(MAX_VOCAB_SIZE, 128))
cnn_model.add(Conv1D(filters = 128, kernel_size = 5, activation = "relu"))
cnn_model.add(MaxPooling1D(pool_size = 5))
cnn_model.add(Conv1D(filters = 128, kernel_size = 5, activation = "relu"))
cnn_model.add(MaxPooling1D(pool_size = 5))
cnn_model.add(Conv1D(filters = 128, kernel_size = 5, activation = "relu"))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(units = 128, activation = 'relu'))
# 6 independent sigmoid outputs — presumably a multi-label head; TODO confirm
# against the label data actually fed to fit().
cnn_model.add(Dense(units = 6, activation = 'sigmoid'))
print(cnn_model.summary())

# Compile once. (An earlier duplicate compile with metrics=['AUC'] was
# immediately overridden by this call, so the dead first compile was removed.)
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Split the dataset into train and validation set for training and evaluating the model
X_train, X_val, y_train, y_val = train_test_split(trainvalid_data, train_label_names, shuffle = True, random_state = 123)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)
# Trains the model for a fixed number of epochs (iterations on a dataset)
history = cnn_model.fit(X_train, y_train, batch_size = 128, epochs = 1, validation_data = (X_val, y_val))

# Second training run on the full matrix features.
# NOTE(review): Xtrain/ytrain come from texts_to_matrix above — confirm these
# dimensions are compatible with this network before running.
history=cnn_model.fit(
  Xtrain,
  ytrain,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)
# was: model.summary() — no `model` exists at this point; the Sequential
# model built above is named cnn_model.
cnn_model.summary()
# Dead code: an earlier single-Conv1D experiment kept inside a triple-quoted
# string literal (a no-op expression statement); retained unchanged for reference.
'''model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(Xtrain, ytrain, epochs=10, verbose=2)

n_words=Xtrain.shape[1]


print(model.summary())
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])'''



def evaluate_model(Xtrain, ytrain, Xtest, ytest):
    """Train a small dense network n_repeats times; return the test accuracies."""
    n_repeats = 1
    input_dim = Xtest.shape[1]
    scores = []
    for run in range(n_repeats):
        # single-hidden-layer MLP over the document-term features
        net = Sequential()
        net.add(Dense(50, input_shape=(input_dim,), activation='relu'))
        net.add(Dense(1, activation='sigmoid'))
        net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        net.fit(Xtrain, ytrain, epochs=10, verbose=2)
        loss, acc = net.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % ((run + 1), acc))
    return scores

def prepare_data(train_docs, test_docs, mode):
    """Fit a Tokenizer on train_docs and encode both corpora with `mode`.

    Returns (train_matrix, test_matrix) as produced by texts_to_matrix.
    """
    tok = Tokenizer()
    tok.fit_on_texts(train_docs)
    encoded_train = tok.texts_to_matrix(train_docs, mode=mode)
    encoded_test = tok.texts_to_matrix(test_docs, mode=mode)
    return encoded_train, encoded_test
 
# Compare document-encoding modes by training the same MLP on each encoding.
modes = ['tfidf', 'freq']
results = pd.DataFrame()
for mode in modes:
    # encode both corpora with this mode, then train and score the model
    Xtrain, Xtest = prepare_data(train_corpus, test_corpus, mode)
    results[mode] = evaluate_model(Xtrain, train_label_names, Xtest, test_label_names)
    
    
    
  #Initializing the class
# NOTE(review): MAX_NUM_WORDS is not defined anywhere in this chunk (the
# constant defined above is MAX_VOCAB_SIZE) — confirm which cap was intended.
tokenizer = Tokenizer(num_words = MAX_NUM_WORDS)
#Updates internal vocabulary based on a list of texts.
tokenizer.fit_on_texts(train_texts)
#Transforms each text in texts to a sequence of integers.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)
word_index = tokenizer.word_index
# The prints below used curly "smart quotes" (U+201C/U+201D), which are a
# SyntaxError in Python; replaced with straight quotes.
print("Length of word Index:", len(word_index))
print("First 5 elements in the word_index dictionary:", dict(list(word_index.items())[0: 5]) )
print("First comment text in training set:\n", train_sequences[0])
    
#train_corpus,test_corpus,train_label_names,test_label_names


print('Loading word vectors...')

# Map each vocabulary word to its embedding vector. Iterate the two parallel
# lists together instead of indexing by position (for-range anti-idiom).
word2vec12 = {}
for word, vec in zip(words1, wvs1):
    word2vec12[word] = np.asarray(vec, dtype='float32')
print('Found %s word vectors.' % len(word2vec12))


#data5=pd.read_csv("F:\Finaldata\data405.csv")

print('Loading in comments...')

# NOTE(review): sentences come from `real_data` but targets from `data` —
# confirm these two frames are aligned row-for-row.
train = real_data['Message']
sentences = train
possible_labels = ["label"]
targets = data[possible_labels].values




# convert the sentences (strings) into integers
#t=nltk.tokenize.wordpunct_tokenize(train)
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
# print("sequences:", sequences); exit()

#tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
#tokenizer.fit_on_texts(sentences)
#sequences = tokenizer.texts_to_sequences(sentences)
#targets = train_label_names.values

# Basic length statistics to sanity-check MAX_SEQUENCE_LENGTH.
print("max sequence length:", max(len(s) for s in sequences))
print("min sequence length:", min(len(s) for s in sequences))


# median of the sorted lengths (note: `s` shadows the generator variable)
s = sorted(len(s) for s in sequences)
print("median sequence length:", s[len(s) // 2])


print("max word index:", max(max(seq) for seq in sequences if len(seq) > 0))

word2idx = tokenizer.word_index


print('Found %s unique tokens.' % len(word2idx))

# Pad/truncate every sequence to the fixed model input length.
data22 = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data22.shape)


print('Filling pre-trained embeddings...')
# Rows of embedding_matrix1 follow the tokenizer's word indices; words with
# no pre-trained vector keep an all-zero row.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix1 = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word2idx.items():
    if i >= MAX_VOCAB_SIZE:
        continue
    vec = word2vec12.get(word)
    if vec is None:
        continue  # not in the pre-trained vocabulary: row stays zero
    embedding_matrix1[i] = vec


# prepare embedding matrix
# Dead code: an earlier embedding-fill experiment kept inside a triple-quoted
# string (a no-op expression statement); retained unchanged for reference.
'''print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

#for word, i in word2idx.items():
 # if i < MAX_VOCAB_SIZE:
  #  embedding_vector = word2vec.get(word)'
  
print(wvs[1])
for word, i in word2idx.items():
    embedding_vector=wvs[i]
    #if embedding_vector is not None:
              # words not found in embedding index will be all zeros.
    embedding_matrix[i] = embedding_vector'''
    #if embedding_vector is not None:
      # words not found in embedding index will be all zeros.
     # embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix1],  # rows indexed by tokenizer word ids
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False
)
#possible_labels = ["label"]
print('Building model...')

# train a 1D convnet with global maxpooling over the frozen embeddings
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='relu')(x)

output = Dense(len(possible_labels), activation='sigmoid')(x)

model = Model(input_, output)
model.compile(
  loss='binary_crossentropy',
  optimizer='rmsprop',
  metrics=['accuracy']
)
print('Training model...')
# was: model.fit(Xtrain, ytrain, ...) — Xtrain is the texts_to_matrix
# bag-of-words matrix whose width is the vocabulary size, not
# MAX_SEQUENCE_LENGTH, so it cannot feed this network. The padded sequence
# tensor and its labels are data22 / targets (as used for this model below).
model.fit(
  data22,
  targets,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)
model.summary()



print(model.history)
from sklearn.metrics import classification_report
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
pred = model.predict(data22)
pred2 = [1 if p >= .5 else 0 for p in pred]
target_names = ['0', '1']
# NOTE(review): test_label_names1 is only defined later in this script —
# confirm the intended execution order.
print(classification_report(test_label_names1, pred2, target_names=target_names))

from sklearn.metrics import confusion_matrix
# was: confusion_matrix(data22, pred2) — data22 is the padded input tensor,
# not the ground-truth labels; the matrix must compare labels to predictions.
cm = confusion_matrix(test_label_names1, pred2)


tp = cm[1, 1]
fp = cm[0, 1]
tn = cm[0, 0]
fn = cm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)

report = classification_report(test_label_names1, pred2, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
# raw string so backslashes in the Windows path cannot act as escapes
df2.to_csv(r"F:\Finaldata\mydataset\data\gofwordlogisticeport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")


# was: a bare `train_c,train_label_names1` expression — both names are only
# defined below, so the line raised NameError; it has been removed.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold


# Hold out 10% as a final test split, then run 10-fold stratified CV on the rest.
X_train = data22
Y_train = targets


from sklearn.model_selection import train_test_split
train_c, test_c, train_label_names1, test_label_names1 = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)
x_c = np.concatenate((X_train, Y_train), axis=1)
kf = StratifiedKFold(10, shuffle=True, random_state=42)
cos_y = []
cos_pred = []

oos_y = []      # out-of-sample labels per fold
oos_pred = []   # out-of-sample predictions per fold
fold = 0
acc = []        # per-fold accuracy
# 10-fold CV: a fresh CNN is built per fold (the frozen embedding layer is
# shared). NOTE(review): the original dedented the reporting lines mid-loop
# and then re-indented (IndentationError); the fold bookkeeping is restored
# inside the loop and the reporting moved after it. The report also compared
# against the stale `pred2` — it now uses this loop's `pred1`.
for train, test in kf.split(train_c, train_label_names1):

    pred1 = []
    fold += 1
    print(f"Fold#{fold}")

    x_train = train_c[train]
    y_train = train_label_names1[train]
    x_test = train_c[test]
    y_test = train_label_names1[test]

    input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = embedding_layer(input_)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)

    output = Dense(len(possible_labels), activation='sigmoid')(x)

    model = Model(input_, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy']
    )

    model.fit(x_train, y_train, validation_data=(x_test, y_test), verbose=0,
              batch_size=BATCH_SIZE, epochs=EPOCHS)

    # threshold sigmoid outputs at 0.5
    pred = model.predict(x_test)
    for p in pred:
        pred1.append(1 if p >= .5 else 0)

    oos_y.append(y_test)
    oos_pred.append(pred1)
    score = metrics.accuracy_score(y_test, pred1)
    acc.append(score)
    # fixed typo in the message ("Accurayc" -> "Accuracy")
    print(f"Fold score (Accuracy)#{score}")

target_names = ['0', '1']
# NOTE(review): this report compares the final hold-out labels against the
# LAST fold's predictions, which generally differ in length — confirm the
# intended pairing (oos_y/oos_pred hold the per-fold results).
print(classification_report(test_label_names1, pred1, target_names=target_names))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_label_names1, pred1)

tp = cm[1, 1]
fp = cm[0, 1]
tn = cm[0, 0]
fn = cm[1, 0]
print(tp, fp, tn, fn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f_measure = (2 * recall * precision) / (recall + precision)
print(precision, recall, accuracy, f_measure)
import tensorflow as tf

# Mean cross-validation accuracy across the folds collected above.
n = np.asarray(acc, dtype='float32')

print(n.mean())

# Score the final hold-out split with the last fold's model.
pred = model.predict(test_c)
pred2 = []
for i in pred:
    pred2.append(1 if i >= .5 else 0)
target_names = ['Not offensive', 'offensive']
print(classification_report(test_label_names1, pred2, target_names=target_names))

report = classification_report(test_label_names1, pred2, target_names=target_names, output_dict=True)
# Concatenate the per-fold out-of-sample labels/predictions for a global score.
cos_y = np.concatenate(oos_y)
cos_pred = np.concatenate(oos_pred)

score = np.sqrt(metrics.mean_squared_error(cos_pred, cos_y))

print(f"Final out of sample score(RMSE)#{score}")




def built_classifier1():
    """Build the frozen-embedding 1-D CNN used by KerasClassifier below.

    NOTE(review): the original commented out this `def` line, leaving its
    body as misindented top-level code (IndentationError) and making the
    later `build_fn=built_classifier1` a NameError; the definition is
    restored with a consistent indent.
    """
    embedding_layer = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix1],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False
    )
    input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x = embedding_layer(input_)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation='relu')(x)

    output = Dense(len(possible_labels), activation='sigmoid')(x)
    model = Model(input_, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['accuracy']
    )
    return model
X_train=data22
Y_train=targets
# Wrap the builder for sklearn-style 10-fold cross-validation.
# NOTE(review): built_classifier1's `def` is commented out above — this line
# raises NameError unless that definition is restored.
classifier= KerasClassifier(build_fn=built_classifier1,batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=VALIDATION_SPLIT)
#train_c,train_label_names1
#train_c,test_c,train_label_names1,test_label_names1
accuracies=cross_val_score(classifier,X=train_c,y=train_label_names1,cv=10)

#print(n.mean())
print(accuracies.mean())
print(accuracies.std())

# Score the hold-out split: threshold sigmoid outputs at 0.5 and report.
pred = model.predict(test_c)
pred2 = []
for i in pred:
    pred2.append(1 if i >= .5 else 0)

target_names = ['Not offensive', 'offensive']
print(classification_report(test_label_names1, pred2, target_names=target_names))

# Persist the per-class report as a CSV table.
report = classification_report(test_label_names1, pred2, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\convulutinalport.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
# plot some data
# was: r.history — no variable `r` exists in this script; the Keras History
# object returned by fit() above is named `history`.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

plt.plot(history.history['accuracy'], label='acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()


from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from sklearn.model_selection import cross_val_score
print('Building model...')

# create an LSTM network with a single LSTM over the frozen embeddings
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
x = LSTM(15, return_sequences=True)(x)

#x = Bidirectional(LSTM(15, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
output = Dense(len(possible_labels), activation="sigmoid")(x)

# The original built and compiled this Model twice back-to-back (first with
# metrics=['accuracy'], then immediately re-compiled with Precision); only
# the last compile takes effect, so the dead first pair was removed.
model = Model(input_, output)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=0.01),
    metrics=[tf.keras.metrics.Precision()],
)

history = model.fit(
  data22,
  targets,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)
 model.fit(train_c,train_label_names1,validation_data=(test_c,test_label_names1),verbose=0,batch_size=BATCH_SIZE,
    epochs=EPOCHS)
    #train_c,test_c,train_label_names1,test_label_names1
    pred=model.predict(test_c)
    oos_y=[]
    pred1=[]
    for i in pred:
        if i>=.5 :
            pred1.append(1)
        else:
            pred1.append(0)
            
    
    oos_y.append(y_test)
    #pred1=np.argmax(pred)
    
    #np.argmax(prediction)
    oos_pred.append(pred1)
    #score=np.sqrt(metrics.mean_squared_error(pred,y_test))
    
    #y_compare=np.argmax(y_test)
    score=metrics.accuracy_score(y_test,pred1)
    acc1.append(score)
    print(f"Fold score (Accurayc)#{score}")

# Mean of the accuracies collected above.
n1 = np.asarray(acc1, dtype='float32')

print(n1.mean())

# Hold-out predictions thresholded at 0.5.
pred3 = model.predict(test_c)
pred4 = []
for i in pred3:
    pred4.append(1 if i >= .5 else 0)

target_names = ['Not offensive', 'offensive']
print(classification_report(test_label_names1, pred4, target_names=target_names))

# Persist the per-class report as a CSV table.
report = classification_report(test_label_names1, pred4, target_names=target_names, output_dict=True)
df2 = pd.DataFrame(report).transpose()
df2.to_csv("F:\Finaldata\mydataset\data\directlstm.csv", float_format='%.2f', na_rep="NAN!", encoding="utf-32")
print(embedding_matrix1.shape[0])

# Parenthesized: the original continued this import across a newline after a
# bare trailing comma, which is a SyntaxError.
from keras.layers import (Input, Dense, Embedding, Concatenate, Activation,
                          Dropout, LSTM, Bidirectional, GlobalMaxPooling1D, GaussianNoise)
from keras.models import Model

#from __future__ import print_function
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras.models import Sequential
from keras.layers import LSTM, GRU, Embedding, Dense, Flatten, Bidirectional
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
import numpy as np

def classifier_new():
    """Stacked bidirectional-LSTM classifier over the frozen embedding matrix.

    Returns a compiled Sequential model with a single sigmoid output.
    """
    net = Sequential()
    net.add(Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix1],
                      input_length=MAX_SEQUENCE_LENGTH, trainable=False))
    net.add(Bidirectional(LSTM(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)))
    net.add(Bidirectional(LSTM(256, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)))
    net.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.01),
        metrics=['accuracy'],
    )
    return net


def buildModel(embeddings_matrix1, MAX_SEQUENCE_LENGTH, lstm_dim, hidden_layer_dim, num_classes, 
               noise=0.1, dropout_lstm=0.2, dropout=0.2):
    """Three-turn conversational classifier: one frozen embedding + BiLSTM per turn.

    Turn 1 and turn 3 share the same BiLSTM (lstm1); turn 2 uses its own (lstm2).
    NOTE(review): these BiLSTMs do not set return_sequences, so their outputs
    are 2-D; applying GlobalMaxPool1D after the Dense below looks incompatible
    with a 2-D tensor — confirm this function actually runs.
    """
    turn1_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
    turn2_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
    turn3_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedding_dim = embeddings_matrix1.shape[1]
   
    # frozen pre-trained embeddings shared across all three turns
    embeddingLayer = Embedding(embeddings_matrix1.shape[0],
                                embedding_dim,
                                weights=[embeddings_matrix1],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    
    turn1_branch = embeddingLayer(turn1_input)
    turn2_branch = embeddingLayer(turn2_input) 
    turn3_branch = embeddingLayer(turn3_input) 
    
    # additive Gaussian noise as regularization on the embedded sequences
    turn1_branch = GaussianNoise(noise, input_shape=(None, MAX_SEQUENCE_LENGTH, embedding_dim))(turn1_branch)
    turn2_branch = GaussianNoise(noise, input_shape=(None, MAX_SEQUENCE_LENGTH, embedding_dim))(turn2_branch)
    turn3_branch = GaussianNoise(noise, input_shape=(None, MAX_SEQUENCE_LENGTH, embedding_dim))(turn3_branch)

    lstm1 = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    lstm2 = Bidirectional(LSTM(lstm_dim, dropout=dropout_lstm))
    
    # lstm1 is intentionally reused for turns 1 and 3 (shared weights)
    turn1_branch = lstm1(turn1_branch)
    turn2_branch = lstm2(turn2_branch)
    turn3_branch = lstm1(turn3_branch)
    
    x = Concatenate(axis=-1)([turn1_branch, turn2_branch, turn3_branch])
    
    x = Dropout(dropout)(x)
    
    x = Dense(hidden_layer_dim, activation='relu')(x)
    x = GlobalMaxPool1D()(x)
    
    output = Dense(num_classes, activation='softmax')(x)
    
    model = Model(inputs=[turn1_input, turn2_input, turn3_input], outputs=output)
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    
    return model
# 10-fold CV of the stacked BiLSTM classifier on the full padded data.
classifier1= KerasClassifier(build_fn=classifier_new,batch_size=BATCH_SIZE,epochs=EPOCHS,validation_split=VALIDATION_SPLIT)
accuracies1=cross_val_score(classifier1,X=X_train,y=Y_train,cv=10)
print(accuracies1.mean())
print(accuracies1.std())

import model_evaluation_utils as meu

# plot some data
# was: r.history — no variable `r` exists in this script; the Keras History
# object returned by fit() above is named `history`.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

# accuracies
plt.plot(history.history['accuracy'], label='acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()

# ROC-AUC of the current model on the full padded data (first label column).
kl = targets[:, 0]
p = model.predict(data22)
kl1 = p[:, 0]
aucs = []
auc = roc_auc_score(targets[:, 0], p[:, 0])
aucs.append(auc)
print(np.mean(aucs))

#model performance repport

from sklearn import metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.metrics import roc_curve, auc 

def get_metrics(true_labels, predicted_labels):
    """Print accuracy and weighted precision/recall/F1, rounded to 4 decimals."""
    acc = metrics.accuracy_score(true_labels, predicted_labels)
    prec = metrics.precision_score(true_labels, predicted_labels, average='weighted')
    rec = metrics.recall_score(true_labels, predicted_labels, average='weighted')
    f1 = metrics.f1_score(true_labels, predicted_labels, average='weighted')
    print('Accuracy:', np.round(acc, 4))
    print('Precision:', np.round(prec, 4))
    print('Recall:', np.round(rec, 4))
    print('F1 Score:', np.round(f1, 4))
                        
def train_predict_model(classifier, 
                        train_features, train_labels, 
                        test_features, test_labels):
    """Fit `classifier` on the training data and return its test-set predictions.

    `test_labels` is accepted for signature compatibility but is not used.
    """
    classifier.fit(train_features, train_labels)
    return classifier.predict(test_features)

def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
    """Print the confusion matrix as a labelled Predicted/Actual DataFrame.

    NOTE(review): classes=[1,0] is a mutable default; it is never mutated
    here, so it is kept for interface compatibility.
    """
    total_classes = len(classes)
    level_labels = [total_classes*[0], list(range(total_classes))]

    cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, 
                                  labels=classes)
    # pd.MultiIndex's `labels=` keyword was renamed to `codes=` and removed in
    # pandas 1.0; `codes=` works on pandas >= 0.24.
    cm_frame = pd.DataFrame(data=cm, 
                            columns=pd.MultiIndex(levels=[['Predicted:'], classes], 
                                                  codes=level_labels), 
                            index=pd.MultiIndex(levels=[['Actual:'], classes], 
                                                codes=level_labels)) 
    print(cm_frame) 
    
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    """Print sklearn's per-class precision/recall/F1 report for `classes`."""
    print(metrics.classification_report(y_true=true_labels,
                                        y_pred=predicted_labels,
                                        labels=classes))

def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
    """Print overall metrics, the classification report, and the confusion matrix."""
    separator = '-' * 30
    print('Model Performance metrics:')
    print(separator)
    get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
    print('\nModel Classification report:')
    print(separator)
    display_classification_report(true_labels=true_labels,
                                  predicted_labels=predicted_labels,
                                  classes=classes)
    print('\nPrediction Confusion Matrix:')
    print(separator)
    display_confusion_matrix(true_labels=true_labels,
                             predicted_labels=predicted_labels,
                             classes=classes)
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    """Fit a clone of *clf* on 2-D features and plot its decision
    surface with the training points scattered on top.

    Parameters
    ----------
    clf : estimator
        Unfitted sklearn-style estimator; a clone is fitted internally
        so the caller's object is left untouched.
    train_features : ndarray of shape (n_samples, 2)
        Exactly two feature columns (required for a 2-D surface).
    train_labels : array-like
        Class label per training sample.
    plot_step : float
        Mesh-grid resolution; also used as the plot margin.
    cmap : matplotlib colormap
        Colormap for the contour surface.
    markers, alphas, colors : sequences or None
        Optional per-class scatter styling (one entry per class).

    Raises
    ------
    ValueError
        If *train_features* does not have exactly 2 columns.
    """
    if train_features.shape[1] != 2:
        # BUG FIX: corrected 'columnns' typo in the error message.
        raise ValueError("X_train should have exactly 2 columns!")

    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    clf_est = clone(clf)
    clf_est.fit(train_features, train_labels)
    grid = np.c_[xx.ravel(), yy.ravel()]
    if hasattr(clf_est, 'predict_proba'):
        # Positive-class probability gives a smooth surface when available.
        Z = clf_est.predict_proba(grid)[:, 1]
    else:
        Z = clf_est.predict(grid)
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=cmap)

    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    # BUG FIX: the original did ''.join(colors), which flattens multi-char
    # color names (e.g. 'red', 'blue') into single characters and assigns
    # wrong colors per class; use the sequence as given.
    plot_colors = list(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black',
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
    
    
    def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
    
    ## Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    if hasattr(clf, 'classes_'):
        class_labels = clf.classes_
    elif label_encoder:
        class_labels = label_encoder.classes_
    elif class_names:
        class_labels = class_names
    else:
        raise ValueError('Unable to derive prediction classes, please specify class_names!')
    n_classes = len(class_labels)
    y_test = label_binarize(true_labels, classes=class_labels)
    if n_classes == 2:
        if hasattr(clf, 'predict_proba'):
            prob = clf.predict_proba(features)
            y_score = prob[:, prob.shape[1]-1] 
        elif hasattr(clf, 'decision_function'):
            prob = clf.decision_function(features)
            y_score = prob[:, prob.shape[1]-1]
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
        
        fpr, tpr, _ = roc_curve(y_test, y_score)      
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'
                                 ''.format(roc_auc),
                 linewidth=2.5)
        
    elif n_classes > 2:
        if hasattr(clf, 'predict_proba'):
            y_score = clf.predict_proba(features)
        elif hasattr(clf, 'decision_function'):
            y_score = clf.decision_function(features)
        else:
            raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")

        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        ## Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        ## Compute macro-average ROC curve and ROC area
        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += interp(all_fpr, fpr[i], tpr[i])
        # Finally average it and compute AUC
        mean_tpr /= n_classes
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        ## Plot ROC curves
        plt.figure(figsize=(6, 4))
        plt.plot(fpr["micro"], tpr["micro"],
                 label='micro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["micro"]), linewidth=3)

        plt.plot(fpr["macro"], tpr["macro"],
                 label='macro-average ROC curve (area = {0:0.2f})'
                       ''.format(roc_auc["macro"]), linewidth=3)

        for i, label in enumerate(class_labels):
            plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                           ''.format(label, roc_auc[i]), 
                     linewidth=2, linestyle=':')
    else:
        raise ValueError('Number of classes should be atleast 2 or more')
        
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
    
