
from audioop import add
import json
import os
from re import M
import time
from matplotlib import pyplot as plt
from pylab import figure #
from pandas import DataFrame #store data in dataframe
import pandas as pd #importing data from excel to dataframe
import seaborn as sns # 2d pca plot
"""
from sklearn.preprocessing import StandardScaler #standardizing the data
from sklearn.decomposition import PCA #principal component analysis
from sklearn.cluster import AgglomerativeClustering # hierarchical clustering method
from sklearn.cluster import KMeans #clustering method
from sklearn.cluster import SpectralClustering #clustering method
"""
import scipy.stats as stats
import diptest
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Sections of a features.json that contribute columns, given as
# (JSON key, column-name prefix, skip entries that lack a 'feature' key).
# The "global" section is read without that check, so an entry without a
# 'feature' key raises KeyError there.
_FEATURE_SECTIONS = (
    ("rheobase_current", "rheobase_", True),
    ("steady_state_current", "steady_state_", True),
    ("maxspike_current", "maxspike_", True),
    ("rheobase_prev_current", "rheobase_prev_", True),
    ("standard_negative_current", "standard_negative_", True),
    ("global", "global_", False),
)


def _read_cell_ids(list_path):
    """Return the first 10 characters of every line in *list_path*."""
    with open(list_path) as handle:
        return [line[:10] for line in handle]


def _subdirectories(parent):
    """Yield (name, path) for each directory located directly in *parent*."""
    for name in os.listdir(parent):
        path = parent + '/' + name
        if os.path.isdir(path):
            yield name, path


def _measurement_row(data, measurement, cell_type):
    """Build the one-row DataFrame describing one parsed features.json."""
    names = ['#name']
    values = [measurement]
    for section, prefix, needs_feature_key in _FEATURE_SECTIONS:
        for entry in data[section]['soma']:
            if needs_feature_key and 'feature' not in entry:
                continue
            names.append(prefix + entry['feature'])
            # Only the first element of 'val' is stored.
            values.append(entry['val'][0])
    names.append('#cell_type')
    values.append(cell_type)
    return DataFrame([values], columns=names)


def _stack_rows(rows):
    """Concatenate one-row frames; an empty list gives an empty DataFrame."""
    if not rows:
        return DataFrame()
    return pd.concat(rows, ignore_index=True)


def make_data_table(input_directory):
    """Collect every features.json below *input_directory* into one table.

    The directory tree is walked as protocol / cell / measurement / channel
    and each ``features.json`` found in a channel directory becomes one row.
    Columns are the prefixed feature names plus ``#name`` (the measurement
    directory name) and ``#cell_type``.

    Side effects:
      * reads ``OLM_list.txt`` and ``HS_list.txt`` from the module-level
        ``folder_name``;
      * writes ``<protocol>_df.txt`` (tab separated, relative path) for every
        entry of *input_directory*;
      * writes ``df_data_bymeas_toModify.xlsx`` into ``folder_name``.

    Args:
        input_directory: root directory of the feature-extraction output.

    Returns:
        DataFrame with one row per measurement channel, columns sorted by name.
    """
    # OLM_list.txt is still opened so a missing file is reported as before,
    # but its content is not consulted: every cell that is not listed in
    # HS_list.txt is labelled 'OLM'.
    _read_cell_ids(folder_name + 'OLM_list.txt')
    hs_cells = set(_read_cell_ids(folder_name + 'HS_list.txt'))

    all_rows = []
    for protocol in os.listdir(input_directory):
        protocol_dir = input_directory + '/' + protocol
        protocol_rows = []
        if os.path.isdir(protocol_dir):
            for cell, cell_dir in _subdirectories(protocol_dir):
                cell_type = 'HS' if cell in hs_cells else 'OLM'
                for measurement, measurement_dir in _subdirectories(cell_dir):
                    for _channel, channel_dir in _subdirectories(measurement_dir):
                        feature_json = channel_dir + '/features.json'
                        try:
                            with open(feature_json) as f:
                                data = json.load(f)
                        except FileNotFoundError:
                            print("[ERROR][{}] Protocol file not found. Skipping measurement.".format(cell))
                            continue
                        print(measurement_dir)

                        row = _measurement_row(data, measurement, cell_type)
                        protocol_rows.append(row)
                        all_rows.append(row)

        # NOTE: as in the original flow, a table is written for every entry
        # of input_directory, so a stray non-directory entry yields an empty
        # "<entry>_df.txt".
        # Rows are combined once with pd.concat because DataFrame.append was
        # removed in pandas 2.0.
        df_data_prot_bymeas = _stack_rows(protocol_rows).sort_index(axis=1)
        df_data_prot_bymeas.to_csv(protocol + "_df.txt", sep='\t')

    # Sort the columns by name.
    df_data_bymeas = _stack_rows(all_rows).sort_index(axis=1)
    df_data_bymeas.to_excel(folder_name + "df_data_bymeas_toModify.xlsx")

    return df_data_bymeas

# Substrings that mark a feature as meaningless wherever they occur.
_USELESS_FRAGMENTS = (
    'is_not_stuck', 'numspikes', 'indices', 'trace_check', 'peak_time',
    'irregularity_index', 'initburst', 'rate_change',
    'BPAP', 'AIS', 'BAC', 'phaseslope', 'interburst', 'AP_begin_time',
)

# Spike-related substrings removed from the 'rheobase_prev' features.
_SPIKE_FRAGMENTS = (
    'AP', 'ISI', 'burst', 'AHP', 'adaptation', 'amp_drop', 'spike',
    'frequency', 'peak',
)

# 'standard_negative' features additionally lose 'Spikecount'.
_STANDARD_NEGATIVE_FRAGMENTS = _SPIKE_FRAGMENTS + ('Spikecount',)

# Features removed by exact name.
_USELESS_EXACT = frozenset([
    # meaningless features in general
    'rheobase_time', 'rheobase_prev_time', 'steady_state_time', 'maxspike_time', 'standard_negative_time',
    'rheobase_voltage', 'rheobase_prev_voltage', 'steady_state_voltage', 'maxspike_voltage', 'standard_negative_voltage',
    'rheobase_time_constant', 'rheobase_prev_time_constant', 'steady_state_time_constant', 'maxspike_time_constant',
    'standard_negative_time_constant', 'rheobase_burst_mean_freq',
    'rheobase_prev_max_amp_difference', 'standard_negative_max_amp_difference',
    # meaningless features at rheobase, where it is ok to have only one AP
    'rheobase_AHP2_depth_from_peak', 'rheobase_AHP_depth_diff', 'rheobase_AP2_AP1_begin_width_diff',
    'rheobase_AP2_AP1_diff', 'rheobase_AP2_AP1_peak_diff', 'rheobase_AP2_amp',
    'rheobase_AP2_begin_voltage', 'rheobase_AP2_begin_width', 'rheobase_AP2_peak', 'rheobase_AP2_width',
    'rheobase_AP_amplitude_change', 'rheobase_AP_amplitude_diff',
    'rheobase_AP_duration_change', 'rheobase_AP_duration_half_width_change', 'rheobase_AP_fall_rate_change',
    'rheobase_AP_rise_rate_change',
    'rheobase_adaptation_index', 'rheobase_adaptation_index2', 'rheobase_amp_drop_first_last',
    'rheobase_amp_drop_first_second', 'rheobase_amp_drop_second_last',
    'rheobase_fast_AHP_change', 'rheobase_max_amp_difference', 'rheobase_min_voltage_between_spikes',
    'rheobase_time_to_second_spike',
])


def _contains_any(text, fragments):
    """Return True when at least one of *fragments* occurs in *text*."""
    return any(fragment in text for fragment in fragments)


def _is_useless_feature(feat):
    """Return True when the column name *feat* matches any removal rule."""
    if feat in _USELESS_EXACT or _contains_any(feat, _USELESS_FRAGMENTS):
        return True
    if 'standard_negative' in feat:
        if _contains_any(feat, _STANDARD_NEGATIVE_FRAGMENTS):
            return True
    elif 'sag' in feat:
        # 'sag' features are kept only for the standard_negative protocol.
        return True
    if 'steady_state' in feat and _contains_any(feat, ('Spikecount', 'ohmic')):
        return True
    if 'rheobase_prev' in feat and _contains_any(feat, _SPIKE_FRAGMENTS):
        return True
    return False


def remove_useless_features(df_data_bymeas, save_dir):
    """Drop the feature columns that the removal rules mark as meaningless.

    Every column name is tested once, so a feature that matches several rules
    is listed (and counted) a single time.

    Side effects: prints the feature counts, writes the removed names to
    ``removed_useless_features.txt`` and the reduced table to
    ``df_data_bymeas_without_useless_features_toModify.xlsx`` in *save_dir*.

    Args:
        df_data_bymeas: feature table; it is not modified.
        save_dir: output directory prefix, concatenated directly with the
            file names (so it should end with a path separator).

    Returns:
        A new DataFrame without the removed columns.
    """
    removed_features = [feat for feat in df_data_bymeas.columns
                        if _is_useless_feature(feat)]

    df_data_bymeas_without_useless_features = df_data_bymeas.drop(columns=removed_features)

    # The first two columns are skipped in the counts (the '#' label columns
    # when the table is column-sorted).
    print('Number of ALL features: ', len(df_data_bymeas.columns[2:]))
    print('Number of REMOVED features: ', len(removed_features))
    print('Number of features KEPT: ', len(df_data_bymeas_without_useless_features.columns[2:]))

    with open(save_dir + 'removed_useless_features.txt', 'w') as f:
        f.write('\n'.join(removed_features))

    df_data_bymeas_without_useless_features.to_excel(save_dir + "df_data_bymeas_without_useless_features_toModify.xlsx")

    return df_data_bymeas_without_useless_features

def check_missing_values(df_data_bymeas, save_dir):
    """Count, save and plot the missing values of the feature table.

    Per-feature counts (first two columns skipped) are written for the whole
    table and for the 'OLM' and 'HS' rows, then drawn as two bar charts: one
    with every feature and one restricted to the features that have at least
    one missing value in the whole table. Per-row counts are written last.

    Args:
        df_data_bymeas: feature table with a '#cell_type' column.
        save_dir: output directory prefix, concatenated with the file names.
    """
    cell_types = df_data_bymeas['#cell_type']

    def count_by_feature(frame, file_name):
        # Skip the first two entries (name and group columns).
        counts = frame.isna().sum()[2:]
        counts.to_csv(save_dir + file_name, sep='\t')
        return counts

    def draw_overlaid_bars(overall, olm, hs):
        plt.figure()
        for series, colour, label in ((overall, 'b', 'all'),
                                      (olm, '#7E2F8E', 'OLM'),
                                      (hs, '#77AC30', 'HS')):
            plt.bar(range(len(series.values)), series.values, color=colour, label=label)
        plt.xticks(range(len(overall.index)), overall.index, rotation=90)
        plt.ylabel('Number of missing values')
        plt.legend()

    # --- Missing values by features ---
    total_counts = count_by_feature(df_data_bymeas, "count_missing_values_all.txt")
    # Positions of the features that do have missing values.
    affected = np.nonzero(total_counts.values)[0]
    total_affected = total_counts.iloc[affected]
    print('Number of features: ', len(total_counts))
    print('Number of features with missing values: ', len(total_affected))

    olm_counts = count_by_feature(df_data_bymeas.loc[cell_types == 'OLM'],
                                  "count_missing_values_OLM.txt")
    olm_affected = olm_counts.iloc[affected]

    hs_counts = count_by_feature(df_data_bymeas.loc[cell_types == 'HS'],
                                 "count_missing_values_HS.txt")
    hs_affected = hs_counts.iloc[affected]

    draw_overlaid_bars(total_counts, olm_counts, hs_counts)
    draw_overlaid_bars(total_affected, olm_affected, hs_affected)

    # --- Missing values by cells ---
    per_row_counts = df_data_bymeas.isna().sum(axis=1)
    per_row_counts.to_csv(save_dir + "count_missing_values_by_cells_all.txt", sep='\t')
    

def dataframe_bygroups(filename, save_dir):
    """Split the Excel table *filename* by '#cell_type' into two workbooks.

    Rows labelled 'OLM' are written to ``df_OLM.xlsx`` and rows labelled 'HS'
    to ``df_HS.xlsx``; both file names are appended directly to *save_dir*.
    """
    table = pd.read_excel(filename, index_col=0)  # the full table
    cell_types = table['#cell_type']
    olm_rows = table.loc[cell_types == 'OLM']
    hs_rows = table.loc[cell_types == 'HS']
    olm_rows.to_excel(save_dir + "df_OLM.xlsx")
    hs_rows.to_excel(save_dir + "df_HS.xlsx")

def plot_feat_values(df_path, save_dir):
    """Plot every feature of an Excel table as dots, 25 features per figure.

    Each figure is a 5x5 grid; one subplot shows the values of one feature
    against the row position. Figures are saved as
    ``feature_values_<page>`` under *save_dir*, which is created if missing.
    Figures are left open so that a later ``plt.show()`` can display them.

    Args:
        df_path: path of the Excel table; its first two columns are skipped.
        save_dir: output directory prefix, concatenated with the file names.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    df = pd.read_excel(df_path, index_col=0)
    feats = df.columns[2:]
    print(feats)

    per_page = 25
    # Stepping over range(0, len(feats)) avoids the extra, completely empty
    # figure that was saved when the feature count was a multiple of 25.
    for page, start in enumerate(range(0, len(feats), per_page)):
        fig, axs = plt.subplots(5, 5, figsize=(5*4, 5*4))
        fig.subplots_adjust(wspace=0.5, hspace=0.6)

        # zip stops at the shorter input, so a partial last page simply
        # leaves its remaining axes blank.
        for ax, feat in zip(axs.flatten(), feats[start:start + per_page]):
            ax.plot(range(len(df[feat])), df[feat], '.')
            ax.title.set_text(feat)

        plt.savefig(save_dir + 'feature_values_' + str(page))
        

def define_outliers(load_name, save_name):
    """Flag outliers of every feature column and save the flags to Excel.

    Outlier search based on the Heiner Vivien thesis: a value is flagged
    with 1 when its distance from the column median is more than 10 times
    the median absolute deviation of the column, with 0 otherwise, and left
    empty when the comparison is undecidable (e.g. a missing value).
    The '#name' and '#cell_type' columns are copied unchanged.

    Args:
        load_name: path of the Excel table to analyse.
        save_name: path of the Excel file receiving the 0/1 flag table.
    """
    label_columns = ('#name', '#cell_type')
    flags = pd.read_excel(load_name, index_col=0).copy()  # using full table

    for column in flags.columns:
        if column in label_columns:
            continue

        centre = flags[column].median()
        # Median absolute deviation around the column median.
        spread = abs(flags[column] - centre).median()

        for row in flags.index:
            distance = abs(centre - flags.at[row, column])
            if spread * 10 < distance:
                flag = 1
            elif spread * 10 >= distance:
                flag = 0
            else:
                # Both comparisons are False when a NaN is involved.
                flag = None
            flags.at[row, column] = flag

    flags.to_excel(save_name)
    
def process_outliers_by_features(df_path, save_path_count, save_path_suspicious):
    """Count the outlier flags per feature and list the suspicious features.

    The column sums of the flag table (first two columns skipped) are saved
    to *save_path_count*; the features whose count is above the mean count
    are saved to *save_path_suspicious*. Both files are tab separated.

    Args:
        df_path: path of the Excel flag table.
        save_path_count: output path of the per-feature counts.
        save_path_suspicious: output path of the above-mean features.
    """
    flags = pd.read_excel(df_path, index_col=0)

    # Count outliers by features, leaving out the name and group entries.
    per_feature = flags.sum().iloc[2:]
    per_feature.to_csv(save_path_count, sep='\t')

    average = per_feature.values.mean()
    print(average)

    above_average = per_feature[per_feature > average]
    above_average.to_csv(save_path_suspicious, sep='\t')
    
def process_outliers_by_cells(df_path, save_path_count, save_path_suspicious):
    """Count the outlier flags per cell and list the suspicious cells.

    The row sums of the flag table are saved to *save_path_count*, indexed by
    the '#name' column; the cells whose count is above the mean count are
    saved to *save_path_suspicious*. Both files are tab separated.

    Args:
        df_path: path of the Excel flag table (must contain '#name').
        save_path_count: output path of the per-cell counts.
        save_path_suspicious: output path of the above-mean cells.
    """
    df = pd.read_excel(df_path, index_col=0)

    # Count outliers by cells. numeric_only=True keeps the text label columns
    # out of the row sums: pandas < 2.0 dropped them implicitly, while
    # pandas >= 2.0 raises TypeError when asked to add strings and numbers.
    count_outliers = df.sum(axis=1, numeric_only=True)
    count_outliers.index = df['#name']
    count_outliers.to_csv(save_path_count, sep='\t')

    mean = count_outliers.values.mean()
    print(mean)

    suspicious_cells = count_outliers[count_outliers > mean]
    suspicious_cells.to_csv(save_path_suspicious, sep='\t')

def mannwhitney_test(df_path, save_dir):
    """Run an exact Mann-Whitney U test on every feature, OLM versus HS.

    The table is split by '#cell_type'; every column from the third one on
    is tested with NaNs omitted. The feature names and p-values, sorted by
    ascending p-value, are written to ``mwu_OLM_HS.xlsx`` under *save_dir*.

    Args:
        df_path: path of the Excel feature table.
        save_dir: output directory prefix, concatenated with the file name.
    """
    table = pd.read_excel(df_path, index_col=0)

    olm_rows = table.loc[table['#cell_type'] == 'OLM']
    hs_rows = table.loc[table['#cell_type'] == 'HS']

    outcomes = []
    for position in range(2, len(olm_rows.columns)):
        # Single-column frames, so the test returns a length-1 p-value array.
        olm_column = olm_rows.iloc[:, [position]]
        hs_column = hs_rows.iloc[:, [position]]
        result = stats.mannwhitneyu(x=olm_column, y=hs_column,
                                    method='exact', nan_policy='omit')
        outcomes.append((olm_column.columns[0], result.pvalue[0]))

    summary = pd.DataFrame({
        'Feature': [name for name, _ in outcomes],
        'p_value': [p_value for _, p_value in outcomes],
    })
    summary = summary.sort_values(by=['p_value'])

    summary.to_excel(save_dir + "mwu_OLM_HS.xlsx")

def boxplot_feat_values(df_path, mwu_df_path, save_dir):
    """Draw OLM-versus-HS box plots of every tested feature, 16 per figure.

    Features are taken from the Mann-Whitney result table so they are plotted
    in the order of that table (ascending p-value when it was saved sorted).
    Each 4x4 figure is saved as ``boxplot_feature_values_<page>`` under
    *save_dir*, which is created if missing. Figures are left open so that a
    later ``plt.show()`` can display them.

    Args:
        df_path: path of the Excel feature table with a '#cell_type' column.
        mwu_df_path: path of the Excel table with 'Feature' and 'p_value'.
        save_dir: output directory prefix, concatenated with the file names.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    mwu_df = pd.read_excel(mwu_df_path, index_col=0)
    feats = mwu_df['Feature'].values
    print(len(feats))
    print(feats)

    df = pd.read_excel(df_path, index_col=0)
    is_olm = df['#cell_type'] == 'OLM'
    is_hs = df['#cell_type'] == 'HS'

    per_page = 16
    # Stepping over range(0, len(feats)) avoids the extra, completely empty
    # figure that was saved when the feature count was a multiple of 16.
    for page, start in enumerate(range(0, len(feats), per_page)):
        fig, axs = plt.subplots(4, 4, figsize=(4*5, 4*4))
        fig.subplots_adjust(wspace=0.8, hspace=0.6)

        # zip stops at the shorter input, so a partial last page simply
        # leaves its remaining axes blank.
        for ax, feat in zip(axs.flatten(), feats[start:start + per_page]):
            olm_values = df.loc[is_olm, feat]
            hs_values = df.loc[is_hs, feat]
            p_value = mwu_df.loc[mwu_df['Feature'] == feat, 'p_value'].item()

            # Raw points first, then the box plot on top; NaNs are removed
            # for the box plot only.
            ax.plot([1]*len(olm_values), olm_values, 'm.')
            ax.plot([2]*len(hs_values), hs_values, 'g.')
            ax.boxplot([olm_values.dropna(), hs_values.dropna()])
            ax.title.set_text(feat + '\n' + 'p value: ' + str(p_value))
            ax.set_xticks((1, 2))
            ax.set_xticklabels(('OLM', 'HS'))

        plt.savefig(save_dir + 'boxplot_feature_values_' + str(page))

def barchart_mwu_p_values(mwu_df_path, save_dir):
    """Plot the significant Mann-Whitney p-values with correction thresholds.

    Bars show the p-values that are <= 0.05, in table order. Horizontal lines
    mark 0.05, 0.01 and the Bonferroni, Sidak and Benjamini-Hochberg
    thresholds computed over all tests; short magenta segments mark the
    Holm-Bonferroni threshold of each bar, which assumes the table is sorted
    by ascending p-value. Nothing is saved; the figure is left open.

    Args:
        mwu_df_path: path of the Excel table with 'Feature' and 'p_value'.
        save_dir: unused; kept for call compatibility.
    """
    alpha = 0.05
    mwu_df = pd.read_excel(mwu_df_path, index_col=0)

    mwu_df_significant = mwu_df.loc[mwu_df['p_value'] <= alpha]

    num_tests = len(mwu_df.Feature)

    bonferroni = alpha/num_tests
    sidak = 1-((1-alpha)**(1/num_tests))

    # Benjamini-Hochberg procedure: the largest ranked p-value p_(k) with
    # p_(k) <= k/m * alpha. The p-values are ranked here and indexed by
    # position; the table index keeps the pre-sort labels, so a label lookup
    # with a rank would pick the wrong row.
    ranked_p = np.sort(mwu_df['p_value'].to_numpy(dtype=float))
    bh_limits = np.arange(1, num_tests + 1)/num_tests*alpha
    passing = np.flatnonzero(ranked_p <= bh_limits)
    # No p-value may pass; the threshold line is then left out.
    benjamini_hochberg = ranked_p[passing[-1]] if passing.size else None

    bar_positions = range(len(mwu_df_significant.Feature))

    plt.figure()
    plt.bar(bar_positions, mwu_df_significant.p_value, color='b')
    plt.axhline(y=0.05, label='0.05', color='g')
    plt.axhline(y=0.01, label='0.01', color='c')
    plt.axhline(y=bonferroni, label=str(bonferroni) + ' (Bonferroni correction)', color='r')
    plt.axhline(y=sidak, label=str(sidak) + ' (Sidak correction)', color='orange')
    if benjamini_hochberg is not None:
        plt.axhline(y=benjamini_hochberg,
                    label=str(benjamini_hochberg) + ' (Benjamini_Hochberg correction)',
                    color='brown')
    plt.xticks(bar_positions, mwu_df_significant.Feature, rotation=90)
    plt.ylabel('p-value')

    for rank in range(1, len(mwu_df_significant.Feature)+1):
        holm_bonferroni = alpha/(num_tests+1-rank)
        # Only the first segment carries the legend entry.
        legend = {'label': 'Holm-Bonferroni correction'} if rank == 1 else {}
        plt.hlines(y=holm_bonferroni, xmin=rank-1-0.5, xmax=rank-1+0.5, color='m', **legend)

    plt.legend()
    
    
def add_p_value_to_table(df_path, mwu_df_path, save_dir):
    """Prepend a row of Mann-Whitney p-values to the feature table.

    The result table ('Feature', 'p_value') is turned into a single row whose
    columns are the feature names, labelled 'p_value' in '#name' and NaN in
    '#cell_type'. That row is stacked on top of the feature table and written
    to ``df_data_bymeas_without_useless_features_with_P_VALUES.xlsx`` under
    *save_dir*, which is created if missing.

    Args:
        df_path: path of the Excel feature table.
        mwu_df_path: path of the Excel table with 'Feature' and 'p_value'.
        save_dir: output directory prefix, concatenated with the file name.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    p_value_table = pd.read_excel(mwu_df_path, index_col=0)
    feature_table = pd.read_excel(df_path, index_col=0)

    # Bring the p-values to the same layout as the feature table so that
    # they can be concatenated to it.
    p_value_row = p_value_table.transpose().reset_index(drop=True)
    # Row 0 holds the feature names: promote it to the header ...
    p_value_row.columns = p_value_row.iloc[0]
    # ... and drop it from the data, leaving only the p-value row.
    p_value_row = p_value_row.iloc[1:]

    p_value_row.insert(0, '#name', ['p_value'])
    p_value_row.insert(0, '#cell_type', [np.nan])

    combined = pd.concat([p_value_row, feature_table], ignore_index=True)

    combined.to_excel(save_dir + "df_data_bymeas_without_useless_features_with_P_VALUES.xlsx")
    

def _nan_aware_summary(values):
    """Return (median, lower quartile, upper quartile, min, max) ignoring NaNs."""
    upper_quartile, lower_quartile = np.nanpercentile(values, [75, 25])
    return (np.nanmedian(values), lower_quartile, upper_quartile,
            np.nanmin(values), np.nanmax(values))


def statistics_feat_values(df_path, mwu_df_path, save_dir):
    """Append per-group descriptive statistics to the Mann-Whitney table.

    For every feature listed in the result table, the median, quartiles,
    minimum and maximum are computed separately for the 'OLM' and 'HS' rows,
    all of them ignoring NaNs. 'Percentage' is the HS median expressed as a
    percentage of the OLM median, or the string 'NaN' when the OLM median is
    zero. The extended table is written to ``mwu_OLM_HS_with_allstat.xlsx``
    under *save_dir*, which is created if missing.

    Args:
        df_path: path of the Excel feature table with a '#cell_type' column.
        mwu_df_path: path of the Excel table with 'Feature' and 'p_value'.
        save_dir: output directory prefix, concatenated with the file name.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    mwu_df = pd.read_excel(mwu_df_path, index_col=0)
    # Features are read from mwu_df so the statistics follow its row order.
    feats = mwu_df['Feature'].values

    df = pd.read_excel(df_path, index_col=0)
    is_olm = df['#cell_type'] == 'OLM'
    is_hs = df['#cell_type'] == 'HS'

    olm_summaries = []
    hs_summaries = []
    percentages = []
    for feat in feats:
        # np.nanmin / np.nanmax replace the builtin min / max, whose result
        # depends on where a NaN sits in the data (NaN if it comes first).
        olm_summary = _nan_aware_summary(df.loc[is_olm, feat])
        hs_summary = _nan_aware_summary(df.loc[is_hs, feat])
        olm_summaries.append(olm_summary)
        hs_summaries.append(hs_summary)

        median_olm = olm_summary[0]
        median_hs = hs_summary[0]
        if median_olm != 0:
            # HS as a percentage of OLM, as requested by Gabor.
            percentages.append((median_hs/median_olm) * 100)
        else:
            percentages.append('NaN')

    # Column headers in the same order as the tuples of _nan_aware_summary.
    headers = ('Median {}', 'InterQ {} min', 'InterQ {} max', 'Minimum {}', 'Maximum {}')
    for group, summaries in (('OLM', olm_summaries), ('HS', hs_summaries)):
        for position, header in enumerate(headers):
            mwu_df[header.format(group)] = [summary[position] for summary in summaries]

    mwu_df['Percentage'] = percentages
    mwu_df.to_excel(save_dir + "mwu_OLM_HS_with_allstat.xlsx")


# MAIN
# Pipeline driver: the stages below are switched on and off by
# (un)commenting their calls. Every stage reads the files written by the
# previous ones, so they are expected to be run in the order listed.

# Root folder of all inputs and outputs; make_data_table also reads it as a
# global to locate the cell-type lists.
folder_name="/mnt/csoport31-2/Modellezo_csapat/HS_OLM/Lucatol_Lucanak/statisztika_semi_final/"
data_name="all_features"

# --- Data processing ---

#df_data_bymeas = make_data_table(folder_name+data_name)
#df_data_bymeas_without_useless_features = remove_useless_features(df_data_bymeas, folder_name)
#check_missing_values(df_data_bymeas_without_useless_features, folder_name)
#dataframe_bygroups(folder_name + 'df_data_bymeas_without_useless_features_toModify.xlsx', folder_name)

# --- Plot features ---

#plot_feat_values(folder_name+"df_OLM.xlsx", folder_name+'feature_plots_OLM/')
#plot_feat_values(folder_name+"df_HS.xlsx", folder_name+'feature_plots_HS/')

# --- Outlier search ---

#define_outliers(folder_name + 'df_data_bymeas_without_useless_features_toModify.xlsx', folder_name+ 'outlier_df_bymeas_full.xlsx')

#define_outliers(folder_name + 'df_OLM.xlsx', folder_name + 'outlier_OLM.xlsx')
#define_outliers(folder_name + 'df_HS.xlsx', folder_name + 'outlier_HS.xlsx')

#process_outliers_by_features(folder_name + 'outlier_OLM.xlsx', folder_name + 'count_outliers_by_features_OLM.txt', folder_name + 'suspicious_features_OLM.txt')
#process_outliers_by_features(folder_name + 'outlier_HS.xlsx', folder_name + 'count_outliers_by_features_HS.txt', folder_name + 'suspicious_features_HS.txt')

#process_outliers_by_cells(folder_name + 'outlier_OLM.xlsx', folder_name + 'count_outliers_by_cells_OLM.txt', folder_name + 'suspicious_cells_OLM.txt')
#process_outliers_by_cells(folder_name + 'outlier_HS.xlsx', folder_name + 'count_outliers_by_cells_HS.txt', folder_name + 'suspicious_cells_HS.txt')


# --- Mann-Whitney test ---

#mannwhitney_test(folder_name + 'df_data_bymeas_without_useless_features_toModify.xlsx', folder_name)

# --- Box plots of features ---
#boxplot_feat_values(folder_name + 'df_data_bymeas_without_useless_features_toModify.xlsx', folder_name + 'mwu_OLM_HS.xlsx', folder_name + 'boxplots_feat_values/')
# NOTE: not yet done per animal, but those need to be reviewed because of the outliers.

#barchart_mwu_p_values(folder_name + 'mwu_OLM_HS.xlsx', folder_name)

#add_p_value_to_table(folder_name + 'df_data_bymeas_without_useless_features_toModify.xlsx', folder_name + 'mwu_OLM_HS.xlsx', folder_name)

statistics_feat_values(folder_name + 'df_data_bymeas_without_useless_features_toModify.xlsx', folder_name + 'mwu_OLM_HS.xlsx', folder_name)

# Display every figure that the enabled stages left open.
plt.show()




