# pre-processing :
# Three alternative ways to turn a VCF into a per-sample genotype matrix:
#
# USING BCFTOOLS -- split multi-allelic records into bi-allelic ones:
#   bcftools norm -m -any path/file.vcf -o path/output.vcf
#
# USING SCIKIT-ALLEL -- load genotype calls into a GenotypeArray:
#   import allel
#   callset = allel.read_vcf('path/output.vcf')
#   gt = allel.GenotypeArray(callset['calldata/GT'])
#
# USING VCFTOOLS -- write a 0/1/2 genotype matrix (note: --out takes an
# output prefix, not a file name; this produces path/out.012 etc.):
#   vcftools --vcf path/my.vcf --012 --out path/out

# processing :
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, recall_score, precision_score,
                             confusion_matrix, roc_auc_score, roc_curve,
                             auc, classification_report)

# 4-fold stratified cross-validation.
# x is the feature DataFrame and y the label vector
# (one hedged loading sketch appears at the end of this file).
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')

acc, f1, recall, pre = [], [], [], []
tprs = []   # per-fold TPRs interpolated onto a common FPR grid for the mean ROC
aucs = []
mean_fpr = np.linspace(0, 1, 100)

for fold, (train_index, test_index) in enumerate(cv.split(x, y), 1):
    X_train = x.iloc[train_index]
    y_train = y[train_index]
    X_test = x.iloc[test_index]
    y_test = y[test_index]

    # Oversample the minority class on the training folds only,
    # so the test fold keeps its original class distribution.
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    model = RandomForestClassifier()
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    print("---------------------")
    print(f'For fold {fold}:')
    print(f"Counter of y train before SMOTE = {Counter(y_train)}")
    print(f"Counter of y train after SMOTE = {Counter(y_train_oversampled)}")
    print(f"Counter of y test = {Counter(y_test)}")
    print("classification report:")
    print(classification_report(y_test, y_pred))

    acc.append(model.score(X_test, y_test))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    pre.append(precision_score(y_test, y_pred))
    print(f'confusion matrix: {confusion_matrix(y_test, y_pred)}')

    # Top 100 features by impurity-based importance
    print("features:")
    feature_names = X_train.columns
    important = pd.Series(model.feature_importances_,
                          index=feature_names).nlargest(100)
    print(important)

    prediction = model.predict_proba(X_test)
    print("predict_proba")
    print(prediction)

    # ROC curve for this fold, using the positive-class probability
    fpr, tpr, _ = roc_curve(y_test, prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (fold, roc_auc))
    print(f"AUC = {roc_auc_score(y_test, prediction[:, 1])}")

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f)' % mean_auc, lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

avg_accuracy = sum(acc) / len(acc)
avg_f1 = sum(f1) / len(f1)
avg_recall = sum(recall) / len(recall)
avg_pre = sum(pre) / len(pre)
print(f"Accuracy = {acc}")
print(f"Avg accuracy = {avg_accuracy}")
print(f"F1 = {f1}")
print(f"Avg F1 = {avg_f1}")
print(f"pre = {pre}")
print(f"Avg pre = {avg_pre}")
print(f"recall = {recall}")
print(f"Avg recall = {avg_recall}")
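
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original pipeline): one possible
# way to build the feature matrix x and label vector y consumed above from
# the vcftools --012 output. The phenotype file 'path/phenotypes.csv' and its
# 'sample'/'label' columns are hypothetical placeholders.

import pandas as pd

# PREFIX.012 holds one tab-separated row per sample; the first column is a
# row index, so drop it. vcftools codes missing genotypes as -1; impute or
# filter those before modelling if your data contains them.
x = pd.read_csv('path/out.012', sep='\t', header=None).drop(columns=0)
samples = pd.read_csv('path/out.012.indv', header=None)[0]
x.index = samples
x.columns = [f'snp_{i}' for i in range(x.shape[1])]  # placeholder SNP names

# Align labels to the sample order of the genotype matrix; y is a plain
# numpy array so that y[train_index] indexes positionally in the CV loop.
labels = pd.read_csv('path/phenotypes.csv').set_index('sample')
y = labels.loc[samples, 'label'].to_numpy()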