# pre-processing :
# Three alternative ways to turn a VCF into a per-sample genotype matrix:
#
# USING BCFTOOLS -- split multi-allelic records into bi-allelic ones:
#   bcftools norm -m -any path/file.vcf -o path/output.vcf
#
# USING SCIKIT-ALLEL -- load genotype calls into a GenotypeArray:
#   import allel
#   callset = allel.read_vcf('path/output.vcf')
#   gt = allel.GenotypeArray(callset['calldata/GT'])
#
# USING VCFTOOLS -- write a 0/1/2 genotype matrix (note: --out takes an
# output prefix, not a file name; this produces path/out.012 etc.):
#   vcftools --vcf path/my.vcf --012 --out path/out

# processing :
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (f1_score, recall_score, precision_score,
                             confusion_matrix, roc_auc_score, roc_curve,
                             auc, classification_report)

# 4-fold stratified cross-validation.
# x is the feature DataFrame and y the label vector
# (one hedged loading sketch appears at the end of this file).
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')

acc, f1, recall, pre = [], [], [], []
tprs = []   # per-fold TPRs interpolated onto a common FPR grid for the mean ROC
aucs = []
mean_fpr = np.linspace(0, 1, 100)

for fold, (train_index, test_index) in enumerate(cv.split(x, y), 1):
    X_train = x.iloc[train_index]
    y_train = y[train_index]
    X_test = x.iloc[test_index]
    y_test = y[test_index]

    # Oversample the minority class on the training folds only,
    # so the test fold keeps its original class distribution.
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    model = RandomForestClassifier()
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    print("---------------------")
    print(f'For fold {fold}:')
    print(f"Counter of y train before SMOTE = {Counter(y_train)}")
    print(f"Counter of y train after SMOTE = {Counter(y_train_oversampled)}")
    print(f"Counter of y test = {Counter(y_test)}")
    print("classification report:")
    print(classification_report(y_test, y_pred))

    acc.append(model.score(X_test, y_test))
    f1.append(f1_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    pre.append(precision_score(y_test, y_pred))
    print(f'confusion matrix: {confusion_matrix(y_test, y_pred)}')

    # Top 100 features by impurity-based importance
    print("features:")
    feature_names = X_train.columns
    important = pd.Series(model.feature_importances_,
                          index=feature_names).nlargest(100)
    print(important)

    prediction = model.predict_proba(X_test)
    print("predict_proba")
    print(prediction)

    # ROC curve for this fold, using the positive-class probability
    fpr, tpr, _ = roc_curve(y_test, prediction[:, 1])
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=2, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (fold, roc_auc))
    print(f"AUC = {roc_auc_score(y_test, prediction[:, 1])}")

plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f)' % mean_auc, lw=2, alpha=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

avg_accuracy = sum(acc) / len(acc)
avg_f1 = sum(f1) / len(f1)
avg_recall = sum(recall) / len(recall)
avg_pre = sum(pre) / len(pre)
print(f"Accuracy = {acc}")
print(f"Avg accuracy = {avg_accuracy}")
print(f"F1 = {f1}")
print(f"Avg F1 = {avg_f1}")
print(f"pre = {pre}")
print(f"Avg pre = {avg_pre}")
print(f"recall = {recall}")
print(f"Avg recall = {avg_recall}")
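
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original pipeline): one possible
# way to build the feature matrix x and label vector y consumed above from
# the vcftools --012 output. The phenotype file 'path/phenotypes.csv' and its
# 'sample'/'label' columns are hypothetical placeholders.

import pandas as pd

# PREFIX.012 holds one tab-separated row per sample; the first column is a
# row index, so drop it. vcftools codes missing genotypes as -1; impute or
# filter those before modelling if your data contains them.
x = pd.read_csv('path/out.012', sep='\t', header=None).drop(columns=0)
samples = pd.read_csv('path/out.012.indv', header=None)[0]
x.index = samples
x.columns = [f'snp_{i}' for i in range(x.shape[1])]  # placeholder SNP names

# Align labels to the sample order of the genotype matrix; y is a plain
# numpy array so that y[train_index] indexes positionally in the CV loop.
labels = pd.read_csv('path/phenotypes.csv').set_index('sample')
y = labels.loc[samples, 'label'].to_numpy()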