#Author: Francisco Pereira Lobo (franciscolobo@gmail.com) - 2024
#Description: compare our findings with VC work

#start from an empty workspace (removes every object returned by ls())
rm(list = ls())

#working directory: must be the directory created by uncompressing
#zenodo_reproductibility.tgz, since every path below is relative to it
setwd("~/projects/evolution_of_complexity/docs/biorx_evolution_of_complexity/version1/dataFiles/reproductibility_raw_data/zenodo_reproductibility/")

#our own results; the RData file is expected to provide the object
#homologous2SUPERFAMILY_VC used further down
load("results/RData/homologous2SUPERFAMILY_VC.RData")

#table from the VC work (columns V1 and V2 are used further down)
VC_data <- read.table(file = "data/metadata/correlations_VC.txt")


#SUPERFAMILY IDs listed in the VC work (column V1 of VC_data)
VC_SUPERFAMILY_IDs <- VC_data$V1

#how many entries in VC work?
#printed explicitly so the count is also shown when this script is executed
#through source(), which does not auto-print top-level values by default
print(length(VC_SUPERFAMILY_IDs))

#SUPERFAMILY IDs from our dataset (BL = Benjamim-Lobo, as a homage):
#the column names of homologous2SUPERFAMILY_VC$y
BL_SUPERFAMILY_IDs <- colnames(homologous2SUPERFAMILY_VC$y)

#IDs found in both experiments
common_IDs <- intersect(VC_SUPERFAMILY_IDs, BL_SUPERFAMILY_IDs)

#population size for the enrichment test: number of IDs present in both
#the VC work and our dataset
pop_size <- length(common_IDs)

#--- ID sets -----------------------------------------------------------------

#IDs flagged as significant in the VC work: rows whose V2 value is
#>= 0.8 or <= -0.8 (V2 is presumably a correlation coefficient - confirm
#against data/metadata/correlations_VC.txt)
significant_VC_IDs <- VC_data$V1[
  ((VC_data$V2 >= 0.8) | (VC_data$V2 <= -0.8))
]

#VC-significant IDs (associated with NCT in the VC study) that are also
#present in our data
shared_significant_VC_IDs <- intersect(significant_VC_IDs, common_IDs)

#BL-significant IDs (associated with NCT in our data) that are also
#present in the VC data
shared_significant_BL_IDs <- intersect(
  homologous2SUPERFAMILY_VC$sig_IDs,
  common_IDs
)

#IDs significant in both BL and VC
intersect_BL_VC_significant_IDs <- intersect(
  shared_significant_BL_IDs,
  shared_significant_VC_IDs
)

#--- counts derived from the sets above --------------------------------------

#number of success states: shared IDs that are significant in VC
num_suc_states <- length(shared_significant_VC_IDs)

#sample size: shared IDs that are significant in BL
sample_size <- length(shared_significant_BL_IDs)

#observed successes: BL-significant IDs that are also VC-significant
successes <- length(intersect_BL_VC_significant_IDs)

#p-value: upper tail of the hypergeometric distribution, i.e. the probability
#of observing at least `successes` VC-significant IDs among the `sample_size`
#BL-significant IDs, given `num_suc_states` VC-significant IDs in a population
#of `pop_size` shared IDs.
#phyper(q, ..., lower.tail = FALSE) returns P(X > q), hence q = successes - 1
#to obtain P(X >= successes).
#The result is stored and printed explicitly so it is not lost when the script
#is executed through source(), which does not auto-print top-level values.
p_value <- phyper(
  q = successes - 1,
  m = num_suc_states,
  n = pop_size - num_suc_states,
  k = sample_size,
  lower.tail = FALSE
)
print(p_value)

