####transferability analysis scripts
####R codes
####Previous steps in Transferability.sh

##reformat the credibel set file
loci <- fread("GWAS_EUR_published.txt") #the published GWAS loci in European population, with the following columns: rsid, chr, pos, markername, effect allele, non-effect allele, effect allele frequency, effect estimate, standard error of effect estimate, P value in our IVW meta-analysis, P value from the GWAS paper the loci was reported

names(loci) <- c("SNP", "CHR", "BP","MARKERNAME","EUR_EA","EUR_NEA","EUR_EAF","EUR_BETA","EUR_SE","EUR_P","EUR_P_paper") #rename columns

loci <- data.frame(loci)
loci$Pthresh=loci$EUR_P*100 ## NOTE: Established SNP list summary stats to include additional column with P-value threshold (100*pSentinel) for filtering credible set proxy SNPs
loci$N.COJO=NULL
loci$Location=NULL
loci$IMPACT=NULL
loci$Allele=NULL
loci <- loci[!is.na(loci$EUR_P),] #drop rows without valid P value
dim(loci) #number of loci which we are able to access
#[1] 205

##Read in credible set
cred=read.table("eur.gwas.snps.eu.ld.txt", header=T) #EUR credible set file from PLINK, see Transferability.sh
cred$Lead_Proxy="Proxy"
cred$Lead_Proxy=replace(cred$Lead_Proxy, cred$SNP.Lead==cred$SNP.proxy, "Lead") #Assign lead snps 
cred$Lead_Proxy=as.factor(cred$Lead_Proxy) 

loci$Locus <- loci$SNP
cred2=merge(loci[c("SNP", "Locus", "Pthresh")], cred, by.x="SNP", by.y="SNP.Lead", all=T)

#Read in EUR study sumstats and merge with credset to extract credset sumstats and filter based on P
fe_eur <- fread("/SAN/ugi/mdd/trans_ethnic_ma/results/FE_EUR_MDD2_ASDG_qced_rsid.txt.gz") #Read the EUR GWAS sumstats
fe_eur <- fe_eur[,c(1:9,14)]
names(fe_eur) <- c("MarkerName","CHR","BP","EUR_EA","EUR_NEA","EUR_EAF","EUR_BETA","EUR_SE","EUR_P","SNP") #Rename columns, EA effect allele, NEA, non-effect allele, EAF effect allele frequency.
cred2=merge(cred2, fe_eur, by.x=c("CHR", "BP", "SNP.proxy"), by.y=c("CHR", "BP","SNP"), all.x=T)

cred2 <- cred2[cred2$SNP %in% loci$SNP,] #exclude redundant entries
cred2 <- cred2[!is.na(cred2$EUR_P),] #exclude redundant entries


lead=subset(cred2, Lead_Proxy=="Lead")
proxy=subset(cred2, Lead_Proxy=="Proxy")

cred_fin=subset(proxy, proxy$EUR_P<proxy$Pthresh | proxy$EUR_P <5e-100)
cred_fin=rbind(lead,cred_fin)

cred_fin <- cred_fin[cred_fin$SNP %in% loci$SNP,]
length(unique(cred_fin$SNP))
#[1] 205

cred_fin$Pthresh=NULL
cred_fin$SNP.y=NULL
cred_fin$A1=NULL
cred_fin$TEST=NULL
cred_fin$A2=NULL
cred_fin$STAT=NULL
cred_fin$NMISS=NULL

names(cred_fin)[4]="SNP.Lead"
names(cred_fin)[3]="SNP"

cred_fin$CHR_A=NULL
cred_fin$BP_A=NULL

#Calculate tranferability P threshold in each non-European ancestry with the Pf factor
#Pf factor is based on imperical estimation, please see manuscript method section for details
Pf_afr <- 0.008341
Pf_eas <- 0.007378
Pf_sas <- 0.006847
Pf_his <- 0.003147
library(dplyr)
df <- count(cred_fin,Locus)
df$P_thre_afr <- 10^(log(0.05,base=10)- Pf_afr*(df$n-1))
df$P_thre_eas <- 10^(log(0.05,base=10)- Pf_eas*(df$n-1))
df$P_thre_his <- 10^(log(0.05,base=10)- Pf_his*(df$n-1))
df$P_thre_sas <- 10^(log(0.05,base=10)- Pf_sas*(df$n-1))
names(df)[2] <- "N_credset"
cred_fin <- merge(cred_fin,df,all.x=T,order=F)

cred_fin <- cred_fin[order(cred_fin$CHR,cred_fin$BP),]
write.table(cred_fin, "MDD.EUR.credset_final.txt", col.names=T, row.names=F, quote=F, sep= "\t") #save the cred_fin file

## estimate EUR GWAS significant loci in summary statistics of African ancestry

library(data.table)
setwd("/SAN/ugi/mdd/trans_ethnic_ma/results/") #set working directory to project space
cred_fin <- fread("MDD.EUR.credset_final.txt",stringsAsFactors=F) #read in cred_fin
loci <- fread("GWAS_EUR_published.txt") #list of published EUR GWAS loci

fe_afr <- fread("FE_afr_qced.txt") #GWAS summary statistics in African ancestry
cred_fin_afr=merge(cred_fin, fe_afr[,c("MarkerName", "EA", "NEA", "EAF", "BETA", "SE", "P")], by.x="MarkerName", by.y="MarkerName", all.x=T) #merge AFR sumtats with the credible set file
names(cred_fin_afr)[20:25] <- c("afr_EA","afr_NEA","afr_EAF","afr_BETA","afr_SE","afr_P") #rename column names from AFR sumstats
cred_fin_afr <- cred_fin_afr[!is.na(cred_fin_afr$afr_P),] #exclude columns without P values

# access whether EA/NEA match between two populations
EUR_alleles <- paste0(cred_fin_afr$EUR_EA,cred_fin_afr$EUR_NEA) #paste EA and NEA into a new column, to check if EUR and AFR sumstats used the same alleles ad effect alleles
afr_alleles <- paste0(cred_fin_afr$afr_EA,cred_fin_afr$afr_NEA) 
check <- EUR_alleles == afr_alleles
sum(!check) #number of rows where AFR and EUR used different alleles as effect alleles
#[1] 0
length(unique(cred_fin_afr$Locus)) #number of loci available for transferability analysis
#[1] 204

#Check direction of effect sizes
#Mismatched to be assigned as not transferred
cred_fin_afr$b.direction=NA
cred_fin_afr$b.direction=replace(cred_fin_afr$b.direction, cred_fin_afr$EUR_BETA>=0 & cred_fin_afr$afr_BETA>=0, "match")
cred_fin_afr$b.direction=replace(cred_fin_afr$b.direction, cred_fin_afr$EUR_BETA<=0 & cred_fin_afr$afr_BETA<=0, "match")
cred_fin_afr$b.direction=replace(cred_fin_afr$b.direction, cred_fin_afr$EUR_BETA<=0 & cred_fin_afr$afr_BETA>=0, "mismatch")
cred_fin_afr$b.direction=replace(cred_fin_afr$b.direction, cred_fin_afr$EUR_BETA>=0 & cred_fin_afr$afr_BETA<=0, "mismatch")


#Check P values
cred_fin_afr$transferred=NA
cred_fin_afr$transferred=replace(cred_fin_afr$transferred, cred_fin_afr$afr_P < 0.05, "Yes")
cred_fin_afr$transferred=replace(cred_fin_afr$transferred, cred_fin_afr$afr_P >= 0.05, "No")
cred_fin_afr$transferred=replace(cred_fin_afr$transferred, cred_fin_afr$b.direction=="mismatch", "No")


#Calculate expected power using R script

alpha <- 0.05 #set the Type I error rate level for power calculation
N_ctrl <- 161817 #Number of controls in AFR sumstats
N_cas <- 36831 #Number of cases in AFR sumstats

N_tot <- N_cas+N_ctrl
n <- N_tot

# ratio cases controls
r <- N_cas/(N_cas+N_ctrl)

# proportion of exposed controls african
p_ctrl <- cred_fin_afr$afr_EAF
f <- p_ctrl

# # # # beta europeans
b <- cred_fin_afr$EUR_BETA

phi <- r 

POWER<-pchisq(qchisq(alpha,df=1,lower = F), df=1, ncp = 2*f*(1-f)*n*phi*(1-phi)*b^2, lower = F)

sum(POWER,na.rm=T) #sum up power and divide by number of loci/variants for total expected power
#[1] 2376.05
2376.05/204
#[1] 11.6473

cred_fin_afr$power=POWER

cred_fin_afr$powered=NA
cred_fin_afr$powered=replace(cred_fin_afr$powered, cred_fin_afr$power>=0.8, "Yes") #power greater than 80% defined as sufficient statistical power
cred_fin_afr$powered=replace(cred_fin_afr$powered, cred_fin_afr$power<0.8, "No")

write.table(cred_fin_afr, "/SAN/ugi/mdd/trans_ethnic_ma/results/cred_fin_afr.txt", col.names=T, row.names=F, quote=F, sep= "\t") #save the extensive intermediate data frame

## Define transferable loci and ancestry specific loci
## Define transferable loci
my_loci <- unique(cred_fin_afr$Locus) #extract the vector of unique loci available for assessment in AFR ancestry

trans <- c() # create the vector for transferable loci
# loop through each loci to look for transferable ones
for (i in 1:length(my_loci)){
t <- cred_fin_afr[cred_fin_afr$Locus == my_loci[i],]
if(sum(t$transferred=="Yes")>0) {trans <- c(trans,my_loci[i])}
}

## Define specific
spec <- c() # create the vector for ancestry specific loci
#Loop through each loci to look for non-transferable ones

for (i in 1:length(my_loci)){
t <- cred_fin_afr[cred_fin_afr$Locus == my_loci[i],]
if(sum(t$powered=="Yes")>0 & sum(t$afr_P<t$P_thre_afr)==0) {spec <- c(spec,my_loci[i])} #loci with at least one powered variants, while have zero transferable variant will be defined as ancestry specific (i.e., non-transferable)
}

intersect(trans,spec) # check the intersection between trans and spec vectors, expected to be 0
#[1] 0

## Finalize transferability results for afr
loci <- loci[,1:13] #the dataframe with list of significant loci from published European GWAS
loci$Trans_afr <- NA #the final column for transferability result in African population
loci$Trans_afr[loci$SNP %in% trans] <- "Yes"
loci$Trans_afr[loci$SNP %in% spec] <- "No"
write.table(loci,"EUR_loci_transferability_AFR_final.txt",col.names=T, row.names=F, quote=F, sep= "\t")

##Power Adjusted Transferability (PAT) ratio calculation

loci <- fread("GWAS_EUR_published.txt") #list of GWAS significant loci in EUR
afr <- fread("EUR_loci_transferability_AFR_final.txt") #transferablity result with all SNPs available in African sumstats

afr <- afr[afr$Locus %in% loci$Locus,] #only keep loci available in both Africans and Europeans
afr_pat <- aggregate(power ~ Locus, data = afr, max) #retrive SNP with maximum power in each locus
sum(afr_pat$power) #sum of maximum power in each locus
nrow(afr_pat)/sum(afr_pat$power) #number of transferable loci devided by the sum of power is the PAT ratio

