Intro

This script produces all results and figures in the manuscript ‘A novel method for using RNA-seq data to identify imprinted genes in social Hymenoptera with multiply mated queens’. It is based on the data from Li et al 2014 manuscript ‘Caste-specific RNA editomes in the leaf-cutting ant Acromyrmex echinatior’. The raw sequences can be downloaded from the NCBI GEO repositories using the accession GSE51576 (or visit https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE51576).

Qiye Li and Zongji Wang of BGI conducted the processing of the raw RNA and DNA data, to output a table of SNPs where the relative frequency of any given SNP-allele differed significantly between a paired DNA and RNA sample. This output table is used as the input here. Queries regarding the production of that table should be directed to them.

An interactive table of contents can be used to navigate to the different sections of this document, which is split into two broad parts:

  • Part 1: analysing the output of the bioinformatics
  • Part 2: producing thte figures from the sequence data and the ddPCR data

The input files can be found in the ‘input_data’ directory, and are names ‘ASE_data_frame.csv’, ‘compiled_ddPCR_results.csv’, and ‘dilution_data.csv’. This script outputs two tables, ‘table_s4.csv’ and ‘table_s5.csv’, and a pdf containing the figures for the manuscript ‘figures.pdf’.

Part 0: loading packages

All R packages used are loaded here, these are required to be installed in order for the script to run.

library(ggplot2)    #for graphics
library(splitstackshape)    #for table splitting (not sure this is needed anymore...)
library(reshape)    #for dataframe manipulation
library(tidyr) # for dataframe manipulation
## 
## Attaching package: 'tidyr'
## The following objects are masked from 'package:reshape':
## 
##     expand, smiths
library(tidyverse) #for dataframe manipulation
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ readr   1.3.1     ✔ stringr 1.4.0
## ✔ purrr   0.3.3     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks reshape::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::rename() masks reshape::rename()
library(ggpubr) #for better control of panels
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(RColorBrewer) #for easier control of colour

Part 1: sequencing data

Importing the Allele-specific expression dataset:

#IMPORT THE DATASET
ASE<- read.csv('../input_data/ASE_data_frame.csv')

Part 1.1: Generating confidence intervals

#Calculating the AgrestiCoul confidence intervals for the proportions
#Make the columns the correct format for calculations
ASE$Gene <- as.character(ASE$Gene)
ASE$Colony <- as.character(ASE$Colony)
ASE$Caste <- as.character(ASE$Caste)
ASE$CountDNA1 <- as.numeric(ASE$CountDNA1)
ASE$CountRNA1 <- as.numeric(ASE$CountRNA1)
ASE$DNAreadsTotal <- as.numeric(ASE$DNAreadsTotal)
ASE$RNAreadsTotal <- as.numeric(ASE$RNAreadsTotal)

##calculate the confidence intervals for the DNA counts, following Agresti-Coull paper cited in manuscript
X <- as.numeric(ASE$CountDNA1)
n <- as.numeric(ASE$DNAreadsTotal)
p <- ((X + 2) / (n + 4))
Z = 1.96 #1.96 -> 95% confidence interval
Top <- p * (1 - p)
Bottom = n + 4
Fraction <- Top / Bottom
ASE$DNAPropMin <- p - (Z * sqrt(Fraction))
ASE$DNAPropMax <- p + (Z * sqrt(Fraction))

#same with the RNA:
##Calculate the observed RNA Confidence Intervals, Agresti-Coull Method.
X <- as.numeric(ASE$CountRNA1)
n <- as.numeric(ASE$RNAreadsTotal)
p <- ((X + 2) / (n + 4))
Z = 1.96
Top <- p * (1 - p)
Bottom = n + 4
Fraction <- Top / Bottom
ASE$RNAPropMin <- p - (Z * sqrt(Fraction))
ASE$RNAPropMax <- p + (Z * sqrt(Fraction))

Part 1.2: Generating imprinting predictions

We proceed through the tests in order of increasing alpha (here we use a to denote alpha):

1 = complete patrigenic expression (a = 0);

2 = biased patrigenic expression (a = 0.2);

3 = biased matrigenic expression (a = 0.8);

4 = complete patrigenic expression (a = 1).

This could likely be shortened to a loop, but has not been done so here. For each value of a, we conduct the combinatorial tests with 3 values of q (0,0.5,1), which denote the queen genotype (Allele proportion of queen).

Part 1.2.1: alpha = 0 ie. exclusive patrigenic expression

a = 0 #set the maternal bias, 0 is complete paternal bias

#make sure the columns are defined as they should be
ASE$DNAPropMin<- as.numeric(as.character(ASE$DNAPropMin)) 
ASE$DNAPropMax<- as.numeric(as.character(ASE$DNAPropMax))
ASE$RNAPropMax<- as.numeric(as.character(ASE$RNAPropMax))
ASE$RNAPropMin<- as.numeric(as.character(ASE$RNAPropMin))

### RUN TESTS
    ##1. Patrigenic expression, queen homozygous for alternative allele.
q = 0 #queen has DNA allele frequency of 0

##Generate predictions from the DNA proportions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower (this is important for the bias section, but makes no difference here)

ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

    ##Does observed RNA confidence interval  with expected? (and DNA must be lower than 0.5)
ASE$Pat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) &
      (ASE$DNAPropMin <= 0.5)

    ##2. Patrigenic expression, heterozygous
q = 0.5

    ##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval with expected (and does DNA fall between 0.75  and 0.25)
ASE$Pat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)

##3. Patrigenic expression, homozygous 2
q = 1

    ##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

    ##Does obsRNA confidence interval with expected (and does DNA fall between 0.5 and 1)
ASE$Pat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

Part 1.2.2: alpha = 0.2 ie. biased patrigenic expression

This section adds columns for biased patrigenic expression at alpha = 0.2

#BIASED PATRIGENIC EXPRESSION
a = 0.2 #set the maternal bias, 0.2 is 80% from the Patrigene

##1. Patrigenic expression, homozygous for alternative allele.
q = 0 #queen has DNA = 0

##Generate predictions from the DNA proportions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower limit
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper limit

##Does observed RNA confidence interval  with expected? (and DNA must be lower than 0.5)
ASE$Biased_Pat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) &
  (ASE$DNAPropMin <= 0.5)

##2. Patrigenic expression, heterozygous
q = 0.5

##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval with expected (and does DNA fall between 0.75  and 0.25)?
ASE$Biased_Pat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)

##3. Patrigenic expression, homozygous 2
q = 1

##Generate predictions
ExpRNAmin_complete<- (2*ASE$DNAPropMin - q)
ExpRNAmax_complete<- (2*ASE$DNAPropMax - q)
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

    ##Does obsRNA confidence interval with expected (and does DNA fall between 0.5 and 1)
ASE$Biased_Pat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

Note the inflection at x = 0.5.

Part 1.2.3: alpha = 0.8 ie. biased matrigenic expression

#BIASED MATRIGENIC EXPRESSION, alpha = 0.8
a = 0.8 #set the maternal bias, 0.8 is maternal bias 

  ##1 Matrigenic expression, homozygous 1
q = 0
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected, and DNA must be less than 0.5
ASE$Biased_Mat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.5)


## 2 Matrigenic expression, heterozygous
q = 0.5
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected, and DNA must be between 0.25 and 0.75
ASE$Biased_Mat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)


## 3 Matrigenic expression, homozygous2
q = 1
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected.
ASE$Biased_Mat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

Part 1.2.4: alpha = 1, ie. complete matrigenic expression

#Complete MATRIGENIC EXPRESSION, alpha = 1
a = 1 #set the maternal bias, 0.8 is maternal bias 

##1. Matrigenic expression, homozygous 1
q = 0
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected, and DNA must be less than 0.5
ASE$Mat1<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.5)


##2. Matrigenic expression, heterozygous
q = 0.5
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected.
ASE$Mat2<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 0.75) & (0.25 <= ASE$DNAPropMax)


##3. Matrigenic expression, homozygous2
q = 1
ExpRNAmin_complete<- q
ExpRNAmax_complete<- q
ExpRNAmin_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMin - q)
ExpRNAmax_incomplete<- (a*q) + (1-a)*(2*ASE$DNAPropMax - q)

ASE$ExpRNAmin <-pmin(ExpRNAmin_complete, ExpRNAmin_incomplete) #take the minimum of the two to define lower
ASE$ExpRNAmax <-pmax(ExpRNAmax_complete, ExpRNAmax_incomplete) #take the maximum of the two to define the expected upper

##Does obsRNA confidence interval  with expected.
ASE$Mat3<- (ASE$RNAPropMin <= ASE$ExpRNAmax) & (ASE$ExpRNAmin <= ASE$RNAPropMax) & 
  (ASE$DNAPropMin <= 1) & (0.5 <= ASE$DNAPropMax)

The table ASE now has logical values representing whether the RNA vs DNA proportions are consistent with patrigenic or matrigenic imprinting, at all three possibilities for the queen genotype, and at complete silencing and biased expression.

Part 1.3: Combinatorial analysis, testing consistency with imprinting predictions

The imprinting analysis is built on the assumptions that all samples will be imprinted in the same direction, and that samples within a colony will share a queen (ie. common queen genotype). The tables output were used to build the tables that report imprinting results in the supplementary info.

ASE$Chr_Locus<- paste(ASE$Chr, ASE$Locus, sep = '_') #make a column to identify the loci across dif. chr, in case there are repeating loci in different chromosomes

# Run combinatorial tests for each of the sets of predictions above
test_results_df<- ASE %>% 
  group_by(Chr_Locus,Gene, Colony) %>% #for each locus, within each colony
  summarise(Complete_Paternal = all(Pat1 == T)|all(Pat2 == T)|all(Pat3 == T),  #did they all follow Patrigene predictions 1 OR all follow 2 OR all follow 3
            Biased_Paternal = all(Biased_Pat1 == T)|all(Biased_Pat2 == T)|all(Biased_Pat3 == T), #same for other alpha values
            Biased_Maternal = all(Biased_Mat1 == T)|all(Biased_Mat2 == T)|all(Biased_Mat3 == T), 
            Complete_Maternal = all(Mat1 == T)|all(Mat2 == T)|all(Mat3 == T)) %>%
  group_by(Chr_Locus, Gene) %>% #then across colonies (only grouping by locus)
  summarise(Complete_Paternal = all(Complete_Paternal == T), Biased_Paternal = all(Biased_Paternal == T),  #were all colonies consistent with paternal
            Biased_Maternal = all(Biased_Maternal == T), Complete_Maternal = all(Complete_Maternal == T)) #were all colonies consistent with maternal

#find those loci where all samples were consistent with different imprinting scenarios, by subsetting from the tables just generated
paternal_complete<- subset(test_results_df, Complete_Paternal == TRUE)
paternal_biased<- subset(test_results_df, Biased_Paternal == TRUE)
maternal_biased<- subset(test_results_df, Biased_Maternal == TRUE)
maternal_complete<- subset(test_results_df, Complete_Maternal == TRUE)

# pull out the complete data for each of the imprinting scenarios
complete_paternal_data<- unique(subset(ASE, Chr_Locus %in% paternal_complete$Chr_Locus))
complete_maternal_data<- unique(subset(ASE, Chr_Locus %in% maternal_complete$Chr_Locus))
biased_paternal_data<- unique(subset(ASE, Chr_Locus %in% paternal_biased$Chr_Locus))
biased_maternal_data<- unique(subset(ASE, Chr_Locus %in% maternal_biased$Chr_Locus))

#simplify so it's easy to read
output_table_paternal_complete<- unique(complete_paternal_data[c('Chr','Locus','Gene')])
output_table_maternal_complete<- unique(complete_maternal_data[c('Chr','Locus','Gene')])
output_table_paternal_biased<- unique(biased_paternal_data[c('Chr','Locus','Gene')])
output_table_maternal_biased<- unique(biased_maternal_data[c('Chr','Locus','Gene')])

#then can be output to tables if required 
#write.table(file = 'paternal_output_complete.csv', output_table_paternal_complete, quote = F, row.names = F, sep = ';')
#write.table(file = 'paternal_output_biased.csv', output_table_paternal_biased, quote = F, row.names = F, sep = ';')
#write.table(file = 'maternal_output_biased.csv', output_table_maternal_biased, quote = F, row.names = F, sep = ';')
#write.table(file = 'maternal_output_complete.csv', output_table_maternal_complete, quote = F, row.names = F, sep = ';')

Part 1.4: Results reported in manuscript

This section generates the results reported in the manuscript text regarding the number of loci under different scenarios

#numbers for manuscript
# how many loci/genes/etc total? and for different imprinting scenarios?
# number of loci overall: 
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Biased_Paternal == TRUE | Biased_Maternal == TRUE | Complete_Maternal == TRUE)$Chr_Locus))
## [1] 92
#number of genes overall:
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Biased_Paternal == TRUE | Biased_Maternal == TRUE | Complete_Maternal == TRUE)$Gene))
## [1] 43
# number of loci overall complete silencing: 
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Complete_Maternal == TRUE)$Chr_Locus))
## [1] 63
#number of genes overall:
length(unique(subset(test_results_df, Complete_Paternal == TRUE | Complete_Maternal == TRUE)$Gene))
## [1] 34
# number of loci for complete paternal:
length(unique(subset(test_results_df, Complete_Paternal == TRUE)$Chr_Locus))
## [1] 47
# number of genes complete paternal:
length(unique(subset(test_results_df, Complete_Paternal == TRUE)$Gene))
## [1] 31
# number of loci for biased paternal:
length(unique(subset(test_results_df, Biased_Paternal == TRUE)$Chr_Locus))
## [1] 59
# number of genes biased paternal:
length(unique(subset(test_results_df, Biased_Paternal == TRUE)$Gene))
## [1] 36
# number of loci complete maternal:
length(unique(subset(test_results_df, Complete_Maternal == TRUE)$Chr_Locus))
## [1] 25
# number of genes complete maternal:
length(unique(subset(test_results_df, Complete_Maternal == TRUE)$Gene))
## [1] 10
# number of loci biased maternal:
length(unique(subset(test_results_df, Biased_Maternal == TRUE)$Chr_Locus))
## [1] 55
# number of genes biased maternal:
length(unique(subset(test_results_df, Biased_Maternal == TRUE)$Gene))
## [1] 24

Part 1.5: Tables, figs and results for supplementary info

Part 1.5.1: supplementary table S4: how many loci per gene/scaffold passed for different values of alpha, and how many loci were there in total?

df_0<- output_table_paternal_complete #simpler names for dataframes df_0 means alpha = 0, df_02 means alpha = 0.2, etc.
df_02<- output_table_paternal_biased
df_08<- output_table_maternal_biased
df_1<- output_table_maternal_complete

#Run for the varying levels of alpha (imprinting bias). 
## alpha = 0
df_0<- df_0 %>% #for alpha = 0 table
  group_by(Gene, Chr) %>% #group by each gene on each chromosome
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_'))) #how many loci were there, and make a gene_Chr variable

ASE_subset_0<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>% #create the same gene_Chr variable for merging
  subset(Gene_Chr %in% df_0$Gene_Chr) %>% #subset to only those genes that passed at alpha = 0
  group_by(Gene_Chr) %>% #for each gene
  summarise(count = length(unique(Locus))) #how many loci in total

df_0<- merge(df_0, ASE_subset_0, by = 'Gene_Chr') #merge the two

df_0$alpha_0 <- df_0$count.x #how many showed imprinting?
df_0$total <- df_0$count.y #how many were there?

# This basic structure is repeated for the different alpha values
## alpha = 0.2
df_02<- df_02 %>%
  group_by(Gene, Chr) %>%
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_')))

ASE_subset_02<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>%
  subset(Gene_Chr %in% df_02$Gene_Chr) %>%
  group_by(Gene_Chr) %>%
  summarise(count = length(unique(Locus)))
df_02 <- merge(df_02, ASE_subset_02, by = 'Gene_Chr')

df_02$alpha_02 <- df_02$count.x
df_02$total <- df_02$count.y

## alpha = 0.8
df_08<- df_08 %>%
  group_by(Gene, Chr) %>%
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_')))

ASE_subset_08<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>%
  subset(Gene_Chr %in% df_08$Gene_Chr) %>%
  group_by(Gene_Chr) %>%
  summarise(count = length(unique(Locus)))
df_08 <- merge(df_08, ASE_subset_08, by = 'Gene_Chr')

df_08$alpha_08 <- df_08$count.x
df_08$total <- df_08$count.y

## alpha = 1
df_1<- df_1 %>%
  group_by(Gene, Chr) %>%
  summarise(count= length(Locus), Gene_Chr = unique(paste(Gene, Chr, sep = '_')))

ASE_subset_1<- ASE %>% 
  mutate(Gene_Chr = paste(ASE$Gene, ASE$Chr, sep = '_')) %>%
  subset(Gene_Chr %in% df_1$Gene_Chr) %>%
  group_by(Gene_Chr) %>%
  summarise(count = length(unique(Locus)))
df_1 <- merge(df_1, ASE_subset_1, by = 'Gene_Chr')
df_1$alpha_1 <- df_1$count.x
df_1$total <- df_1$count.y

#select the useful columns
df_0 <- df_0[,c(2,3,6,7)]
df_02 <- df_02[,c(2,3,6,7)]
df_08 <- df_08[,c(2,3,6,7)]
df_1 <- df_1[,c(2,3,6,7)]

#create an output table by merging the outputs from the different alpha tables
df_out<- merge(df_0, df_02, by = c('Gene','Chr','total'), all.x = T, all.y = T)
df_out<- merge(df_out, df_08, by = c('Gene','Chr','total'), all.x = T, all.y = T)
df_out<- merge(df_out, df_1, by = c('Gene','Chr','total'), all.x = T, all.y = T)
df_out[is.na(df_out)]<- 0 #replace the nas with 0s
df_out<- df_out[c(1,2,4:7,3)] #take the interesting columns 

#can be saved to table:
write.table(df_out, '../output/table_S4.csv',sep = ';', quote = F, row.names = F)
#head(df_out) #print the merged output

Part 1.5.2: supplementary table S5: sample consistency

# How many samples were consistent with imprinting across all the loci with ASE? This is the same table as test_results_df, but with number of samples
test_results_df_numbers<- ASE %>% 
  group_by(Chr_Locus,Colony) %>% # for each locus and colony 
  summarise(Complete_Paternal = max(sum(Pat1 == T), sum(Pat2 == T), sum(Pat3 == T)), #take the maximum total of samples that passed the tests, 3 if colony is consistent with imprinting
            Biased_Paternal = max(sum(Biased_Pat1 == T), sum(Biased_Pat2 == T), sum(Biased_Pat3 == T)), #and for other values of alpha
            Biased_Maternal = max(sum(Biased_Mat1 == T), sum(Biased_Mat2 == T), sum(Biased_Mat3 == T)), 
            Complete_Maternal = max(sum(Mat1 == T), sum(Mat2 == T), sum(Mat3 == T))
            ) %>%
  group_by(Chr_Locus) %>% #then for each locus
  summarise(Complete_Paternal = sum(Complete_Paternal), Biased_Paternal = sum(Biased_Paternal), #sum up each of the values to get total for matrigenic and patrigenic (9 if locus consistent)
            Biased_Maternal = sum(Biased_Maternal), Complete_Maternal = sum(Complete_Maternal))

## how many genes with only 1 locus
loci<- ASE %>%
  group_by(Gene) %>%
  summarise(loci = length(unique(Locus))) #summarise by number of loci
one_locus<- subset(loci, loci == 1) #take only those with  1
length(one_locus$Gene) #how many are there
## [1] 43
### figuring out how many samples were consistent/inconsistent for each locus
any_imprinted<- subset(test_results_df_numbers, Complete_Paternal == 9 | Biased_Paternal == 9 | Biased_Maternal == 9 | Complete_Maternal == 9  ) #loci where all 9 were consistent with one of the imprinting scenarios

# find out all loci for the gene
ASE<- ASE %>%
  mutate(Chr_Gene = paste(Chr, Gene, sep = '_')) #make a scaffold by gene column (for those cases where the same gene is split over scaffolds)
  
gene_list<- ASE %>%
  subset(Chr_Locus %in% any_imprinted$Chr_Locus) #make a list of all the genes where 1 locus imprinted

any_locus_within_gene_imprinted<- ASE %>%
  subset(Chr_Gene %in% gene_list$Chr_Gene) #get a subset of ASE for all genes where ≥1 locus imprinted (this has data for all the samples)

#how many samples were inconsistent with imprinting for each alpha?
inconsistent_samples<- test_results_df_numbers %>%
  subset(Chr_Locus %in% any_locus_within_gene_imprinted$Chr_Locus) %>%
  mutate(Complete_Paternal = 9 - Complete_Paternal, Biased_Paternal = 9 - Biased_Paternal, Biased_Maternal = 9 - Biased_Maternal, Complete_Maternal = 9 - Complete_Maternal)

ASE_simple<- ASE[c('Chr_Locus','Gene','Chr','Locus')] #make a readable ASE table
df<- unique(merge(ASE_simple, inconsistent_samples, by = 'Chr_Locus')[-1]) #merge so we have the gene names

#make summary table about inconsistent samples, how many genes where x samples were inconsistent?
summary_inconsistent_samples<- 
  df %>%
  group_by(Gene, Chr) %>%
  summarise(alpha_0 = sum(Complete_Paternal), alpha_02 = sum(Biased_Paternal), alpha_08 = sum(Biased_Maternal),alpha_1 = sum(Complete_Maternal)) %>%
  pivot_longer(cols = starts_with('alpha'), values_to = "Samples") %>%
  group_by(Samples) %>%
  summarise(alpha_0 = sum(name == 'alpha_0'),alpha_02 = sum(name == 'alpha_02'),alpha_08 = sum(name == 'alpha_08'),alpha_1 = sum(name == 'alpha_1'))

write.csv(df, '../output/table_S5.csv', quote =F, row.names = F)

Part 1.5.3 supplementary fig S1

for_plot<- summary_inconsistent_samples %>%
  pivot_longer(cols = starts_with('alpha'), values_to = 'count') %>%
  mutate(name = parse_number(name)) %>%
  mutate(name = case_when(name == 8 ~ 0.8,
                          name == 2 ~ 0.2,
                          name == 0 ~ 0,
                          name == 1 ~ 1
                          )) %>%
  subset(Samples != 0)
for_plot<- unique(for_plot)

p <- ggplot(for_plot, aes(x = Samples, y = count, group = name, colour = as.factor(name)))
p <- p + scale_x_log10()
p <- p + geom_point(size = 4, alpha = 0.6, position = position_jitter(height = 0, width = 0.01))
p <- p + theme_minimal()
p <- p + scale_colour_viridis_d(name = 'Alpha')
p <- p + xlab("Samples inconsistent within gene") + ylab("Count")
p <- p + labs(title = 'Figure S1') 

#pdf('Fig. S1.pdf', useDingbats = F)
p

#dev.off()

Part 1.5.4: Which samples did not fit with predictions and caused genes to be rejected?

#Figuring out which samples were the odd ones out, was it ones with ASE?
### Calclate prob(ASE) for all samples again using Fisher test, as this is not in the table imported. 

#define the fisher test, take the 4 values, construct a matrix and then run the fisher test (unless there is na, then return 'error')
myfisher<- function(x){
  table<- matrix(as.numeric(c(x['CountDNA1'],x['CountDNA2'],x['CountRNA1'],x['CountRNA2'])), ncol=2, byrow=T)
  if(any(is.na(table))) p <- "error" else p <- fisher.test(table)$p.value
  return(p)
}

#run the test for each locus
ASE$p <- apply(ASE, 1, myfisher)

#set the p-value to numeric
ASE$p<- as.numeric(ASE$p)
## Warning: NAs introduced by coercion
ASE$adjusted_p<- p.adjust(ASE$p, method='BH', n = length(ASE$p)) #Adjust the p-value using bonferroni

#for each imprinting scenario, were ASE and consistency with the model correlated? (Also use fisher test)
#for complete paternal
DF<- ASE %>% 
  select(Chr, Locus, Colony, Caste, Gene, starts_with('Pat'), p, adjusted_p) %>% #select the useful columns
  mutate(ASE = (adjusted_p < 0.05)) %>% #significant or not?
  group_by(Colony, Gene, Chr, Locus) %>% #group by sample and locus
  mutate(maternal_genotype_comp_pat = which.max(c(sum(Pat1), sum(Pat2), sum(Pat3)))) %>% #which queen genotype was closest to passing?
  group_by(Caste, add = T) %>% #add caste to grouping
  mutate(odd_one_out = !case_when(maternal_genotype_comp_pat == 1 ~ (Pat1 == T), #if homozygous 1, is pat1 true?
                                  maternal_genotype_comp_pat == 2 ~ (Pat2 == T), #pat 2 for heterozygous
                                  maternal_genotype_comp_pat == 3 ~ (Pat3 == T)) #pat 3 for homozygous 2?
)

  table<- matrix(c(
    nrow(subset(DF, ASE == T & odd_one_out == T)), nrow(subset(DF, ASE == T & odd_one_out == F)), nrow(subset(DF, ASE == F & odd_one_out == T)), nrow(subset(DF, ASE == F & odd_one_out == F))), ncol=2, byrow=T) #construct matrix for fisher test
  
p <- fisher.test(table)$p.value #conduct test
p #print p value
## [1] 4.601182e-25
#this process is repeated for the different levels of alpha
#for biased paternal
DF<- ASE %>% 
    select(Chr, Locus, Colony, Caste, Gene, starts_with('Biased_P'), p, adjusted_p) %>%
    mutate(ASE = (adjusted_p < 0.05)) %>%
    group_by(Colony, Gene, Chr, Locus) %>%
    mutate(maternal_genotype_comp_pat = which.max(c(sum(Biased_Pat1), sum(Biased_Pat2), sum(Biased_Pat3)))) %>% 
    group_by(Caste, add = T) %>%
    mutate(odd_one_out = !case_when(maternal_genotype_comp_pat == 1 ~ (Biased_Pat1 == T),
                                    maternal_genotype_comp_pat == 2 ~ (Biased_Pat2 == T),
                                    maternal_genotype_comp_pat == 3 ~ (Biased_Pat3 == T))
    )
  
  table<- matrix(c(
    nrow(subset(DF, ASE == T & odd_one_out == T)), nrow(subset(DF, ASE == T & odd_one_out == F)), nrow(subset(DF, ASE == F & odd_one_out == T)), nrow(subset(DF, ASE == F & odd_one_out == F))), ncol=2, byrow=T)
  
p <- fisher.test(table)$p.value
p
## [1] 1.460279e-29
# for complete maternal
DF<- ASE %>% 
  select(Chr, Locus, Colony,Caste, Gene, starts_with('Mat'), p, adjusted_p) %>%
  mutate(ASE = (adjusted_p < 0.05)) %>%
  group_by(Colony, Gene, Chr, Locus) %>%
  mutate(maternal_genotype_comp_pat = which.max(c(sum(Mat1), sum(Mat2), sum(Mat3)))) %>% 
  group_by(Caste, add = T) %>%
  mutate(odd_one_out = !case_when(maternal_genotype_comp_pat == 1 ~ (Mat1 == T),
                                  maternal_genotype_comp_pat == 2 ~ (Mat2 == T),
                                  maternal_genotype_comp_pat == 3 ~ (Mat3 == T))
  )

table<- matrix(c(
  nrow(subset(DF, ASE == T & odd_one_out == T)), nrow(subset(DF, ASE == T & odd_one_out == F)), nrow(subset(DF, ASE == F & odd_one_out == T)), nrow(subset(DF, ASE == F & odd_one_out == F))), ncol=2, byrow=T)

p <- fisher.test(table)$p.value
p
## [1] 9.895435e-19
#for biased maternal
DF<- ASE %>% 
  select(Chr, Locus, Colony, Caste, Gene, starts_with('Biased_M'), p, adjusted_p) %>%
  mutate(ASE = (adjusted_p < 0.05)) %>%
  group_by(Colony, Gene, Chr, Locus) %>%
  mutate(maternal_genotype_comp_pat = which.max(c(sum(Biased_Mat1), sum(Biased_Mat1), sum(Biased_Mat1)))) %>% 
  group_by(Caste, add = T) %>%
  mutate(odd_one_out = !case_when(maternal_genotype_comp_pat == 1 ~ (Biased_Mat1 == T),
                                  maternal_genotype_comp_pat == 2 ~ (Biased_Mat1 == T),
                                  maternal_genotype_comp_pat == 3 ~ (Biased_Mat1 == T))
  )

table<- matrix(c(
  nrow(subset(DF, ASE == T & odd_one_out == T)), nrow(subset(DF, ASE == T & odd_one_out == F)), nrow(subset(DF, ASE == F & odd_one_out == T)), nrow(subset(DF, ASE == F & odd_one_out == F))), ncol=2, byrow=T)

p <- fisher.test(table)$p.value
p
## [1] 0.02810963
## which samples were inconsistent? (gives names) 
odd_samples<- ASE %>% 
  select(Chr, Locus, Colony, Caste, Gene, starts_with('Pat'), p, adjusted_p) %>% #select the useful columns
  mutate(ASE = (adjusted_p < 0.05)) %>% #logical: do we see ASE?
  group_by(Colony, Gene, Chr, Locus) %>% #group by colony at each locus
  mutate(maternal_genotype_comp_pat = which.max(c(sum(Pat1), sum(Pat2), sum(Pat3)))) %>% #which queen genotype were we closest to
  group_by(Caste, add = T) %>% #now group by each individual sample by adding caste
  mutate(odd_one_out = !case_when(maternal_genotype_comp_pat == 1 ~ (Pat1 == T), #which sample did not fit with the predictions for each genotype
                                  maternal_genotype_comp_pat == 2 ~ (Pat2 == T),
                                  maternal_genotype_comp_pat == 3 ~ (Pat3 == T))) %>% 
  group_by(Gene, Chr, Locus) %>% #remove caste grouping
  mutate(count_odd = sum(odd_one_out)) %>% #count how many samples were odd
  subset(odd_one_out == T) %>% #take only the samples were that were 'odd'
  select(Chr, Locus, Gene, Colony, Caste) %>% #select the informative columns
  group_by(Chr, Locus, Gene, Colony, Caste) #group to make it easier to read
  
#write.csv(odd_samples, 'odd_samples.csv')
odd_samples #print which samples were inconsistent, no obvious patterns seen.
## # A tibble: 398 x 5
## # Groups:   Chr, Locus, Gene, Colony, Caste [398]
##    Chr          Locus Gene                   Colony Caste
##    <fct>        <int> <chr>                  <chr>  <chr>
##  1 C1940581       147 Protein_distal_antenna 322    G    
##  2 scaffold185 578430 Copia_protein          322    G    
##  3 scaffold185 578441 Copia_protein          322    G    
##  4 scaffold185 578446 Copia_protein          322    G    
##  5 scaffold185 578486 Copia_protein          322    G    
##  6 scaffold185 578487 Copia_protein          322    G    
##  7 scaffold185 578509 Copia_protein          322    G    
##  8 scaffold185 578588 Copia_protein          322    G    
##  9 scaffold185 578946 Copia_protein          322    G    
## 10 scaffold185 578949 Copia_protein          322    G    
## # … with 388 more rows

Part 2: figures from sequencing and ddPCR data

Aesthetic features of some figures were subsequently edited in adobe suite, title sizes, gaps between figure panels, etc.

df<- read.table("../input_data/compiled_ddPCR_results.csv", header = TRUE, sep=',')
## subset to get female castes
df_diploid<- subset(df, !grepl("M",df$Sample))
df_diploid<- subset(df_diploid, !grepl("NTC", df_diploid$Sample))
df_diploid<- separate(df_diploid, 'Sample', into= c("colony","caste","type"), sep = '_')
df_diploid<- df_diploid[,c(1,2,3,4,9,10,11)]
df_diploid$min <- as.numeric(as.character(df_diploid$min))/100
df_diploid$max <- as.numeric(as.character(df_diploid$max))/100
df_diploid$fraction <- as.numeric(as.character(df_diploid$fraction))/100

#some of alleles inconsistent in the S1, so fix by doing (1- proportion) to give correct 
df_diploid$fraction<- ifelse(df_diploid$Target == 'S1_SNP1', 1- df_diploid$fraction, df_diploid$fraction)
df_diploid$min<- ifelse(df_diploid$Target == 'S1_SNP1', 1- df_diploid$min, df_diploid$min)
df_diploid$max<- ifelse(df_diploid$Target == 'S1_SNP1', 1- df_diploid$max, df_diploid$max)

#make wider for plotting
df_diploid<- 
    df_diploid %>% pivot_wider(names_from = 'type', values_from = c('min','max','fraction'))

#defining colours
colours<- rev(brewer.pal(8, "Set1")[-6])

Part 2.1, Figure 1: Predictions

#shapes on fig are drawn with polygons, which needs a few tables built

#table for polygons
df_polygon_1<- data.frame(queen = 'hetero', DNA = c(0.25, 0.25, 0.5, 0.75, 0.75, 0.5), RNA = c(0.5, 0.4, 0.5, 0.5, 0.6, 0.5))
df_polygon_0<- data.frame(queen = 'homo_0', DNA = c(0, 0.5, 0.5), RNA = c(0,0,0.2))
df_polygon_2<- data.frame(queen = 'homo_1', DNA = c(0.5, 0.5, 1), RNA = c(0.8, 1, 1))
df_polygon_mat<- rbind(df_polygon_0, df_polygon_1, df_polygon_2)

#draw fig for maternal predictions
figure_1a_m<-   ggplot(df_polygon_mat, aes(x=DNA, y= RNA, fill = queen))+   #initiate
    theme_bw()+
    coord_cartesian(ylim=c(0,1), xlim=c(0,1))+  #limits
    geom_abline(intercept=0, slope =1, linetype= "dashed", size = 1.5)+ #null expectation
    geom_polygon(aes(fill = as.factor(queen)), alpha = 0.3) + scale_fill_manual(values = c('green','red','blue'))+ #add polygons according to queen genotype
    annotate("segment", x=0, xend=0.50, y= 0, yend= 0, colour = "green", size = 1.5) + #add homozygous 1 line
    annotate("segment", x=0.25, xend=0.75, y=0.5, yend= 0.5, colour = "red", size = 1.5) +#add heterozygous line
    annotate("segment", x=0.5, xend=1, y= 1, yend= 1, colour = "blue", size = 1.5) +  #add homozygous 2 line
    labs(x="Proportion of SNP 1 in DNA", y= "Proportion of SNP 1 in cDNA") + theme_bw() + theme(legend.position = 'none') + #change labels
    annotate("text",label = "Matrigenic expression", x = 0, y = 1, size = 5, hjust = 0) #title for subfig
  

#Likewise for the patrigenic expectations
#table for polygons  
df_polygon_0<- data.frame(queen = 'homo_0', DNA = c(0, 0.5, 0.5), RNA = c(0,1,0.8))
df_polygon_1<- data.frame(queen = 'hetero', DNA = c(0.25, 0.25, 0.5, 0.75, 0.75, 0.5), RNA = c(0, 0.1, 0.5, 0.9, 1, 0.5))
df_polygon_2<- data.frame(queen = 'homo_1', DNA = c(0.5, 0.5, 1), RNA = c(0, 0.2, 1))
df_polygon_pat<- rbind(df_polygon_0, df_polygon_1, df_polygon_2)

figure_1b_p<-   ggplot(df_polygon_pat, aes(x=DNA, y= RNA, fill = queen))+   
    coord_cartesian(ylim=c(0,1), xlim=c(0,1))+ 
    geom_abline(intercept=0, slope =1, linetype= "dashed", size = 1.5)+
    geom_polygon(aes(fill = as.factor(queen)), alpha = 0.3) + scale_fill_manual(values = c('green','red','blue'))+
    annotate("segment", x=0, xend=0.50, y= 0, yend=1, colour = "green", size = 1.5) +
    annotate("segment", x=0.25, xend=0.75, y= 0, yend=1, colour = "red", size = 1.5) + 
    annotate("segment", x=0.5, xend=1, y= 0, yend= 1, colour = "blue", size = 1.5) + 
    labs(x="Proportion of SNP 1 in DNA", y= "Proportion of SNP 1 in cDNA") + theme_bw() + theme(legend.position = 'none') + 
    annotate("text",label = "Patrigenic expression", x = 0, y = 1, size = 5, hjust = 0)

figure_1 <- ggarrange(figure_1a_m, figure_1b_p, labels = c('A','B')) #2 panels next to each other
figure_1 

Part 2.2, Figure 2: results of seq and ddpcr for the four selected genes

Part 2.2.1, Figure 2a: figure for sequencing data

## SEQUENCING DATA

df_pat<- complete_paternal_data #use paternal data
S1<- subset(df_pat, Chr == "scaffold390" & Locus == 364062) #take the locus we used
MRJP<- subset(df_pat, Chr == "scaffold230" & Locus == 3456) #and 2nd locus

MRJP$RNAProp[7:9]<- 1 - MRJP$RNAProp[7:9] #the RNA is looking at the wrong allele here compared to DNA, so fix 
MRJP$RNAPropMin[7:9]<- 1 - MRJP$RNAPropMin[7:9] #the RNA is looking at the wrong allele here
MRJP$RNAPropMax[7:9]<- 1 - MRJP$RNAPropMax[7:9] #the RNA is looking at the wrong allele here

df_pat<- rbind( MRJP, S1) #make df with both genes
df_pat$Colony<- as.factor(df_pat$Colony) #make sure colony is factor, not numeric

#figure 2: the patrigenic figure, seq
figure_2a_p<- ggplot(df_pat, aes(x=DNAProp, y= RNAProp))+   
  theme_bw()+
  geom_abline(intercept=0, slope =1, linetype= "dashed")+
  geom_polygon(data = df_polygon_pat, aes(x = DNA, y = RNA, fill = as.factor(queen)), alpha = 0.3) + scale_fill_manual(values = c('green','red','blue'))+
  annotate("segment", x=0, xend=0.50, y= 0, yend=1, colour = "green", size = 1.5) +
  annotate("segment", x=0.25, xend=0.75, y= 0, yend=1, colour = "red", size = 1.5) + 
  annotate("segment", x=0.5, xend=1, y= 0, yend= 1, colour = "blue", size = 1.5) + 
  geom_point(size = 3, aes(colour=Colony, shape=Caste)) + 
  geom_errorbar(aes(ymin=RNAPropMin, ymax = RNAPropMax, colour=Colony)) +
  geom_errorbarh(aes(xmin= DNAPropMin, xmax= DNAPropMax, colour=Colony)) + 
  coord_cartesian(ylim=c(0,1), xlim=c(0,1))+ 
  labs(x="Proportion of SNP 1 in DNA", y= "Proportion of SNP 1 in cDNA")+
  facet_wrap(~Gene, ncol = 2) + 
  scale_color_manual(values = colours[c(5,6,7)])



#Repeat for the maternal genes chosen
df_mat<- complete_maternal_data
SETMAR <- subset(df_mat, Chr == "scaffold361" & Locus == 29276)
VIT<- subset(df_mat, Chr == "scaffold527" & Locus == 1627977)
df_mat <- rbind(SETMAR, VIT)
df_mat $Colony<- as.factor(df_mat$Colony)
#figure 2: the matrigenic figure, seq
figure_2a_m<- ggplot(df_mat, aes(x=DNAProp, y= RNAProp))+   
  theme_bw()+
  geom_abline(intercept=0, slope =1, linetype= "dashed")+
  geom_polygon(data = df_polygon_mat, aes(x = DNA, y = RNA, fill = as.factor(queen)), alpha = 0.3) + scale_fill_manual(values = c('green','red','blue'))+
  annotate("segment", x=0, xend=0.50, y= 0, yend= 0, colour = "green", size = 1.5) +
  annotate("segment", x=0.25, xend=0.75, y=0.5, yend= 0.5, colour = "red", size = 1.5) + 
  annotate("segment", x=0.5, xend=1, y= 1, yend= 1, colour = "blue", size = 1.5) + 
  geom_point(size = 3, aes(colour=Colony, shape=Caste)) + 
  geom_errorbar(aes(ymin=RNAPropMin, ymax = RNAPropMax, colour=Colony)) +
  geom_errorbarh(aes(xmin= DNAPropMin, xmax= DNAPropMax, colour=Colony)) + 
  coord_cartesian(ylim=c(0,1), xlim=c(0,1))+ 
  labs(x="Proportion of SNP 1 in DNA", y= "Proportion of SNP 1 in cDNA")+
  facet_wrap(~Gene, ncol = 2) + 
  scale_color_manual(values = colours[c(5,6,7)])

figure_2a_seq<- ggarrange(figure_2a_p, figure_2a_m, ncol = 2, common.legend = T)

Part 2.2.2, Figure 2b: figure for ddPCR data

#repeat for the ddPCR data: 

#take the genes for the paternal panel
df_diploid_p <- subset(df_diploid, Target == 'MRJP3' | Target == "S1_SNP1") #take the patrigenes we looked at
#figure 2b: the patrigenic figure
figure_2b_p<- ggplot(df_diploid_p, aes(x=fraction_DNA, y= fraction_cDNA))+  
  theme_bw()+
  geom_abline(intercept=0, slope =1, linetype= "dashed")+
  geom_polygon(data = df_polygon_pat, aes(x = DNA, y = RNA, fill = as.factor(queen)), alpha = 0.3) + scale_fill_manual(values = c('green','red','blue'))+
  annotate("segment", x=0, xend=0.50, y= 0, yend=1, colour = "green", size = 1.5) +
  annotate("segment", x=0.25, xend=0.75, y= 0, yend=1, colour = "red", size = 1.5) + 
  annotate("segment", x=0.5, xend=1, y= 0, yend= 1, colour = "blue", size = 1.5) + 
  geom_point(size = 3, aes(colour=colony, shape=caste))+
  geom_errorbar(aes(ymin=min_cDNA, ymax = max_cDNA, colour=colony)) +
  geom_errorbarh(aes(xmin= min_DNA, xmax= max_DNA, colour=colony)) + 
  coord_cartesian(ylim=c(0,1), xlim=c(0,1))+ 
  labs(x="Proportion of SNP 1 in DNA", y= "Proportion of SNP 1 in cDNA")+
  facet_wrap(~Target, ncol = 2) +  
  scale_color_manual(values = colours)

df_diploid_m <- subset(df_diploid, Target == 'VIT_SNP1' | Target == "SETMAR") #take the matrigenes we looked at

#figure 2b: the matrigenic figure
figure_2b_m<- ggplot(df_diploid_m, aes(x=fraction_DNA, y= fraction_cDNA))+  
  theme_bw()+
  geom_abline(intercept=0, slope =1, linetype= "dashed")+
  geom_polygon(data = df_polygon_mat, aes(x = DNA, y = RNA, fill = as.factor(queen)), alpha = 0.3) + scale_fill_manual(values = c('green','red','blue'))+
  annotate("segment", x=0, xend=0.50, y= 0, yend= 0, colour = "green", size = 1.5) +
  annotate("segment", x=0.25, xend=0.75, y=0.5, yend= 0.5, colour = "red", size = 1.5) + 
  annotate("segment", x=0.5, xend=1, y= 1, yend= 1, colour = "blue", size = 1.5) + 
  geom_point(size = 3, aes(colour=colony, shape=caste))+ 
  geom_errorbar(aes(ymin=min_cDNA, ymax = max_cDNA, colour=colony)) +
  geom_errorbarh(aes(xmin= min_DNA, xmax= max_DNA, colour=colony)) +
  geom_abline(intercept=0, slope =1, linetype= "dashed")+
  annotate("segment", x=0, xend=0.5, y= 0, yend= 0.0, colour = "green")+
  annotate("segment", x=0.25, xend=0.75, y= 0.50, yend= 0.50, colour = "red")+
  annotate("segment", x=0.50, xend=1, y=1, yend=1, colour= "blue")+ 
  coord_cartesian(ylim=c(0,1), xlim=c(0,1))+ 
  labs(x="Proportion of SNP 1 in DNA", y= "Proportion of SNP 1 in cDNA")+
  facet_wrap(~Target, ncol = 2) + 
  scale_color_manual(values = colours)

figure_2b_ddPCR<- ggarrange(figure_2b_p, figure_2b_m, ncol = 2, common.legend = T)

Part 2.2.3, Figure 2: collating figs 2a and 2b

figure_2<- ggarrange(figure_2a_seq, figure_2b_ddPCR, ncol = 1, labels = c('A','B'))
figure_2

Part 2.3: Figure 3: for the male and queen genotypes

males<- subset(df, grepl('M',Sample)) #take the male samples
males <- separate(males, 'Sample', into= c("colony","caste","type"), sep = '_') #generate sample info from names

#making data proportions rather than percentages
males$min <- as.numeric(as.character(males$min))/100 
males$max <- as.numeric(as.character(males$max))/100
males$fraction <- as.numeric(as.character(males$fraction))/100
male_pools<- subset(males, type == 'DNA') 

# FOR S1, the focal allele for the ddPCR was the other allele, so for this data find the reciprocal of the fraction
male_pools $max <- ifelse(male_pools $Target == 'S1_SNP1', 1 - male_pools $max, male_pools $max)
male_pools $min <- ifelse(male_pools $Target == 'S1_SNP1', 1 - male_pools $min, male_pools $min)
male_pools $fraction <- ifelse(male_pools $Target == 'S1_SNP1', 1 - male_pools $fraction, male_pools $fraction)


#draw the figure for the queen genotypes
figure_3a<- 
ggplot(male_pools, aes(x = colony, y = fraction, fill = colony)) + 
    geom_bar(stat='identity') + geom_errorbar(aes(ymin = min, ymax = max, width = 0.4)) + 
    geom_hline(yintercept = 1, linetype = 'dashed', colour = 'green', size = 1) + 
    geom_hline(yintercept = 0.5, linetype = 'dashed', colour = 'blue', size = 1) + 
    geom_hline(yintercept = 0, linetype = 'dashed', colour = 'red', size = 1) + 
    theme_bw() + 
    scale_fill_manual(values = colours[c(1,2,4,5,6)]) + 
    facet_wrap(~Target, nrow = 1, scales = 'free_x')


#individual males
males_ind<- subset(males, type != 'DNA') #these samples distinguished from the pooled by not having DNA in Type column
males_ind$fraction<- as.numeric(as.character(males_ind$fraction)) #make fraction numeric
males_ind$min<- as.numeric(as.character(males_ind$min)) #make min numeric
males_ind$max<- as.numeric(as.character(males_ind$max)) #same with max

males_ind$max <- ifelse(males_ind$Target == 'S1_SNP1', 1 - males_ind$max, males_ind$max) #As above, reverse the S1 data to make this match the sequencing data 'allele_1'
males_ind$min <- ifelse(males_ind$Target == 'S1_SNP1', 1 - males_ind$min, males_ind$min)
males_ind$fraction <- ifelse(males_ind$Target == 'S1_SNP1', 1 - males_ind$fraction, males_ind$fraction)


#draw fig 3b
figure_3b<- 
ggplot(males_ind, aes(x = interaction(type,colony), y = fraction, fill = colony)) + 
    geom_bar(stat='identity') + geom_errorbar(aes(ymin = min, ymax = max, width = 0.4)) + 
    theme_bw() + theme(axis.text.x = element_blank()) + xlab('Individual male') +
    scale_fill_manual(values = colours[c(3,5,6)]) + 
    facet_wrap(~Target, nrow = 1, scales = 'free_x') 
    
#use ggarrange to draw entirety of fig 3
figure_3 <- ggarrange(figure_3a, figure_3b, nrow = 2,labels = c('A: queen genotypes','B: individual male genotypes'))
figure_3

Part 2.4: Figure 4: the relative concentrations of genes

df<- read.csv('../input_data/dilution_data.csv', header = TRUE, row.names = NULL, sep=',') # read in data

df<- subset(df, Sample!='CONTAMINATED') #remove the contaminated wells
df<- subset(df, Sample!='ntc') #remove the blanks


split<-c(strsplit(as.character(df$Sample),'_', fixed=TRUE)) #split the  samples column to get sample info (could be done with tidyverse, not updated here)
sample_info<- as.data.frame(do.call(rbind, split))
colnames(sample_info)<- c('sample','type','dilution') #rename sample info columns


df<- cbind(sample_info,df) #cbind the sample info and the data
df_simple<- df[c('Target','dilution','Concentration')] #simplify 
df_simple$dilution<- as.numeric(as.character(df_simple$dilution)) #make numeric for axes
df_simple$Concentration<- as.numeric(as.character(df_simple$Concentration)) #make numeric

df_TBP<- subset(df_simple, Target == 'TBP') #take the TBP target (single copy gene)
df_others<- subset(df_simple, Target != 'TBP') #and take the others
colnames(df_TBP)<- c('TBP_Target','dilution','TBP_conc') #rename the columns

df_comparison<- merge(df_others, df_TBP, by= 'dilution', all.x=TRUE) #merge the two

#pdf('dilution_plots.pdf', height = 10, width = 10, useDingbats=FALSE)
figure_4<- ggplot(df_comparison, aes(x=log2(TBP_conc),y=log2(Concentration), colour = Target)) + geom_point(size = 3) + geom_smooth(method= lm, aes(colour =Target)) + theme_bw()
figure_4 <- figure_4 + geom_abline(aes(intercept = log2(1), slope = 1), linetype = 2, colour = 'black', alpha =1, size = 1)
figure_4 <- figure_4 + geom_abline(aes(intercept = log2(2), slope = 1),linetype = 2, colour = 'black', alpha =1, size = 1)+
labs(x="Log2 Concentration of TATA-Box Protein", y= "Log2 Concentration")+
theme(axis.text=element_text(size=15), axis.title=element_text(size = 30, face = 'bold', family = 'Helvetica'), title = element_text(size = 30, face = 'bold', family='Helvetica'), legend.text=element_text(size = 13, family = 'Helvetica'))

figure_4

## Part 2.5: saving figs to pdf

pdf('../output/figs.pdf', height = 8, width = 10)
figure_1
figure_2
figure_3
figure_4
dev.off()
## quartz_off_screen 
##                 2