Intro

Reanalysis of the genome formula data with PERMANOVA. The script follows the method as presented in Boezen et al. 2023, as deposited in Zenodo.

Load the data and necessary libraries

Load the experimental data from Wu et al. 2016, only for Nicotiana benthamiana as this is the only host with a range of different inocula genome formulae.

# Load libraries
library(magrittr)
library(utils)
library(plyr)
library(dplyr)
library(vegan)

# Load data
setwd("C:/R_Data")
c.data <- read.csv(file = "Dataset_6_AMV_Wu_2.csv", header = TRUE)  %>%
    as.data.frame()
# head(c.data, 20)

Analysis of data with PERMANOVA

We analyze the data, for simplicity focusing only on the inoculated leaves (sample = 1) in this version of the script. Change the leaf level here in the filtering to analyze higher leaf levels if desired. The selection of data made here is also used for the re-sampling analysis performed after PERMANOVA in the code.

# First the data for each host separately (Host == 1 to 3), but with all four 
# methods still included in the analysis.
c.data.f <- c.data %>%
  filter(sample == 1)

# Generate distances matrix and run PERMANOVA
set.seed(42)
dist.f <- vegdist(c.data.f[,15:17], method = "euclidean") 
per.gf <- adonis2(formula = dist.f ~ ratio_in, data = c.data.f, permutations = 10^4) 
print(per.gf)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 10000
## 
## adonis2(formula = dist.f ~ ratio_in, data = c.data.f, permutations = 10^4)
##          Df SumOfSqs      R2      F Pr(>F)
## ratio_in  1  0.10993 0.05508 0.9909  0.344
## Residual 17  1.88592 0.94492              
## Total    18  1.99584 1.00000
# Check for significant differences in spread between the groups, using a PERMDISP2 procedure
pd.gf <- betadisper(d = dist.f, group = c.data.f$ratio_in, type = "median", bias.adjust = TRUE, 
           sqrt.dist = FALSE, add = FALSE)
print(pd.gf)
## 
##  Homogeneity of multivariate dispersions
## 
## Call: betadisper(d = dist.f, group = c.data.f$ratio_in, type =
## "median", bias.adjust = TRUE, sqrt.dist = FALSE, add = FALSE)
## 
## No. of Positive Eigenvalues: 2
## No. of Negative Eigenvalues: 0
## 
## Average distance to median:
##       1       2       3       4       5       6       7 
## 0.04883 0.06666 0.15405 0.30620 0.21661 0.11263 0.09989 
## 
## Eigenvalues for PCoA axes:
##  PCoA1  PCoA2 
## 1.7324 0.2635
ph.pd.gf <- permutest(pd.gf, pairwise = TRUE, permutations = 10^4)
print(ph.pd.gf)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 10000
## 
## Response: Distances
##           Df  Sum Sq Mean Sq      F N.Perm Pr(>F)
## Groups     6 0.14256 0.02376 0.5198  10000 0.8121
## Residuals 12 0.54852 0.04571                     
## 
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
##            1          2          3          4          5          6      7
## 1            8.1432e-01 5.0585e-01 4.3486e-01 1.0599e-02 1.1519e-01 0.4967
## 2 7.1763e-01            6.2204e-01 4.9285e-01 4.0896e-02 4.3906e-01 0.7456
## 3 4.4183e-01 5.2743e-01            7.4143e-01 7.8822e-01 8.6131e-01 0.8051
## 4 3.9353e-01 4.2640e-01 6.3216e-01            8.6881e-01 6.9313e-01 0.5734
## 5 1.5378e-02 5.4703e-02 7.1481e-01 8.1255e-01            3.9996e-04 0.1461
## 6 1.5333e-01 4.1649e-01 8.0746e-01 6.1510e-01 2.5828e-31            0.9022
## 7 4.3809e-01 6.3828e-01 7.0237e-01 4.9251e-01 1.8874e-01 8.6506e-01
p.adjust(ph.pd.gf$pairwise$permuted, method = "holm")
##        1-2        1-3        1-4        1-5        1-6        1-7        2-3 
## 1.00000000 1.00000000 1.00000000 0.21197880 1.00000000 1.00000000 1.00000000 
##        2-4        2-5        2-6        2-7        3-4        3-5        3-6 
## 1.00000000 0.77702230 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 
##        3-7        4-5        4-6        4-7        5-6        5-7        6-7 
## 1.00000000 1.00000000 1.00000000 1.00000000 0.00839916 1.00000000 1.00000000

GF distance to the inoculum: resampling approach for testing for an inoculum effect

We want to know the mean GF distance to the inoculum. Later, we will resample the data and recalculate these distances. But first we want the actual value for the experimental data. Calculate the GF distance using some simple R code, after creating an arrays with the inoculum GFs (already included for each sample in the data here) and observed GFs in the leaf/tissue selected for analysis.

# Make necessary arrays to calculate the distance to the corresponding inoculum
# per experiment.

gf.data <- c.data.f[,15:17]
gf.inoc <- c.data.f[,6:8]
data_points <- nrow(gf.data)
dist.data <- rep(NA, data_points)

# Calculate the distance of the lesions to the inocula.
for(i in 1:data_points) {
  dist.data[i] = sqrt( sum ( (gf.data[i,] - gf.inoc[i,])^2)  )
}

# Determine and print the mean and SE of the distance metric
mean.dist.exp = mean(dist.data)
sd.dist.exp = sd(dist.data)

print(mean.dist.exp)
## [1] 0.3999638
print(sd.dist.exp)
## [1] 0.2427979

Now we will re-sample the data, randomly assign each observation to an inoculum, and determine the distribution of predicted genome formula distances.

# Set the seed for reproducibility
set.seed(42)

# Make necessary arrays to calculate the distance to the corresponding inoculum
# per experiment.
num.bs <- 10000   # number of times to resample
bs.results <- array(data = NA, dim = c(num.bs, 3))

# gf.data.inc <- df_quinoa %>%
#  filter(type =="lesion") %>%
#  select(rel_RNA1, rel_RNA2, rel_RNA3)

# How many datapoints are there for each replicate
#rep.data <- df_quinoa %>%
#  filter(type =="lesion") %>%
#  select(exp_num)


# Loop for resampling the data
for(j in 1:num.bs) {
  
  # complete the resampled data by adding a random assignment to an experiment
  # for each lesion
  bs.now = sample(x = data_points, size = data_points, replace = FALSE)
  #  bs.gf.data <- cbind(gf.data.inc, bs.now)
  
  # Now calculate the distance to the extant inocula for the three experiments
  bs.dist.data <- rep(NA, data_points)
  
  for(i in 1:data_points) {
    bs.dist.data[i] = sqrt( sum ( (gf.data[i,] - gf.inoc[bs.now[i],])^2)  )
  }

  # Calculate distance and send result to array
  bs.results[j,] = c(j, mean(bs.dist.data), sd(bs.dist.data))
  # print (j)
}

# Describe the results statistically
print(mean(bs.results[,2])) # mean distance of resampled data
## [1] 0.5558361
# Order the resampling results
ord.rs <- order(bs.results[,2])
lo.n <- round(0.005*num.bs)
hi.n <- round(0.995*num.bs)

# Print the 99% confidence interval
bs.results[ord.rs[lo.n],2] # lower CI
## [1] 0.4344655
bs.results[ord.rs[hi.n],2] # Upper CI
## [1] 0.6522136
# How many bootstraps gave equal to or less then the empirical value?
print(length(which(bs.results[,2] <= mean.dist.exp)))
## [1] 5
# plot the results
hist(bs.results[,2], col = "lightblue", main = " ", xlim = c(0.3, 0.7), n = 50,
     xlab = "Genome formula distance", ylab = "Frequency")
lines(x = c(mean.dist.exp, mean.dist.exp), y = c(0, 1000), col = "red", lwd = 2)