##########################################
##### Multivariate Ratio Analysis (MRA), see Baur H., Leuenberger C. (2011) Systematic Biology 60: 813-825
##### 1. Tutorial: The *Shape PCA* and *Ratio Spectra*
##########################################


# **** Make sure, source and data files are in one and the same folder! ****

rm(list=ls()); ls()  # Tidying up workspace
library(ggplot2) # Use install.packages("ggplot2"), in case the library is not yet installed
library(plyr) # Use install.packages("plyr"), in case the library is not yet installed
source("Shape_PCA_v1.02.R", chdir = TRUE) # Load newest verson of the script

#### Load Dataset
filename <- "Anisopteromalus_data.csv"
dat0 <- read.csv(filename)



#### Using a Shape PCA


#### Example 1 ####

# Use all data
dat <- dat0
X <- dat[,3:21] # Select data variables. One variable, gst.b, is excluded, since it contains artifacts (see Baur et al. 2014, p. 695-696 and Fig. 1)
X <- X[, order(names(X))] # Order variables alphabetically (for better overview, no necessity otherwise)
species <- factor(dat$species) # Grouping variable, here species
locality <- factor(dat$locality) # Variable concerning origin of specimens
no <- dat$no # Individual code number for specimens


# Calculate Shape PCA
SPCA <- ShapePCA(U=X, 
                 npc=4, # Number of shapePCs retained, default setting
                 rpc=1  # Rounding of variance explained by shapePCs, default setting
                 )

SPCA # Print entire function output (just to become familiar with)


#### Using result from the Shape PCA

## Plot variance explained of the first 4 shapePCs by using a screeplot
barplot(SPCA$pc_var, main="Screeplot", xlab="shapePC", ylab="% variance explained")

# Remark:
# Obviously, only the first two shapePCs are significant
# (in a shape PCA, at most the first three contain useful information, 
# hence plotting the screeplot is usually unnecessary).
# If you were interested to see the variance explained of all 19 shapePCs, 
# then redo the analysis by setting npc=19. 
# As you may see, a PCA is indeed an incredibly effective data reduction tool - 
# Karl Pearson, the inventor of the PCA, should have received the Nobel prize!


## Making a scatterplot of shapePC1 and shapePC2 using ggplot2

# Select colours for symbols in scatterplot (colours specified by RGB codes according to https://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3)
colours  <- c("apiovorus"="#6A3D9A", "calandrae"="#1F78B4", "caryedophagus"="#33A02C", "ceylonensis"="black", "cornis"="#FF7F00", "quinarius"="#E31A1C")
# Select shape of symbols
shapes  <- c("apiovorus"=5, "calandrae"=16, "caryedophagus"=6, "ceylonensis"=3, "cornis"=2, "quinarius"=15)
# Specify, how the names should appear in the legend
labels  <- c("ceyl", "apio", "cala", "cary", "corn", "quin")
# Specify the sequence of those names
breaks  <- c("ceylonensis", "apiovorus", "calandrae", "caryedophagus", "cornis", "quinarius")


# Extract isosize, shapePC1 and shapePC2 to be plotted from the ShapePCA function
isosize <- as.numeric(SPCA$isosize)
shapePC1 <- SPCA$PCmatrix[,1] 
shapePC2 <- SPCA$PCmatrix[,2]*-1 # shapePC2 multiplied by -1
    # IMPORTANT NOTE:
    # Multiplying a shapePC by -1 does not have any bearing on its interpretation.
    # It is just used to make it fit to another plot (here to the scatterplot in Baur et al. 2014, p. 695, Fig. 1B)


# Put the numeric variables together a grouping, a locality and a code variable in a data frame
df <- data.frame(isosize, shapePC1, shapePC2, species, locality, no)

## Plot shapePC1 against shapePC2 using ggplot2
# Have a look at the various options to fiddle with the plot!
plot <- ggplot(df, aes(shapePC1, shapePC2, shape = species, color = species)) # Main ggplot2 function
find_hull <- function(df)df[chull(df$shapePC1, df$shapePC2),] # Function used for finding the points for drawing the convex hulls
hulls <- ddply(df, "species", find_hull) # Apply the above function by using the grouping variable 'species'
plot <- plot + geom_polygon(data = hulls, aes(group=species), colour="gray80", fill = NA) # Draw convex hulls around species
plot <- plot + geom_point(size=3) # Adjust the size of points
plot <- plot + theme(aspect.ratio=1) # Determine the aspect ratio of plot
plot <- plot + labs(title="Shape PCA of all 6 Anisopteromalus species") # Main title of plot
plot <- plot + xlab(paste("shape PC1 (",SPCA$pc_var[1],"%)", sep="")) # Label for x-axis; note, the percentage of variance explained by shapePC1 is automatically inserted from the output of the ShapePCA function  
plot <- plot + ylab(paste("shape PC2 (",SPCA$pc_var[2],"%)", sep="")) # Label for y-axis
plot <- plot + scale_shape_manual(values=shapes, labels=labels, breaks=breaks) # Applying values and sequence of symbol shape for the plot and its legend
plot <- plot + scale_colour_manual(values=colours, labels=labels, breaks=breaks) # Applying values and sequence of symbol colours for the plot and its legend
plot <- plot + labs(colour = "SPECIES", shape= "SPECIES") # Title of legend
# plot <- plot + theme(legend.position = c(0.01, 0.18)) # Alternative positioning of legend, e.g. inside plot
# plot <- plot + geom_text(aes(label=no), hjust=1.4, vjust=0) # Labelling of each point using the variable 'no'; great for locating outliers
plot_A <- plot + theme(axis.title.x = element_text(size=14), axis.title.y = element_text(size=14), plot.title = element_text(size=14), legend.text = element_text(face="italic", size=13)) # Various adjustments for the plot. Try out yourself and checkout the various information on ggplot2 in the internet!
plot_A # Print scatterplot (can be done also elsewhere in the script)




## Plot isosize against shapePC1
plot <- ggplot(df, aes(isosize, shapePC1, shape = species, color = species))
# find_hull <- function(df)df[chull(df$isosize, df$shapePC1),]
# hulls <- ddply(df, "species", find_hull)
# plot <- plot + geom_polygon(data = hulls, aes(group=species), colour="gray80", fill = NA)
plot <- plot + geom_point(size=3) + theme(aspect.ratio=1)
plot <- plot + labs(title="Shape PCA of all 6 Anisopteromalus species")
plot <- plot + xlab("isometric size")
plot <- plot + ylab(paste("shape PC1 (",SPCA$pc_var[1],"%)", sep=""))
plot <- plot + scale_shape_manual(values=shapes, labels=labels, breaks=breaks)
plot <- plot + scale_colour_manual(values=colours, labels=labels, breaks=breaks)
plot <- plot + labs(colour = "SPECIES", shape= "SPECIES")
# plot <- plot + theme(legend.position = c(0.01, 0.18))
# plot <- plot + geom_text(aes(label=no), hjust=1.4, vjust=0)
plot_B <- plot + theme(axis.title.x = element_text(size=14), axis.title.y = element_text(size=14), plot.title = element_text(size=14), legend.text = element_text(face="italic", size=13))
plot_B

# Remark:
# The plot of isosize against shapePC1 shows no obvious correlation. 
# Hence, no allometric variation is present.



## Plot isosize against shapePC1
plot <- ggplot(df, aes(isosize, shapePC2, shape = species, color = species))
# find_hull <- function(df)df[chull(df$isosize, df$shapePC1),]
# hulls <- ddply(df, "species", find_hull)
# plot <- plot + geom_polygon(data = hulls, aes(group=species), colour="gray80", fill = NA)
plot <- plot + geom_point(size=3) + theme(aspect.ratio=1)
plot <- plot + labs(title="Shape PCA of all 6 Anisopteromalus species")
plot <- plot + xlab("isometric size")
plot <- plot + ylab(paste("shape PC2 (",SPCA$pc_var[2],"%)", sep=""))
plot <- plot + scale_shape_manual(values=shapes, labels=labels, breaks=breaks)
plot <- plot + scale_colour_manual(values=colours, labels=labels, breaks=breaks)
plot <- plot + labs(colour = "SPECIES", shape= "SPECIES")
# plot <- plot + theme(legend.position = c(0.01, 0.18))
# plot <- plot + geom_text(aes(label=no), hjust=1.4, vjust=0)
plot_C <- plot + theme(axis.title.x = element_text(size=14), axis.title.y = element_text(size=14), plot.title = element_text(size=14), legend.text = element_text(face="italic", size=13))
plot_C

# Remark:
# The plot of isosize against shapePC1 shows obvious correlation. 
# Hence, strong allometric variation is present.




#### The use of the ratio spectra

## Have a look again at the first plot of shapePC1 against shapePC2
plot_A # Print first plot again

# Remark:
# Along shapePC1, species ceyl+cary+quin appear almost perfectly separated
# from the 3 other species, apio+cala+corn. 
# Hence, shapePC1 has a excellent discriminating power and we may ask, 
# what does this axis actually stand for, 
# i.e. what are the most important characters represented by this shapePC?


## So let us now print the PCA Ratio Spectrum for shapePC1
# Here the various default options are explained (below it will only be used in a simplified manner)
pcaRS(U=X, # Numeric matrix or data frame
      pc=1, # ShapePC for which the Ratio Spectrum should be drawn
      sequence=1, # Sequence of characters displayed. When chosen -1, characters are listed in reverse order.
      bootrep=999, # Number of bootstrap replicates
      barcol="#1F78B4", # Colour of bars
      barlwd=4,  # Width of bars
      linecol="black",  # Colour of vertical line
      linelwd=0.5, # Width of vertical line
      labelsize=0.9, # Font size of text labels (labels showing character abbreviations) 
      labelfont=1, # Font type of text labels (1=standard, 2=bold, 3=italics)
      nosize=0.7,  # Font size of number labels
      nofont=1, # Font type of number labels (1=standard, 2=bold, 3=italics)
      maina="PCA Ratio Spectrum for shape PC", # Main title first part
      # mainb=pc,  # Main title middle part (automatic according to number of shapePC chosen)
      mainc="", # Main title last part
      suba="bars = 68% confidence intervals based on ", # Subtitle first part
      # subb=bootrep  # Subtitle middle part (automatic according to number of bootstrap replicates chosen) 
      subc=" bootstrap replicates" # Subtitle last part
      )

# Remark:
# The PCA Ratio Spectrum depicts the characters (represented 
# by their abbreviations) along a vertical line.
# How can the Ratio Spectrum be intepreted?
# From any two characters, a ratio can be calculated. For instance stv.l/eye.b.
# However, only those ratios are really important, which are calculated
# from characters lying far appart in the spectrum, at or near its ends.
# As a consequence, the ratio stv.l/eye.b is entirely irrelevant, 
# while stv.l/ool.l is the most important one.
# In practice, only a very few ratios at the end are important, 
# here stv.l/ool.l and eye.b/ool.l and perhaps also eye.h/ool.l.
# The other ratios can safely be neglected.
# Hence, the PCA Ratio Spectrum shows at a glance the most important information
# in a shapePC and allows for an intuitive interpretation of shape!


# Let us have a look at the PCA Ratio Spectrum for the shape PC2
pcaRS(X, pc=2, sequence=-1) # We use sequence = -1, because shapePC2 was also multiplied by -1

# Remark:
# By using the above example, can you figure out 
# what are the most important ratios concering shape PC2?



# If we are interested in the most allometric direction in the entire data, 
# then we can use the Allometry Ratio Spectrum.
allometryRS(X)

# Remark:
# The interpetation is essentially the same as for the PCA Ratio Spectrum.
# However, we have now identified the most allometric direction in the ENTIRE data
# with ratios msc.l/gst.l, msc.l/ool.l and msc.l/scp.l being the most important ones.
# Not surprisingly, the Allometry Ratio Spectrum is quite similar to the 
# PCA Ratio Spectrum of shapePC2, which was shown above 
# to contain a lot of allometric variation (see plot_C).




#### Interpreting size
# Isosize is a general measure of size, 
# because it is the geometric mean of all variables.
# Isosize we have seen above in plot_B and plot_C, 
# but it can also be plotted using a simple boxplot.

boxplot(isosize~species, main="Isosize of 6 Anisopteromalus species")

# Remark:
# The species apiovorus is by far the largest one, ceylonensis the smallest.





#### Example 2 ####

#### Shape PCA of a subgroup of species (A. calandrae and A. quinarius)

dat <- subset(dat0, locality == "Bamberg" | locality == "Savannah" | locality == "Slough" | locality == "Fresno" | locality == "Michurinsk" | locality == "Moscow")
X <- dat[,3:21] # Select data variables. One variable, gst.b, is excluded, since it contains artifacts (see Baur et al. 2014, p. 695-696 and Fig. 1)
X <- X[, order(names(X))] # Order variables alphabetically (for better overview, no necessity otherwise)
species <- factor(dat$species) # Grouping variable, here species
locality <- factor(dat$locality) # Variable concerning origin of specimens
no <- dat$no # Individual code number for specimens


#### Calculating a Shape PCA

SPCA <- ShapePCA(X)

# Extract isosize, shapePC1 and shapePC2 to be plotted from the ShapePCA function
isosize <- as.numeric(SPCA$isosize)
shapePC1 <- SPCA$PCmatrix[,1] 
shapePC2 <- SPCA$PCmatrix[,2]



## Making a scatterplot of shapePC1 and shapePC2 using ggplot2

# Select colours for symbols in scatterplot for species
colours_s  <- c("apiovorus"="#6A3D9A", "calandrae"="#1F78B4", "caryedophagus"="#33A02C", "ceylonensis"="black", "cornis"="#FF7F00", "quinarius"="#E31A1C")
# Shape of symbols for species
shapes_s  <- c("calandrae"=1, "quinarius"=0)
# Name abbreviation for species
labels_s  <- c("quin", "cala")
# Name sequence for species
breaks_s  <- c("quinarius", "calandrae")


# Shape of symbols for locality
shapes_l  <- c("Bamberg"=5, "Savannah"=1, "Slough"=6, "Fresno"=3, "Michurinsk"=2, "Moscow"=0)
# Name abbreviation for locality
labels_l  <- c("Bamb", "Sava", "Slou", "Fres", "Mich", "Mosc")
# Name sequence for locality
breaks_l  <- c("Bamberg", "Savannah", "Slough", "Fresno", "Michurinsk", "Moscow")


# Put the numeric variables together a grouping, a locality and a code variable in a data frame
df <- data.frame(isosize, shapePC1, shapePC2, species, locality, no)

## Plot shapePC1 against shapePC2 using ggplot2
plot <- ggplot(df, aes(shapePC1, shapePC2, shape = locality, color = species))
find_hull <- function(df)df[chull(df$shapePC1, df$shapePC2),]
hulls <- ddply(df, "species", find_hull)
plot <- plot + geom_polygon(data = hulls, aes(group=species), colour="gray80", fill = NA)
plot <- plot + geom_point(size=3) + theme(aspect.ratio=1)
plot <- plot + labs(title="Shape PCA of A. calandrae and A. quinarius")
plot <- plot + xlab(paste("shape PC1 (",SPCA$pc_var[1],"%)", sep="")) 
plot <- plot + ylab(paste("shape PC2 (",SPCA$pc_var[2],"%)", sep=""))
plot <- plot + scale_shape_manual(values=shapes_l, labels=labels_l, breaks=breaks_l)
plot <- plot + scale_colour_manual(values=colours_s, labels=labels_s, breaks=breaks_s)
plot <- plot + labs(colour = "SPECIES", shape= "LOCALITY")
plot_D <- plot + theme(axis.title.x = element_text(size=14), axis.title.y = element_text(size=14), plot.title = element_text(size=14), legend.text = element_text(face="italic", size=13))
plot_D # Print scatterplot


## Plot isosize against shapePC1, localities highlighted
plot <- ggplot(df, aes(isosize, shapePC1, shape = locality, color = species))
plot <- plot + geom_point(size=3) + theme(aspect.ratio=1)
plot <- plot + labs(title="Shape PCA of A. calandrae and A. quinarius")
plot <- plot + xlab("isometric size")
plot <- plot + ylab(paste("shape PC1 (",SPCA$pc_var[1],"%)", sep=""))
plot <- plot + scale_shape_manual(values=shapes_l, labels=labels_l, breaks=breaks_l)
plot <- plot + scale_colour_manual(values=colours_s, labels=labels_s, breaks=breaks_s)
plot <- plot + labs(colour = "SPECIES", shape= "LOCALITY")
plot_E <- plot + theme(axis.title.x = element_text(size=14), axis.title.y = element_text(size=14), plot.title = element_text(size=14), legend.text = element_text(face="italic", size=13))
plot_E # Print scatterplot


## Plot isosize against shapePC1, localities not shown
plot <- ggplot(df, aes(isosize, shapePC1, shape = species, color = species))
plot <- plot + geom_point(size=3) + theme(aspect.ratio=1)
plot <- plot + geom_smooth(method=lm, se=FALSE, size=1)
plot <- plot + labs(title="Shape PCA of A. calandrae and A. quinarius")
plot <- plot + xlab("isometric size")
plot <- plot + ylab(paste("shape PC1 (",SPCA$pc_var[1],"%)", sep=""))
plot <- plot + scale_shape_manual(values=shapes_s, labels=labels_s, breaks=breaks_s)
plot <- plot + scale_colour_manual(values=colours_s, labels=labels_s, breaks=breaks_s)
plot <- plot + labs(colour = "SPECIES", shape= "SPECIES")
plot_F <- plot + theme(axis.title.x = element_text(size=14), axis.title.y = element_text(size=14), plot.title = element_text(size=14), legend.text = element_text(face="italic", size=13))
plot_F # Print scatterplot

# Remark:
# Function geom_smooth shows within group correlation of size and shape in plot_F.
# For species quinarius intraspecific allometric variation is evident. For calandrae it is insignificant.
# The species overlap in size, but not in shape.
# Remarkably, size range in calandrae is almost twice that of quinarius.



# PCA Ratio Spectrum for the shape PC1
pcaRS(X)

# Remark:
# The ratios ool.l/eye.b, ool.l/stv.l, ool.l/msc.l, and perhaps also ool.l/eye.h are among 
# the most important ones on the ratio spectrum.
# Are these REALLY the best ratios for separating the two species? 
# Checkout now the 2. TUTORIAL on the *LDA RATIO EXTRACTOR* 
# to see what is the very best separating ratio!




