Processing QIIME output

Manuscript information
Script information
Load packages
Import data
Tidy and filter
Remove lab contaminants
Identify sand contaminants
Look at microbiomes by storage and type (faecal/sand)
Remove sand contaminants
Add internal standard to metadata and remove from dataset

Manuscript information

Diurnal oscillations in gut bacterial load and composition eclipse seasonal and lifetime dynamics in wild meerkats, Suricata suricatta

Risely, A.1*, Wilhelm, K.1, Clutton-Brock, T.2, 3, 4, Manser, M.B. 3, 4, 5, and Sommer, S.1

Institute for Evolutionary Ecology and Conservation Genomics, Ulm, Germany
Large Animal Research Group, Department of Zoology, University of Cambridge, Downing Street, CB2 3EJ Cambridge, United Kingdom
University of Pretoria, Mammal Research Institute, Pretoria, ZA
Kalahari Research Trust, Kuruman River Reserve, Northern Cape, ZA
Department of Evolutionary Biology and Environmental Studies, University of Zurich, Zurich

Corresponding author: Alice Risely, alice.risely@uni-ulm.de

Script information

This script just processes the raw data and removes lab and sand contaminants, and looks at data distributions.
It does not add meerkat metadata used in the final analysis, as this requires access to the Kalahari Meerkat Project database.

Load packages

library(phyloseq)
library(plyr)
library(dplyr)
library(ggplot2)
library(viridis)
library("ggsci")
library(tidyverse)
library(expss)
library(metagMisc)
library(forcats)
library(decontam)
library(lubridate)
library("birk")
library(gridExtra)
library(ape)
library(ggrepel)
library(RColorBrewer)
library(speedyseq)

# session info

sessionInfo()

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
## [5] LC_TIME=English_Australia.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] speedyseq_0.5.3.9002 RColorBrewer_1.1-2   ggrepel_0.8.2       
##  [4] ape_5.4              gridExtra_2.3        birk_2.1.2          
##  [7] lubridate_1.7.9      decontam_1.6.0       metagMisc_0.0.4     
## [10] expss_0.10.2         forcats_0.5.0        stringr_1.4.0       
## [13] purrr_0.3.4          readr_1.3.1          tidyr_1.1.0         
## [16] tibble_3.0.1         tidyverse_1.3.0      ggsci_2.9           
## [19] viridis_0.5.1        viridisLite_0.3.0    ggplot2_3.3.2       
## [22] dplyr_1.0.0          plyr_1.8.6           phyloseq_1.30.0     
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-143        matrixStats_0.56.0  fs_1.4.1           
##  [4] httr_1.4.1          tools_3.6.2         backports_1.1.7    
##  [7] R6_2.4.1            vegan_2.5-6         DBI_1.1.0          
## [10] BiocGenerics_0.32.0 mgcv_1.8-31         colorspace_1.4-1   
## [13] permute_0.9-5       ade4_1.7-15         withr_2.2.0        
## [16] tidyselect_1.1.0    compiler_3.6.2      cli_2.0.2          
## [19] rvest_0.3.5         Biobase_2.46.0      htmlTable_2.0.0    
## [22] xml2_1.3.2          checkmate_2.0.0     scales_1.1.1       
## [25] digest_0.6.27       foreign_0.8-74      rmarkdown_2.8      
## [28] XVector_0.26.0      pkgconfig_2.0.3     htmltools_0.5.1.1  
## [31] dbplyr_1.4.4        htmlwidgets_1.5.1   rlang_0.4.11       
## [34] readxl_1.3.1        rstudioapi_0.11     generics_0.0.2     
## [37] jsonlite_1.7.0      magrittr_1.5        biomformat_1.14.0  
## [40] Matrix_1.2-18       Rcpp_1.0.4.6        munsell_0.5.0      
## [43] S4Vectors_0.24.4    Rhdf5lib_1.8.0      fansi_0.4.1        
## [46] lifecycle_0.2.0     stringi_1.4.6       yaml_2.2.1         
## [49] MASS_7.3-54         zlibbioc_1.32.0     rhdf5_2.30.1       
## [52] grid_3.6.2          blob_1.2.1          parallel_3.6.2     
## [55] crayon_1.3.4        lattice_0.20-38     Biostrings_2.54.0  
## [58] haven_2.3.1         splines_3.6.2       multtest_2.42.0    
## [61] hms_0.5.3           knitr_1.33          pillar_1.4.4       
## [64] igraph_1.2.5        reshape2_1.4.4      codetools_0.2-16   
## [67] stats4_3.6.2        reprex_0.3.0        glue_1.4.1         
## [70] evaluate_0.14       data.table_1.12.8   modelr_0.1.8       
## [73] vctrs_0.3.1         foreach_1.5.0       cellranger_1.1.0   
## [76] gtable_0.3.0        assertthat_0.2.1    xfun_0.22          
## [79] broom_0.5.6         survival_3.2-7      iterators_1.0.12   
## [82] IRanges_2.20.2      cluster_2.1.0       ellipsis_0.3.1

memory.limit(1000000)

## [1] 1e+06

Import data

# ASV table and taxonomy
meerkat_biom <-import_biom("C:/Users/risel/Dropbox/Sommer postdoc/Meerkat project/PROJECTS/MeerkatSpikeInData/Analysis updated/DATA/meerkat_otu_table.biom") 

# sample metadata
meerkat_map <-import_qiime_sample_data ('C:/Users/risel/Dropbox/Sommer postdoc/Meerkat project/PROJECTS/MeerkatSpikeInData/Analysis updated/DATA/meerkat_metadata_updated.txt')

## phylo tree using fragement insertion method

meerkat_tree_FI<-read_tree("C:/Users/risel/Dropbox/Sommer postdoc/Meerkat project/PROJECTS/MeerkatSpikeInData/Analysis updated/DATA/meerkat_tree.nwk") # FI = fragemnt insertion

# merge phyloseq
meerkat_original <-merge_phyloseq(meerkat_biom, meerkat_map, meerkat_tree_FI)

meerkat_original

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 34284 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 34284 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 34284 tips and 34282 internal nodes ]:
## taxa are rows

meerkat<-meerkat_original

Tidy and filter

meerkat

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 34284 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 34284 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 34284 tips and 34282 internal nodes ]:
## taxa are rows

colnames(tax_table(meerkat)) <- c("Kingdom", "Phylum", "Class",  "Order", "Family", "Genus", "Species")


###get rid of the D_1 etc characters infront of tax names

taxonomy<-data.frame(meerkat@tax_table@.Data)

taxonomy$Kingdom<-substr(taxonomy$Kingdom, 6, 60)
taxonomy$Phylum<-substr(taxonomy$Phylum, 6, 60)
taxonomy$Class<-substr(taxonomy$Class, 6, 60)
taxonomy$Order<-substr(taxonomy$Order, 6, 60)
taxonomy$Family<-substr(taxonomy$Family, 6, 60)
taxonomy$Genus<-substr(taxonomy$Genus, 6, 60)

taxonomy1<-tax_table(as.matrix(taxonomy))

tax_table(meerkat)<-taxonomy1

meerkat

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 34284 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 34284 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 34284 tips and 34282 internal nodes ]:
## taxa are rows

##filter taxa that are not bacteria or not assigned at phylum level

meerkat <- meerkat %>%
  subset_taxa(
    Kingdom == "Bacteria")
meerkat

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 34012 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 34012 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 34012 tips and 34010 internal nodes ]:
## taxa are rows

meerkat <- meerkat %>%
  subset_taxa(
    Phylum != "NA")
meerkat

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 33811 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 33811 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 33811 tips and 33809 internal nodes ]:
## taxa are rows

meerkat <- meerkat %>%
  subset_taxa(
    Family != "Mitochondria")
meerkat

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 31596 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 31596 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 31596 tips and 31594 internal nodes ]:
## taxa are rows

meerkat <- meerkat %>%
  subset_taxa(
    Class != "Chloroplast")

meerkat <- meerkat %>%
  subset_taxa(
    Order != "Chloroplast")


meerkat

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 31587 taxa and 1223 samples ]:
## sample_data() Sample Data:        [ 1223 samples by 8 sample variables ]:
## tax_table()   Taxonomy Table:     [ 31587 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 31587 tips and 31585 internal nodes ]:
## taxa are rows

paste(100-((sum(taxa_sums(meerkat))/sum(taxa_sums(meerkat_original))) *100), "% reads removed")

## [1] "1.68412612071259 % reads removed"

Remove lab contaminants

#### lab contaminants

meerkat<-subset_samples(meerkat, Storage !="ZYMO")

sample_data(meerkat)$Sample_or_Control<-ifelse(sample_data(meerkat)$Storage == "EXTRACTION CONTROL" | sample_data(meerkat)$Storage == "PCR CONTROL", "CONTROL", "TRUE SAMPLE")

table(sample_data(meerkat)$Sample_or_Control)

## 
##     CONTROL TRUE SAMPLE 
##          31        1191

df <- as.data.frame(sample_data(meerkat)) # Put sample_data into a ggplot-friendly data.frame
df$LibrarySize <- sample_sums(meerkat)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))

ggplot(data=df, aes(x=Index, y=LibrarySize, color=Sample_or_Control)) + 
  geom_point()+
  ggtitle("Sequencing depth of negative controls")

############ prevalence method

sample_data(meerkat)$is.neg <- sample_data(meerkat)$Sample_or_Control == "CONTROL"
contamdf.prev <- isContaminant(meerkat, method="prevalence", neg="is.neg")
table(contamdf.prev$contaminant)

## 
## FALSE  TRUE 
## 31576    11

contaminants<- row.names(subset(contamdf.prev, contaminant == TRUE))
contaminants_phylo<-prune_taxa(contaminants, meerkat)


### rmeove contaminants

taxa.names<-taxa_names(meerkat)

ToKeep <- taxa.names[!(taxa.names %in% contaminants)]

meerkat_nocontams<-prune_taxa(ToKeep, meerkat)

100-((sum(taxa_sums(meerkat_nocontams))/sum(taxa_sums(meerkat))) *100)

## [1] 0.00292979

# 0.003% reads removed

Identify sand contaminants

unique(sample_data(meerkat_nocontams)$Storage)

## [1] FROZEN               FREEZEDRIED          FREEZEDRIED_RNALATER
## [4] SAND                 EXTRACTION CONTROL   PCR CONTROL         
## [7] SAND-SAMPLE         
## 7 Levels: EXTRACTION CONTROL FREEZEDRIED FREEZEDRIED_RNALATER ... SAND-SAMPLE

sandcontamination<-subset_samples(meerkat_nocontams, sample_data(meerkat_nocontams)$Storage != "EXTRACTION CONTROL"  & sample_data(meerkat_nocontams)$Storage != "PCR CONTROL"& sample_data(meerkat_nocontams)$Storage != "ZYMO" & sample_data(meerkat_nocontams)$Storage != "SAND-SAMPLE")
sandcontamination1<-subset_samples(meerkat_nocontams, sample_data(meerkat_nocontams)$Storage != "EXTRACTION CONTROL"  & sample_data(meerkat_nocontams)$Storage != "PCR CONTROL"& sample_data(meerkat_nocontams)$Storage != "ZYMO" & sample_data(meerkat_nocontams)$Storage != "SAND-SAMPLE")
unique(sample_data(sandcontamination)$Storage)

## [1] FROZEN               FREEZEDRIED          FREEZEDRIED_RNALATER
## [4] SAND                
## Levels: FREEZEDRIED FREEZEDRIED_RNALATER FROZEN SAND

########

sample_data(sandcontamination)$is.neg <- sample_data(sandcontamination)$Storage == "SAND"
contamdf.prev <- decontam::isContaminant(sandcontamination, method="prevalence", neg="is.neg", threshold = 0.25)

#to be less strict use this threshold:
#contamdf.prev <- isContaminant(sandcontamination, method="prevalence", neg="is.neg", threshold = 0.2)


table(contamdf.prev$contaminant)

## 
## FALSE  TRUE 
## 28919  2657

sample_data(sandcontamination)$Sand_or_Faecal<-ifelse(sample_data(sandcontamination)$Storage == "SAND", "SAND", "FAECAL")


ps.pa <- transform_sample_counts(sandcontamination, function(abund) 1*(abund>0))
ps.pa.neg <- prune_samples(sample_data(ps.pa)$Sand_or_Faecal == "SAND", ps.pa)
ps.pa.pos <- prune_samples(sample_data(ps.pa)$Sand_or_Faecal == "FAECAL", ps.pa)
# Make data.frame of prevalence in positive and negative samples
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
                    contaminant=contamdf.prev$contaminant)
df.pa$ASV<-row.names(df.pa)

subset(df.pa, pa.neg>4 & pa.pos > 900)

ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, fill=contaminant, label =ASV)) + 
  geom_jitter(alpha = 0.5, size = 3, pch = 21, col = "black") +
  theme_bw(base_size = 16)+
  xlab("Prevalence (Sand samples)") + 
  ylab("Prevalence (Faecal Samples)")+
  ggtitle("Choosing sand contaminants")+
  #  scale_y_log10()+
  geom_text_repel( data = subset(df.pa, pa.neg>10 & pa.pos > 900))+
  geom_text_repel( data = subset(df.pa, pa.neg>5 & pa.neg <7 & pa.pos > 300 & pa.pos < 500 ))+
  scale_fill_manual(values = c("skyblue", "red"))

### CHANGE ONE ASV TO CONTAMINANT

df.pa$contaminant<-ifelse(df.pa$pa.neg>5 & df.pa$pa.neg <7 & df.pa$pa.pos > 300 & df.pa$pa.pos < 500 , "TRUE", df.pa$contaminant)

ggplot(data=df.pa, aes(x=pa.neg/max(pa.neg), y=pa.pos/max(pa.pos), fill=contaminant, label =ASV)) + 
  geom_jitter(alpha = 0.5, size = 3, pch = 21, col = "black") +
  theme_bw(base_size = 12)+
  xlab("Prevalence (Sand samples)") + 
  ylab("Prevalence (Faecal samples)")+
  ggtitle("Choosing sand contaminants")+
  scale_fill_manual(values = c("skyblue", "red"))

Look at microbiomes by storage and type (faecal/sand)

sand_beta<-sandcontamination1
sand_beta<-subset_samples(sand_beta, sample_sums(sand_beta)>1000)

sand_beta<- phyloseq::rarefy_even_depth(sand_beta, sample.size = 6000,rngsee = 100, replace = TRUE, trimOTUs=TRUE,verbose=TRUE)

sample_data(sand_beta)$Storage <-ifelse(sample_data(sand_beta)$Storage=="FREEZEDRIED_RNALATER", "FREEZEDRIED", as.character(sample_data(sand_beta)$Storage))

## beta diversity plot
## beta diversity plot
## beta diversity plot

meerkat.ord.bray <- ordinate(sand_beta, "MDS", "bray")
plot_ordination(sand_beta, meerkat.ord.bray, "samples", color="Storage")+ggtitle("Bray Curtis")+
  scale_color_viridis(discrete = TRUE, option = "D")

meerkat.ord.jac <- ordinate(sand_beta, "MDS", "jaccard")

plot_ordination(sand_beta, meerkat.ord.jac, "samples", color="Storage")+ggtitle("Jaccard")+
  scale_color_viridis(discrete = TRUE, option = "D")

########### stacked barplot 
########### stacked barplot 
########### stacked barplot 


sand_bar<-merge_samples(sand_beta, "Storage", fun=mean)
#sand_bar<-transform(sand_bar, transform = "compositional")
sand_bar<-tax_glom(sand_bar, taxrank = "Family")


sand_bar<-merge_samples(sand_beta, "Storage", fun=mean)

barplot_df <- sand_bar %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt() %>%                                         # Melt to long format
  arrange(Phylum)  %>%                                    # Sort data frame alphabetically by phylum
  filter(Abundance > 0.01)                        # Filter out low abundance taxa


mypal = pal_npg("nrc", alpha = 0.7)(10)

ggplot(barplot_df, aes(x = Sample, y = Abundance, fill = Phylum)) + 
  geom_bar(stat = "identity") +
  scale_fill_manual(values = mypal)+
  theme_bw(base_size = 16)

#### by family
#### by family
#### by family


barplot_df_fam <- sand_bar %>%
  speedyseq::tax_glom(taxrank = "Family") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt() %>%                                         # Melt to long format
  arrange(Phylum)  %>%                                    # Sort data frame alphabetically by phylum
  filter(Abundance > 0.01)                        # Filter out low abundance taxa


## colour palette 

n <- 35
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))


ggplot(barplot_df_fam, aes(x = Sample, y = Abundance, fill = Family)) + 
  geom_bar(stat = "identity") +
  scale_fill_manual(values = col_vector)+
  theme_bw(base_size = 14)

#######

detach(package:plyr) # plyr messes with the mutate function

Remove sand contaminants

taxa.names<-taxa_names(meerkat_nocontams)

sand_contaminants<-row.names(subset(df.pa, contaminant == TRUE))
length(sand_contaminants)

## [1] 2658

ToKeep <- taxa.names[!(taxa.names %in% sand_contaminants)]

meerkat_nosandcontams<-prune_taxa(ToKeep, meerkat_nocontams)

meerkat_nocontams

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 31576 taxa and 1222 samples ]:
## sample_data() Sample Data:        [ 1222 samples by 10 sample variables ]:
## tax_table()   Taxonomy Table:     [ 31576 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 31576 tips and 31574 internal nodes ]:
## taxa are rows

meerkat_nosandcontams

## phyloseq-class experiment-level object
## otu_table()   OTU Table:          [ 28918 taxa and 1222 samples ]:
## sample_data() Sample Data:        [ 1222 samples by 10 sample variables ]:
## tax_table()   Taxonomy Table:     [ 28918 taxa by 7 taxonomic ranks ]:
## phy_tree()    Phylogenetic Tree:  [ 28918 tips and 28917 internal nodes ]:
## taxa are rows

paste(100-((sum(taxa_sums(meerkat_nosandcontams))/sum(taxa_sums(meerkat_nocontams))) *100), "% reads removed")

## [1] "4.34253158851055 % reads removed"

paste(100-((sum(taxa_sums(meerkat_nosandcontams))/sum(taxa_sums(meerkat_original))) *100), "% reads removed")

## [1] "6.04056886555094 % reads removed"

### 28918 ASVs removed as sand contaminants

#########

unique(sample_data(meerkat_nosandcontams)$Storage )

## [1] FROZEN               FREEZEDRIED          FREEZEDRIED_RNALATER
## [4] SAND                 EXTRACTION CONTROL   PCR CONTROL         
## [7] SAND-SAMPLE         
## 7 Levels: EXTRACTION CONTROL FREEZEDRIED FREEZEDRIED_RNALATER ... SAND-SAMPLE

meerkat_final<-subset_samples(meerkat_nosandcontams, sample_data(meerkat_nosandcontams)$Storage == "FROZEN" | sample_data(meerkat_nosandcontams)$Storage == "FREEZEDRIED" |sample_data(meerkat_nosandcontams)$Storage == "FREEZEDRIED_RNALATER")

min(taxa_sums(meerkat_final))

## [1] 0

meerkat_final <- prune_taxa(taxa_sums(meerkat_final) > 0, meerkat_final) 


100 - (sum(taxa_sums(meerkat_final))/sum(taxa_sums(meerkat_nocontams)))*100

## [1] 5.129891

sum(taxa_sums(meerkat_final))/sum(taxa_sums(meerkat_original))

## [1] 0.9318605

## overal 7% of reads were removed during filtering process

Add internal standard to metadata and remove from dataset

imtechella<-subset_taxa(meerkat_final, Genus =="Imtechella")
allobacillus<-subset_taxa(meerkat_final, Genus =="Allobacillus")

sample_data(meerkat_final)$Imtechella<-sample_sums(imtechella)
sample_data(meerkat_final)$Allobacillus<-sample_sums(allobacillus)
sample_data(meerkat_final)$Seq_depth<-sample_sums(meerkat_final)


ggplot(sample_data(meerkat_final), aes(x = sample_data(meerkat_final)$Imtechella, y = sample_data(meerkat_final)$Allobacillus))+
  geom_point(aes(fill = sample_data(meerkat_final)$Storage), pch = 21, col = "black", size = 1.5)+
  theme_bw(base_size = 14)+
  #scale_x_log10()+
  #scale_y_log10()+
  theme(legend.position = "none")+
  ylab("No. Allobacillus reads")+
  xlab("No. Imtechella reads")+
  ggtitle("Correlation between Imtechella and Allobacillus")+
  geom_abline(slope = 1)+
  ylim(0,NA)+
  xlim(0,NA)

ggplot(sample_data(meerkat_final), aes(x = sample_data(meerkat_final)$Imtechella, y = sample_data(meerkat_final)$Allobacillus))+
  geom_point(aes(fill = sample_data(meerkat_final)$Storage), pch = 21, col = "black", size = 1.5)+
  theme_bw(base_size = 14)+
  scale_x_log10(limits = c(1e1,1e5))+
  scale_y_log10(limits = c(1e1,1e5))+
  theme(legend.position = "none")+
  ylab("No. Allobacillus reads")+
  xlab("No. Imtechella reads")+
  ggtitle("Correlation between Imtechella and Allobacillus")+
  geom_abline(slope = 1)

####################


############### remove two samples with low spike in counts (these have replicates anyway in next runs)

meerkat_final<- subset_samples(meerkat_final, Imtechella > 0 & Allobacillus > 0)

sample_data(meerkat_final)$Mean_spikein<-(sample_data(meerkat_final)$Imtechella+ sample_data(meerkat_final)$Allobacillus)/2

########## generate scaling factor: this is the number to multiply all reads with so that theoretically the abundance of ALlobacillus would be equal across all samples

sample_data(meerkat_final)$scaling_factor<-mean(sample_data(meerkat_final)$Allobacillus) /sample_data(meerkat_final)$Allobacillus


################## now remove all spike in from phylo object
################## now remove all spike in from phylo object
################## now remove all spike in from phylo object
################## now remove all spike in from phylo object

spikein<- subset_taxa(meerkat_final, Genus =="Imtechella" | Genus =="Allobacillus")
tax_table(spikein)

## Taxonomy Table:     [ 13 taxa by 7 taxonomic ranks ]:
##                    Kingdom  Phylum   Class  Order   Family   Genus  Species     
##                    <fct>    <fct>    <fct>  <fct>   <fct>    <fct>  <fct>       
##  1 1c4da9ec7e9217~ Bacteria Firmicu~ Bacil~ Bacill~ Bacilla~ Allob~ <NA>        
##  2 b92c7cfb137fee~ Bacteria Firmicu~ Bacil~ Bacill~ Bacilla~ Allob~ D_6__Alloba~
##  3 098b1d1068844c~ Bacteria Firmicu~ Bacil~ Bacill~ Bacilla~ Allob~ D_6__Alloba~
##  4 973fa08cb5cc57~ Bacteria Firmicu~ Bacil~ Bacill~ Bacilla~ Allob~ D_6__Alloba~
##  5 8d64369815c986~ Bacteria Firmicu~ Bacil~ Bacill~ Bacilla~ Allob~ D_6__Alloba~
##  6 b1b62a9ef68483~ Bacteria Firmicu~ Bacil~ Bacill~ Bacilla~ Allob~ <NA>        
##  7 2d904743b53167~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~
##  8 69822a606cf2da~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~
##  9 59f5254a41f617~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~
## 10 e62d5872faf30e~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~
## 11 941351349c123e~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~
## 12 8d4c77cdd516a4~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~
## 13 71037fedb85db4~ Bacteria Bactero~ Bacte~ Flavob~ Flavoba~ Imtec~ D_6__Imtech~

otu_table(spikein)

## OTU Table:          [ 13 taxa and 1170 samples ]:
## Taxa are rows
##               `1012_run2` `1012` `10158_run2` `10158` `10789_run2` `10789`
##  1 1c4da9ec7~           0      0            0       0            0       0
##  2 b92c7cfb1~         396    331          486     562          372     301
##  3 098b1d106~           0      0            0       0            0       0
##  4 973fa08cb~           0      0            0       0            0       0
##  5 8d6436981~           0      0            0       0            0       0
##  6 b1b62a9ef~           0      0            0       0            0       0
##  7 2d904743b~         314    247          497     539          383     252
##  8 69822a606~           0      0            0       0            0       0
##  9 59f5254a4~           0      0            0       0            0       0
## 10 e62d5872f~           0      0            0       0            0       0
## 11 941351349~           0      0            0       0            0       0
## 12 8d4c77cdd~           0      0            0       0            0       0
## 13 71037fedb~           0      0            0       0            0       0
## # ... with 11 more samples (columns)

spikein_list<-taxa_names(spikein)

taxa.names<-taxa_names(meerkat_final)

ToKeep <- taxa.names[!(taxa.names %in% spikein_list)]

meerkat_final<-prune_taxa(ToKeep, meerkat_final)


### add sample density to metadata file

sample_data(meerkat_final)$Seq_depth_nospikein<-sample_sums(meerkat_final)


meerkat_nocontams<-meerkat_final ## save copy

spikein<-data.frame(sample_data(meerkat_final))
spikein$spikein_total<-spikein$Imtechella + spikein$Allobacillus
spikein$Percent_spikein<-spikein$spikein_total/ spikein$Seq_depth

summary(spikein$Percent_spikein)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001973 0.015558 0.033224 0.104153 0.113640 0.918437

ggplot(spikein, aes(x= Percent_spikein))+
  geom_histogram(bins = 50, fill = "grey", col = "black")+
  theme_light(base_size = 14)+
  # ylim(0,1)+
  # theme(axis.text.x = element_blank())+
  ylab("Frequency")+
  xlab("Proportion spike-in")+
  ggtitle("Histogram of spike-in proportion per sample")