Generate summary statistics for the colony genotypes

Here I’ll calculate summary statistics for the microhaplotype markers for the breeding colony samples, including HWE and F-statistics.

Calculate gene frequencies - need to do all of these calculations separately for the individual populations.

exp_heterozygosity = 2 * Freq * (1 - Freq)) 1 - sum(p2 + q2)

Fis = exp_heterozygosity - observed_heterozygosity / expected heterozygosity

library(tidyverse)
library(CKMRsim)
library(hierfstat)

# genotypes
no_hi_missers_clean <- read_csv("csv_outputs/baseline_no_hi_missers_clean.csv")

# read in population information
pops <- read_csv("data/nofu_ids_04022019.csv")

Separate the populations

subpop_genos <- no_hi_missers_clean %>%
  left_join(., pops)
Joining, by = "NMFS_DNA_ID"

Allele frequencies

# we will use this some more
kg2 <- subpop_genos %>% 
  select(NMFS_DNA_ID, locus, allele, group) %>%
  mutate(Chrom = "GTseq") %>% 
  mutate(Pos = as.integer(factor(locus, levels = unique(locus)))) %>%
  rename(Locus = locus,
         Allele = allele) %>%
  select(NMFS_DNA_ID, Chrom, Locus, Pos, Allele, group) %>%
  ungroup()

# get the allele freqs
kg_ckmr_markers <- kg2 %>%
  filter(!is.na(Allele)) %>% # it is vital to filter out the NAs at this stage
  group_by(Chrom, Locus, Pos, Allele) %>%
  summarise(counts = n()) %>%
  group_by(Locus, Pos) %>%
  mutate(Freq = counts / sum(counts)) %>%
  select(-counts) %>%
  mutate(AlleIdx = 1,
         LocIdx = 1) %>%
  reindex_markers(.)

Turn alleles into integers, spread them and then get them into the right format to run rubias.

# first make integers of the alleles
alle_idxs <- subpop_genos %>% 
  select(NMFS_DNA_ID, locus, gene_copy, allele, group) %>%
  group_by(locus) %>%
  mutate(alleidx = as.integer(factor(allele, levels = unique(allele)))) %>%
  ungroup() %>%
  arrange(group, NMFS_DNA_ID, locus, alleidx) %>%
  select(group, NMFS_DNA_ID, locus, gene_copy, alleidx) %>%
  mutate(alleidx = ifelse(is.na(alleidx), 0, alleidx)) # do not do this for rubias - leave as NA's

F stats

# reformat
# first the first symbol
alle_idxs$locus <- gsub(":", "_", alle_idxs$locus)

# and the second
alle_idxs$locus <- gsub("-", "_", alle_idxs$locus)

# spread the genotypes
two_col <- alle_idxs %>%
  unite(loc, locus, gene_copy, sep = "_") %>%
  spread(loc, alleidx)

# convert that to an object "loci"
data_loci <- alleles2loci(two_col, ploidy = 2, rownames = 2, population = 1)

# convert that into a genind object
data_gen <- loci2genind(data_loci, ploidy = 2, na.alleles = 0)

Now, using hierfstat for the F-statistcs

test_h <- basic.stats(data_gen, diploid = TRUE)

# FIS per population
pop_fis <- test_h$Fis %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  rename(locus = rowname)

# observed heterozygosity
pop_Ho <- test_h$Ho %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  rename(locus = rowname)

# expected heterozygosity
pop_Hs <- test_h$Hs %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  rename(locus = rowname)

HWE

HWE using the genind object

# overall
hw_gen <- pegas::hw.test(data_gen)

By population (Super helpful tutorial on these things here: https://grunwaldlab.github.io/Population_Genetics_in_R/Locus_Stats.html)

# separately, by population
data.pop <- seppop(data_gen) %>% lapply(hw.test, B = 0) # by setting B = 0, we're just looking at the analytical pvalue

data.mat <- sapply(data.pop, "[", i = TRUE, j = 3) # Take the third column with all rows

# filtering and identifying loci out of HWE in multiple colonies
loci_out_of_hw <- data.mat %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  rename(Locus = rowname) %>%
  pivot_longer(names_to = "Colony", 2:5) %>%
  rename(pval = value) %>%
  filter(pval < 0.05) %>%
  group_by(Locus) %>%
  tally() %>%
  arrange(desc(n))

Three loci out of HWE in all four colonies and five loci out of HWE in three of 4. I’ll remove all 8 loci.

Create a list of those loci to remove before the self-assignment.

loci_out_of_hw %>%
  filter(n > 2) %>%
  write_csv("csv_outputs/loci_out_hwe.csv")

I also should probably have a list of the HWE pval for each colony.

data.mat %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  rename(Locus = rowname) %>%
  write_csv("csv_outputs/locus_pop_hwe_pval.csv")

And then make a dataframe with the per-colony heterozygosities and FIS:

pop_Ho %>%
  rename(Chag_Ho = Chagulak) %>%
  rename(Prib_Ho = Pribilof) %>%
  rename(Semi_Ho = Semidi) %>%
  rename(StMa_Ho = StMatthew) %>%
  left_join(., Hs) %>%
  left_join(., Fis) %>%
  select(locus, Chag_Ho, Chag_Hs, Chag_Fis, Prib_Ho, Prib_Hs, Prib_Fis, Semi_Ho, Semi_Hs, Semi_Fis, StMa_Ho, StMa_Hs, StMa_Fis)
Joining, by = "locus"
Joining, by = "locus"
LS0tCnRpdGxlOiAiY29sb255IHN1bW1hcnkgc3RhdGlzdGljcyBhbmQgSFdFIGNhbGN1bGF0aW9ucyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMgR2VuZXJhdGUgc3VtbWFyeSBzdGF0aXN0aWNzIGZvciB0aGUgY29sb255IGdlbm90eXBlcwoKSGVyZSBJJ2xsIGNhbGN1bGF0ZSBzdW1tYXJ5IHN0YXRpc3RpY3MgZm9yIHRoZSBtaWNyb2hhcGxvdHlwZSBtYXJrZXJzIGZvciB0aGUgYnJlZWRpbmcgY29sb255IHNhbXBsZXMsIGluY2x1ZGluZyBIV0UgYW5kIEYtc3RhdGlzdGljcy4KCkNhbGN1bGF0ZSBnZW5lIGZyZXF1ZW5jaWVzIC0gbmVlZCB0byBkbyBhbGwgb2YgdGhlc2UgY2FsY3VsYXRpb25zIHNlcGFyYXRlbHkgZm9yIHRoZSBpbmRpdmlkdWFsIHBvcHVsYXRpb25zLgoKZXhwX2hldGVyb3p5Z29zaXR5ID0gMiAqIEZyZXEgKiAoMSAtIEZyZXEpKQoxIC0gc3VtKHAyICsgcTIpCgpGaXMgPSBleHBfaGV0ZXJvenlnb3NpdHkgLSBvYnNlcnZlZF9oZXRlcm96eWdvc2l0eSAvIGV4cGVjdGVkIGhldGVyb3p5Z29zaXR5CgoKYGBge3IgbG9hZC1kYXRhLWFuZC1saWJzfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKbGlicmFyeShDS01Sc2ltKQpsaWJyYXJ5KGhpZXJmc3RhdCkKCiMgZ2Vub3R5cGVzCm5vX2hpX21pc3NlcnNfY2xlYW4gPC0gcmVhZF9jc3YoImNzdl9vdXRwdXRzL2Jhc2VsaW5lX25vX2hpX21pc3NlcnNfY2xlYW4uY3N2IikKCiMgcmVhZCBpbiBwb3B1bGF0aW9uIGluZm9ybWF0aW9uCnBvcHMgPC0gcmVhZF9jc3YoImRhdGEvbm9mdV9pZHNfMDQwMjIwMTkuY3N2IikKCmBgYAoKU2VwYXJhdGUgdGhlIHBvcHVsYXRpb25zCmBgYHtyfQpzdWJwb3BfZ2Vub3MgPC0gbm9faGlfbWlzc2Vyc19jbGVhbiAlPiUKICBsZWZ0X2pvaW4oLiwgcG9wcykKYGBgCgoKIyMgQWxsZWxlIGZyZXF1ZW5jaWVzCgpgYGB7ciBrZWxwLWdlbm9zfQojIHdlIHdpbGwgdXNlIHRoaXMgc29tZSBtb3JlCmtnMiA8LSBzdWJwb3BfZ2Vub3MgJT4lIAogIHNlbGVjdChOTUZTX0ROQV9JRCwgbG9jdXMsIGFsbGVsZSwgZ3JvdXApICU+JQogIG11dGF0ZShDaHJvbSA9ICJHVHNlcSIpICU+JSAKICBtdXRhdGUoUG9zID0gYXMuaW50ZWdlcihmYWN0b3IobG9jdXMsIGxldmVscyA9IHVuaXF1ZShsb2N1cykpKSkgJT4lCiAgcmVuYW1lKExvY3VzID0gbG9jdXMsCiAgICAgICAgIEFsbGVsZSA9IGFsbGVsZSkgJT4lCiAgc2VsZWN0KE5NRlNfRE5BX0lELCBDaHJvbSwgTG9jdXMsIFBvcywgQWxsZWxlLCBncm91cCkgJT4lCiAgdW5ncm91cCgpCgojIGdldCB0aGUgYWxsZWxlIGZyZXFzCmtnX2NrbXJfbWFya2VycyA8LSBrZzIgJT4lCiAgZmlsdGVyKCFpcy5uYShBbGxlbGUpKSAlPiUgIyBpdCBpcyB2aXRhbCB0byBmaWx0ZXIgb3V0IHRoZSBOQXMgYXQgdGhpcyBzdGFnZQogIGdyb3VwX2J5KENocm9tLCBMb2N1cywgUG9zLCBBbGxlbGUpICU+JQogIHN1bW1hcmlzZShjb3VudHMgPSBuKCkpICU+JQogIGdyb3VwX2J5KExvY3VzLCBQb3MpICU+JQogIG11dGF0ZShGcmVxID0gY291bnRzIC8gc3VtKGNvdW50cykpICU+JQogIHNlbGVjdCgtY291bnRzKSAlPiUKICBtdXRhdGUoQWxsZUlkeCA9IDEsCiAgICAgICAgIExvY0lkeCA9IDEpICU+JQogIHJlaW5kZXhfbWFya2VycyguKQpgYGAKClR1cm4gYWxsZWxlcyBpbnRvIGludGVnZXJzLCBzcHJlYWQgdGhlbSBhbmQgdGhlbiBnZXQgdGhlbSBpbnRvIHRoZSByaWdodCBmb3JtYXQgdG8gcnVuIHJ1Ymlhcy4KYGBge3Igc3ByZWFkLWdlbm9zfQojIGZpcnN0IG1ha2UgaW50ZWdlcnMgb2YgdGhlIGFsbGVsZXMKYWxsZV9pZHhzIDwtIHN1YnBvcF9nZW5vcyAlPiUgCiAgc2VsZWN0KE5NRlNfRE5BX0lELCBsb2N1cywgZ2VuZV9jb3B5LCBhbGxlbGUsIGdyb3VwKSAlPiUKICBncm91cF9ieShsb2N1cykgJT4lCiAgbXV0YXRlKGFsbGVpZHggPSBhcy5pbnRlZ2VyKGZhY3RvcihhbGxlbGUsIGxldmVscyA9IHVuaXF1ZShhbGxlbGUpKSkpICU+JQogIHVuZ3JvdXAoKSAlPiUKICBhcnJhbmdlKGdyb3VwLCBOTUZTX0ROQV9JRCwgbG9jdXMsIGFsbGVpZHgpICU+JQogIHNlbGVjdChncm91cCwgTk1GU19ETkFfSUQsIGxvY3VzLCBnZW5lX2NvcHksIGFsbGVpZHgpICU+JQogIG11dGF0ZShhbGxlaWR4ID0gaWZlbHNlKGlzLm5hKGFsbGVpZHgpLCAwLCBhbGxlaWR4KSkgIyBkbyBub3QgZG8gdGhpcyBmb3IgcnViaWFzIC0gbGVhdmUgYXMgTkEncwoKYGBgCgoKIyMgRiBzdGF0cwoKYGBge3IgZnN0YXRzLWluLWhpZXJmc3RhdH0KIyByZWZvcm1hdAojIGZpcnN0IHRoZSBmaXJzdCBzeW1ib2wKYWxsZV9pZHhzJGxvY3VzIDwtIGdzdWIoIjoiLCAiXyIsIGFsbGVfaWR4cyRsb2N1cykKCiMgYW5kIHRoZSBzZWNvbmQKYWxsZV9pZHhzJGxvY3VzIDwtIGdzdWIoIi0iLCAiXyIsIGFsbGVfaWR4cyRsb2N1cykKCiMgc3ByZWFkIHRoZSBnZW5vdHlwZXMKdHdvX2NvbCA8LSBhbGxlX2lkeHMgJT4lCiAgdW5pdGUobG9jLCBsb2N1cywgZ2VuZV9jb3B5LCBzZXAgPSAiXyIpICU+JQogIHNwcmVhZChsb2MsIGFsbGVpZHgpCgojIGNvbnZlcnQgdGhhdCB0byBhbiBvYmplY3QgImxvY2kiCmRhdGFfbG9jaSA8LSBhbGxlbGVzMmxvY2kodHdvX2NvbCwgcGxvaWR5ID0gMiwgcm93bmFtZXMgPSAyLCBwb3B1bGF0aW9uID0gMSkKCiMgY29udmVydCB0aGF0IGludG8gYSBnZW5pbmQgb2JqZWN0CmRhdGFfZ2VuIDwtIGxvY2kyZ2VuaW5kKGRhdGFfbG9jaSwgcGxvaWR5ID0gMiwgbmEuYWxsZWxlcyA9IDApCgpgYGAKCk5vdywgdXNpbmcgaGllcmZzdGF0IGZvciB0aGUgRi1zdGF0aXN0Y3MKYGBge3J9CnRlc3RfaCA8LSBiYXNpYy5zdGF0cyhkYXRhX2dlbiwgZGlwbG9pZCA9IFRSVUUpCgojIEZJUyBwZXIgcG9wdWxhdGlvbgpwb3BfZmlzIDwtIHRlc3RfaCRGaXMgJT4lCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIHJvd25hbWVzX3RvX2NvbHVtbigpICU+JQogIHJlbmFtZShsb2N1cyA9IHJvd25hbWUpCgojIG9ic2VydmVkIGhldGVyb3p5Z29zaXR5CnBvcF9IbyA8LSB0ZXN0X2gkSG8gJT4lCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIHJvd25hbWVzX3RvX2NvbHVtbigpICU+JQogIHJlbmFtZShsb2N1cyA9IHJvd25hbWUpCgojIGV4cGVjdGVkIGhldGVyb3p5Z29zaXR5CnBvcF9IcyA8LSB0ZXN0X2gkSHMgJT4lCiAgYXMuZGF0YS5mcmFtZSgpICU+JQogIHJvd25hbWVzX3RvX2NvbHVtbigpICU+JQogIHJlbmFtZShsb2N1cyA9IHJvd25hbWUpCgpgYGAKCgoKIyMgSFdFCgpIV0UgdXNpbmcgdGhlIGdlbmluZCBvYmplY3QKYGBge3J9CiMgb3ZlcmFsbApod19nZW4gPC0gcGVnYXM6Omh3LnRlc3QoZGF0YV9nZW4pCmBgYAoKCkJ5IHBvcHVsYXRpb24KKFN1cGVyIGhlbHBmdWwgdHV0b3JpYWwgb24gdGhlc2UgdGhpbmdzIGhlcmU6IGh0dHBzOi8vZ3J1bndhbGRsYWIuZ2l0aHViLmlvL1BvcHVsYXRpb25fR2VuZXRpY3NfaW5fUi9Mb2N1c19TdGF0cy5odG1sKQpgYGB7cn0KIyBzZXBhcmF0ZWx5LCBieSBwb3B1bGF0aW9uCmRhdGEucG9wIDwtIHNlcHBvcChkYXRhX2dlbikgJT4lIGxhcHBseShody50ZXN0LCBCID0gMCkgIyBieSBzZXR0aW5nIEIgPSAwLCB3ZSdyZSBqdXN0IGxvb2tpbmcgYXQgdGhlIGFuYWx5dGljYWwgcHZhbHVlCgpkYXRhLm1hdCA8LSBzYXBwbHkoZGF0YS5wb3AsICJbIiwgaSA9IFRSVUUsIGogPSAzKSAjIFRha2UgdGhlIHRoaXJkIGNvbHVtbiB3aXRoIGFsbCByb3dzCgojIGZpbHRlcmluZyBhbmQgaWRlbnRpZnlpbmcgbG9jaSBvdXQgb2YgSFdFIGluIG11bHRpcGxlIGNvbG9uaWVzCmxvY2lfb3V0X29mX2h3IDwtIGRhdGEubWF0ICU+JQogIGFzLmRhdGEuZnJhbWUoKSAlPiUKICByb3duYW1lc190b19jb2x1bW4oKSAlPiUKICByZW5hbWUoTG9jdXMgPSByb3duYW1lKSAlPiUKICBwaXZvdF9sb25nZXIobmFtZXNfdG8gPSAiQ29sb255IiwgMjo1KSAlPiUKICByZW5hbWUocHZhbCA9IHZhbHVlKSAlPiUKICBmaWx0ZXIocHZhbCA8IDAuMDUpICU+JQogIGdyb3VwX2J5KExvY3VzKSAlPiUKICB0YWxseSgpICU+JQogIGFycmFuZ2UoZGVzYyhuKSkKYGBgClRocmVlIGxvY2kgb3V0IG9mIEhXRSBpbiBhbGwgZm91ciBjb2xvbmllcyBhbmQgZml2ZSBsb2NpIG91dCBvZiBIV0UgaW4gdGhyZWUgb2YgNC4gCkknbGwgcmVtb3ZlIGFsbCA4IGxvY2kuCgpDcmVhdGUgYSBsaXN0IG9mIHRob3NlIGxvY2kgdG8gcmVtb3ZlIGJlZm9yZSB0aGUgc2VsZi1hc3NpZ25tZW50LgpgYGB7cn0KbG9jaV9vdXRfb2ZfaHcgJT4lCiAgZmlsdGVyKG4gPiAyKSAlPiUKICB3cml0ZV9jc3YoImNzdl9vdXRwdXRzL2xvY2lfb3V0X2h3ZS5jc3YiKQpgYGAKCkkgYWxzbyBzaG91bGQgcHJvYmFibHkgaGF2ZSBhIGxpc3Qgb2YgdGhlIEhXRSBwdmFsIGZvciBlYWNoIGNvbG9ueS4KYGBge3J9CmRhdGEubWF0ICU+JQogIGFzLmRhdGEuZnJhbWUoKSAlPiUKICByb3duYW1lc190b19jb2x1bW4oKSAlPiUKICByZW5hbWUoTG9jdXMgPSByb3duYW1lKSAlPiUKICB3cml0ZV9jc3YoImNzdl9vdXRwdXRzL2xvY3VzX3BvcF9od2VfcHZhbC5jc3YiKQpgYGAKCkFuZCB0aGVuIG1ha2UgYSBkYXRhZnJhbWUgd2l0aCB0aGUgcGVyLWNvbG9ueSBoZXRlcm96eWdvc2l0aWVzIGFuZCBGSVM6CmBgYHtyfQpIcyA8LSBwb3BfSHMgJT4lCiAgcmVuYW1lKENoYWdfSHMgPSBDaGFndWxhaykgJT4lCiAgcmVuYW1lKFByaWJfSHMgPSBQcmliaWxvZikgJT4lCiAgcmVuYW1lKFNlbWlfSHMgPSBTZW1pZGkpICU+JQogIHJlbmFtZShTdE1hX0hzID0gU3RNYXR0aGV3KQoKRmlzIDwtIHBvcF9maXMgJT4lCiAgcmVuYW1lKENoYWdfRmlzID0gQ2hhZ3VsYWspICU+JQogIHJlbmFtZShQcmliX0ZpcyA9IFByaWJpbG9mKSAlPiUKICByZW5hbWUoU2VtaV9GaXMgPSBTZW1pZGkpICU+JQogIHJlbmFtZShTdE1hX0ZpcyA9IFN0TWF0dGhldykKICAKcG9wX0hvICU+JQogIHJlbmFtZShDaGFnX0hvID0gQ2hhZ3VsYWspICU+JQogIHJlbmFtZShQcmliX0hvID0gUHJpYmlsb2YpICU+JQogIHJlbmFtZShTZW1pX0hvID0gU2VtaWRpKSAlPiUKICByZW5hbWUoU3RNYV9IbyA9IFN0TWF0dGhldykgJT4lCiAgbGVmdF9qb2luKC4sIEhzKSAlPiUKICBsZWZ0X2pvaW4oLiwgRmlzKSAlPiUKICBzZWxlY3QobG9jdXMsIENoYWdfSG8sIENoYWdfSHMsIENoYWdfRmlzLCBQcmliX0hvLCBQcmliX0hzLCBQcmliX0ZpcywgU2VtaV9IbywgU2VtaV9IcywgU2VtaV9GaXMsIFN0TWFfSG8sIFN0TWFfSHMsIFN0TWFfRmlzKSAlPiUKICB3cml0ZV9jc3YoImNzdl9vdXRwdXRzL2NvbG9ueV9zdW1tYXJ5X3N0YXRzLmNzdiIpCgpgYGAKCgoK