Step 1 for bycatch data

Here, we will take a first stab at filtering the haplotype data from the bycatch birds and then assigning those samples to our colony baseline using rubias.

To start off with, let’s load data and libs:

library(tidyverse)
library(CKMRsim)
library(stringr)

meta <- readRDS("bycatch_data/meta-data-tibble.rds")
genos <- readRDS("bycatch_data/april_called_genos_na_explicit.rds") %>%
  filter(NMFS_DNA_ID %in% meta$NMFS_DNA_ID)  # drop those we don't have meta data for
samples <- readRDS("bycatch_data/sample-sheet-tibble.rds") %>%
  filter(NMFS_DNA_ID %in% meta$NMFS_DNA_ID)

# loci that deviate from HWE in the baseline populations
outliers <- read_csv("csv_outputs/loci_out_hwe.csv")
Parsed with column specification:
cols(
  Locus = col_character(),
  n = col_double()
)

Some initial filters

Take highest read-depth call for multiply-genotyped DNA_IDs

There shouldn’t be any of these. But just to make sure… Now, here is a harder operation: if an individual is multiply-genotyped, take the genotype with the highest total read depth.

# slow-ish function to get the total read depth column
tdepth <- function(a, d) {
  if(any(is.na(a))) {
    return(NA)
  }
  if(a[1]==a[2]) {
    return(d[1])
  } else {
    return(d[1] + d[2])
  }

}
# this takes the highest read-depth instance of each duplicately-genotyped individual.
geno_one_each <- genos %>%
  group_by(NMFS_DNA_ID, locus, gtseq_run, id) %>%
  mutate(total_depth = tdepth(allele, depth)) %>%
  ungroup() %>%
  arrange(NMFS_DNA_ID, locus, total_depth, gtseq_run, id, depth) %>%
  group_by(NMFS_DNA_ID, locus) %>%
  mutate(rank = 1:n()) %>%
  ungroup() %>%
  filter(rank <= 2)

Before tossing individuals with too much missing data, we want to implement our total read depth threshold.

# if total depth >= 20, print the allele; if not, print NA
geno_one_each2 <- geno_one_each %>%
   mutate(allele = ifelse(total_depth >=20, allele, NA))

Toss out indivs with more than 28 missing loci

no_hi_missers <- geno_one_each2 %>% 
  group_by(NMFS_DNA_ID) %>%
  filter(sum(!is.na(allele)) >= (113*2))

Remove the loci that significantly deviate from HWE in the baseline samples

I’ll remove all 8 loci

outliers$Locus <- gsub("_1", "", outliers$Locus)
# replace the weird characters in the locus name to match
no_hi_missers$locus <- gsub(":", "_", no_hi_missers$locus)
no_hi_missers$locus <- gsub("-", "_", no_hi_missers$locus)
# now anti-join the loci to remove and the no_hi_misser genotypes
no_hi_missers_loc <- no_hi_missers %>% 
  anti_join(., outliers, by = c("locus" = "Locus"))

# save that output
no_hi_missers_loc %>%
  write_csv("csv_outputs/bycatch_no_hi_missers_hwe.csv")
LS0tCnRpdGxlOiAiMDQtdGlkeS1ieWNhdGNoLWRhdGEiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KClN0ZXAgMSBmb3IgYnljYXRjaCBkYXRhCgpIZXJlLCB3ZSB3aWxsIHRha2UgYSBmaXJzdCBzdGFiIGF0IGZpbHRlcmluZyB0aGUgaGFwbG90eXBlIGRhdGEgZnJvbSB0aGUgYnljYXRjaCBiaXJkcyBhbmQgdGhlbiBhc3NpZ25pbmcgdGhvc2Ugc2FtcGxlcyB0byBvdXIgY29sb255IGJhc2VsaW5lIHVzaW5nIHJ1Ymlhcy4KClRvIHN0YXJ0IG9mZiB3aXRoLCBsZXQncyBsb2FkIGRhdGEgYW5kIGxpYnM6CmBgYHtyIGxvYWQtc3R1ZmZ9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KENLTVJzaW0pCmxpYnJhcnkoc3RyaW5ncikKCm1ldGEgPC0gcmVhZFJEUygiYnljYXRjaF9kYXRhL21ldGEtZGF0YS10aWJibGUucmRzIikKZ2Vub3MgPC0gcmVhZFJEUygiYnljYXRjaF9kYXRhL2FwcmlsX2NhbGxlZF9nZW5vc19uYV9leHBsaWNpdC5yZHMiKSAlPiUKICBmaWx0ZXIoTk1GU19ETkFfSUQgJWluJSBtZXRhJE5NRlNfRE5BX0lEKSAgIyBkcm9wIHRob3NlIHdlIGRvbid0IGhhdmUgbWV0YSBkYXRhIGZvcgpzYW1wbGVzIDwtIHJlYWRSRFMoImJ5Y2F0Y2hfZGF0YS9zYW1wbGUtc2hlZXQtdGliYmxlLnJkcyIpICU+JQogIGZpbHRlcihOTUZTX0ROQV9JRCAlaW4lIG1ldGEkTk1GU19ETkFfSUQpCgojIGxvY2kgdGhhdCBkZXZpYXRlIGZyb20gSFdFIGluIHRoZSBiYXNlbGluZSBwb3B1bGF0aW9ucwpvdXRsaWVycyA8LSByZWFkX2NzdigiY3N2X291dHB1dHMvbG9jaV9vdXRfaHdlLmNzdiIpCgpgYGAKCgojIyBTb21lIGluaXRpYWwgZmlsdGVycwoKIyMjIFRha2UgaGlnaGVzdCByZWFkLWRlcHRoIGNhbGwgZm9yIG11bHRpcGx5LWdlbm90eXBlZCBETkFfSURzCgpUaGVyZSBzaG91bGRuJ3QgYmUgYW55IG9mIHRoZXNlLiBCdXQganVzdCB0byBtYWtlIHN1cmUuLi4KTm93LCBoZXJlIGlzIGEgaGFyZGVyIG9wZXJhdGlvbjogaWYgYW4gaW5kaXZpZHVhbCBpcyBtdWx0aXBseS1nZW5vdHlwZWQsIHRha2UgdGhlCmdlbm90eXBlIHdpdGggdGhlIGhpZ2hlc3QgdG90YWwgcmVhZCBkZXB0aC4gCmBgYHtyIHRha2UtanVzdC1vbmV9CiMgc2xvdy1pc2ggZnVuY3Rpb24gdG8gZ2V0IHRoZSB0b3RhbCByZWFkIGRlcHRoIGNvbHVtbgp0ZGVwdGggPC0gZnVuY3Rpb24oYSwgZCkgewogIGlmKGFueShpcy5uYShhKSkpIHsKICAgIHJldHVybihOQSkKICB9CiAgaWYoYVsxXT09YVsyXSkgewogICAgcmV0dXJuKGRbMV0pCiAgfSBlbHNlIHsKICAgIHJldHVybihkWzFdICsgZFsyXSkKICB9Cgp9CiMgdGhpcyB0YWtlcyB0aGUgaGlnaGVzdCByZWFkLWRlcHRoIGluc3RhbmNlIG9mIGVhY2ggZHVwbGljYXRlbHktZ2Vub3R5cGVkIGluZGl2aWR1YWwuCmdlbm9fb25lX2VhY2ggPC0gZ2Vub3MgJT4lCiAgZ3JvdXBfYnkoTk1GU19ETkFfSUQsIGxvY3VzLCBndHNlcV9ydW4sIGlkKSAlPiUKICBtdXRhdGUodG90YWxfZGVwdGggPSB0ZGVwdGgoYWxsZWxlLCBkZXB0aCkpICU+JQogIHVuZ3JvdXAoKSAlPiUKICBhcnJhbmdlKE5NRlNfRE5BX0lELCBsb2N1cywgdG90YWxfZGVwdGgsIGd0c2VxX3J1biwgaWQsIGRlcHRoKSAlPiUKICBncm91cF9ieShOTUZTX0ROQV9JRCwgbG9jdXMpICU+JQogIG11dGF0ZShyYW5rID0gMTpuKCkpICU+JQogIHVuZ3JvdXAoKSAlPiUKICBmaWx0ZXIocmFuayA8PSAyKQoKYGBgCgpCZWZvcmUgdG9zc2luZyBpbmRpdmlkdWFscyB3aXRoIHRvbyBtdWNoIG1pc3NpbmcgZGF0YSwgd2Ugd2FudCB0byBpbXBsZW1lbnQgb3VyIHRvdGFsIHJlYWQgZGVwdGggdGhyZXNob2xkLgpgYGB7ciBmaWx0ZXItbG93LWRlcHRofQojIGlmIHRvdGFsIGRlcHRoID49IDIwLCBwcmludCB0aGUgYWxsZWxlOyBpZiBub3QsIHByaW50IE5BCmdlbm9fb25lX2VhY2gyIDwtIGdlbm9fb25lX2VhY2ggJT4lCiAgIG11dGF0ZShhbGxlbGUgPSBpZmVsc2UodG90YWxfZGVwdGggPj0yMCwgYWxsZWxlLCBOQSkpCgpgYGAKCgojIyMgVG9zcyBvdXQgaW5kaXZzIHdpdGggbW9yZSB0aGFuIDI4IG1pc3NpbmcgbG9jaQoKYGBge3IgdG9zcy1taXNzZXJzfQpub19oaV9taXNzZXJzIDwtIGdlbm9fb25lX2VhY2gyICU+JSAKICBncm91cF9ieShOTUZTX0ROQV9JRCkgJT4lCiAgZmlsdGVyKHN1bSghaXMubmEoYWxsZWxlKSkgPj0gKDExMyoyKSkKCmBgYAoKIyMgUmVtb3ZlIHRoZSBsb2NpIHRoYXQgc2lnbmlmaWNhbnRseSBkZXZpYXRlIGZyb20gSFdFIGluIHRoZSBiYXNlbGluZSBzYW1wbGVzCgpJJ2xsIHJlbW92ZSBhbGwgOCBsb2NpCmBgYHtyfQpvdXRsaWVycyRMb2N1cyA8LSBnc3ViKCJfMSIsICIiLCBvdXRsaWVycyRMb2N1cykKYGBgCgpgYGB7cn0KIyByZXBsYWNlIHRoZSB3ZWlyZCBjaGFyYWN0ZXJzIGluIHRoZSBsb2N1cyBuYW1lIHRvIG1hdGNoCm5vX2hpX21pc3NlcnMkbG9jdXMgPC0gZ3N1YigiOiIsICJfIiwgbm9faGlfbWlzc2VycyRsb2N1cykKbm9faGlfbWlzc2VycyRsb2N1cyA8LSBnc3ViKCItIiwgIl8iLCBub19oaV9taXNzZXJzJGxvY3VzKQpgYGAKCmBgYHtyfQojIG5vdyBhbnRpLWpvaW4gdGhlIGxvY2kgdG8gcmVtb3ZlIGFuZCB0aGUgbm9faGlfbWlzc2VyIGdlbm90eXBlcwpub19oaV9taXNzZXJzX2xvYyA8LSBub19oaV9taXNzZXJzICU+JSAKICBhbnRpX2pvaW4oLiwgb3V0bGllcnMsIGJ5ID0gYygibG9jdXMiID0gIkxvY3VzIikpCgojIHNhdmUgdGhhdCBvdXRwdXQKbm9faGlfbWlzc2Vyc19sb2MgJT4lCiAgd3JpdGVfY3N2KCJjc3Zfb3V0cHV0cy9ieWNhdGNoX25vX2hpX21pc3NlcnNfaHdlLmNzdiIpCmBgYAo=