########## UTexas Stacks ###########

### 8 Feb 2021 ###

### Create the working directories in one go. -p creates missing parents and
### makes re-running harmless. raw/ is included because the commands below copy
### each pool's FASTQ files into it and process_radtags reads from it (-p).
mkdir -p stacks_demultiplex/samples stacks_demultiplex/barcodes stacks_demultiplex/raw
 
### general demultiplexing script
### Demultiplex the reads currently in raw/ using the pool 1 barcode file
### (one option group per line; the flags are explained in the list below).
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool1.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null


### -T 16 = use 16 threads
### -p = path to directory
### -i input file type
### -o path to output
### -b path to barcodes file
### --renz_1 restriction enzyme 1
### --renz_2 restriction enzyme 2
### -r — rescue barcodes and RAD-Tags
### -c clean data, remove any read with an uncalled base (note: -c is not passed in the command above)
### -q discard reads with low quality scores
### -P files contained within the directory are paired
### --inline_null: barcode is inline with sequence, occurs only on single-end read (default).


### Things I tried to make this work: swapping out the enzymes; using -1 and -2 with R1 and R2 instead of -p /raw/; just the barcodes; barcodes and index in the index file; swapping the barcode and index column order; every --index/inline_inline/index combo for all three barcode combos; putting the 1st sequencing run pools in -p /raw/ with the 2nd run; different versions of the barcodes. ### ACK, it was just that I was using the wrong files...


### Master barcode table for all pools. cat is given no input file, so it reads
### from stdin; presumably the rows below were pasted in and ended with Ctrl-D.
### Columns: inline barcode, per-pool barcode, sample name.
cat > stacks_demultiplex/barcodes/UT_barcodes.txt

AACCA	CGTACG	RN_NEOR003
AAGGA	CGTACG	RN_CAB005
AATTA	CGTACG	RN_CO001
ACACA	CGTACG	RN_CAB007
ACGGT	CGTACG	RN_CO002
ACTGG	CGTACG	RN_CO004
ACTTC	CGTACG	RN_CO007
AGCTA	CGTACG	RN_CAB006
ATACG	CGTACG	RN_NM017
ATGAG	CGTACG	RN_NM018
CAACC	CGTACG	RN_CAB003
CGATC	CGTACG	RN_NEOR004
GCATG	CGTACG	RN_NEOR002
GGTTG	CGTACG	RN_CAB004
TCGAT	CGTACG	RN_NEOR007
TGCAT	CGTACG	RN_NEOR008
ATTAC	TAAGAA	RN_NM019
CATAT	TAAGAA	RN_SAB001
CGAAT	TAAGAA	RN_SAB002
CGGCT	TAAGAA	RN_SAB006
CGGTA	TAAGAA	RN_SAB017
CGTAC	TAAGAA	RN_SAB022
CGTCG	TAAGAA	RN_SAB029
CTGAT	TAAGAA	RN_SAB030
GCATG	TAAGAA	RN_SAB031
AACCA	TAAGAA	RN_SD001
CGATC	TAAGAA	RN_SD002
TCGAT	TAAGAA	RN_SD003
TGCAT	TAAGAA	RN_SD004
CAACC	TAAGAA	RN_SD005
GGTTG	TAAGAA	RN_SD006
AAGGA	TAAGAA	RN_UT001
AGCTA	ACGATT	RN_UT002
ACACA	ACGATT	RN_UT003
AATTA	ACGATT	RN_UT005
ACGGT	ACGATT	RN_NEWA004
ACTGG	ACGATT	RN_NEWA005
ACTTC	ACGATT	RN_MT003
ATACG	ACGATT	RN_MT004
ACGGT	CCGCGT	YB_CAB001
ACTGG	CCGCGT	YB_CAB006
ACTTC	CCGCGT	YB_CAB016
ATACG	CCGCGT	YB_CAB022
ATGAG	CCGCGT	YB_MI018
ATTAC	GCCGCG	YB_MI019
CATAT	GCCGCG	YB_MI020
CGAAT	GCCGCG	YB_MI021
CGGCT	GCCGCG	YB_MI022
CGGTA	GCCGCG	YB_MI023
CGTAC	GCCGCG	YB_ND001
CGTCG	GCCGCG	YB_ND002
CTGAT	GCCGCG	YB_ND003
GCATG	GCCGCG	YB_ND004
AACCA	GCCGCG	YB_ND005
CGATC	GCCGCG	YB_ND006
TCGAT	GCCGCG	YB_ND007
TGCAT	GCCGCG	YB_SK002
CAACC	GCCGCG	YB_SK003
GGTTG	GCCGCG	YB_SK007
AAGGA	GCCGCG	YB_SK008
AGCTA	GTATTA	YB_SK009
ACACA	GTATTA	YB_SK012
AATTA	GTATTA	YB_SK013
ACGGT	GTATTA	YB_SK014
ACTGG	GTATTA	RN_SEBC009
ACTTC	GTATTA	RNYSEBC001
ATACG	GTATTA	RNYSEBC002
ATGAG	GTATTA	RNYSEBC003
ATTAC	GTATTA	RNxYSAB001
CATAT	GTATTA	RNxYSAB002
CGAAT	GTATTA	RNxYSAB003
CGGCT	GTATTA	RNxYSAB004
CGGTA	GTATTA	RNxYSAB005
CGTAC	GTATTA	RNxYCAB025
CGTCG	GTATTA	RNxYCAB026
CTGAT	GTATTA	RNxYCAB033
AGCTA	TGCAAA	RNxYCAB034
ACACA	TGCAAA	RNxYCAB035
AATTA	TGCAAA	RNxYCAB036
ACGGT	TGCAAA	RNxYCAB037
ACTGG	TGCAAA	RNxYCAB038
ACTTC	TGCAAA	RNxYCAB039
ATACG	TGCAAA	RNxYCAB040
ATGAG	TGCAAA	RNxYCAB041

### It doesn't take them all at once: the 2nd column is in the header and corresponds to a different barcode per pool, and since each pool has its own file that column isn't really necessary. I took it out, then ran with --inline_null because the individual barcode in column 1 is inline.
### There were originally a bunch of RBSA files in here that we didn't include the barcodes for, so all of pool 4 is omitted, as are any missing samples from pools with fewer than 16 samples. This explains why some of the smaller pools have so many dropped barcodes: those reads are RBSA, and we didn't lose any data from the samples we wanted.

### Pool 1: empty raw/ (-f so an already-empty directory is not an error),
### copy in this pool's read files and decompress them. The process_radtags
### runs all use -i fastq, and every other pool is gunzipped after the same
### kind of copy, so the same step is needed here.
rm -f stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-1_S4_R* stacks_demultiplex/raw/
gunzip stacks_demultiplex/raw/*gz

### Pool 1 barcode file: inline barcode and sample name only (the per-pool
### barcode column is left out). cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool1.txt

AACCA	RN_NEOR003
AAGGA	RN_CAB005
AATTA	RN_CO001
ACACA	RN_CAB007
ACGGT	RN_CO002
ACTGG	RN_CO004
ACTTC	RN_CO007
AGCTA	RN_CAB006
ATACG	RN_NM017
ATGAG	RN_NM018
CAACC	RN_CAB003
CGATC	RN_NEOR004
GCATG	RN_NEOR002
GGTTG	RN_CAB004
TCGAT	RN_NEOR007
TGCAT	RN_NEOR008

*****
78139516 total sequences
  666484 barcode not found drops (0.9%)
  126361 low quality read drops (0.2%)
13573819 RAD cutsite not found drops (17.4%)
63772852 retained reads (81.6%)
*****

### Pool 2 barcode file: inline barcode and sample name.
### cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool2.txt

ATTAC	RN_NM019
CATAT	RN_SAB001
CGAAT	RN_SAB002
CGGCT	RN_SAB006
CGGTA	RN_SAB017
CGTAC	RN_SAB022
CGTCG	RN_SAB029
CTGAT	RN_SAB030
GCATG	RN_SAB031
AACCA	RN_SD001
CGATC	RN_SD002
TCGAT	RN_SD003
TGCAT	RN_SD004
CAACC	RN_SD005
GGTTG	RN_SD006
AAGGA	RN_UT001

### Pool 2: clear out the previous pool's reads first. process_radtags is
### pointed at the whole raw/ directory (-p), so leftover pool 1 files would be
### demultiplexed with the pool 2 barcodes. Every later pool does this rm too.
rm -f stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-2_S5_R* stacks_demultiplex/raw/
gunzip stacks_demultiplex/raw/*

### Same options as the general demultiplexing command; only -b changes.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool2.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null

*****
69357894 total sequences
  610230 barcode not found drops (0.9%)
  117529 low quality read drops (0.2%)
11649677 RAD cutsite not found drops (16.8%)
56980458 retained reads (82.2%)
*****

### Pool 3: replace the reads in raw/ with this pool's files and decompress them.
rm stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-3_S6_R* stacks_demultiplex/raw/
gzip -d stacks_demultiplex/raw/*
### Look at the first 10 lines of the R1 file.
head -n 10 stacks_demultiplex/raw/JA14436-ddRAD-Pool-3_S6_R1_001.fastq

### Pool 3 barcode file (inline barcode, sample name); cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool3.txt

AGCTA	RN_UT002
ACACA	RN_UT003
AATTA	RN_UT005
ACGGT	RN_NEWA004
ACTGG	RN_NEWA005
ACTTC	RN_MT003
ATACG	RN_MT004

### Demultiplex pool 3: same options as the other pools, only -b differs.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool3.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null

*****
62076194 total sequences
22599602 barcode not found drops (36.4%)
   70558 low quality read drops (0.1%)
 6789285 RAD cutsite not found drops (10.9%)
32616749 retained reads (52.5%)
*****

### Pool 4: replace the reads in raw/ with this pool's files and decompress them.
rm stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-4_S7_R* stacks_demultiplex/raw/
gzip -d stacks_demultiplex/raw/*
### Look at the first 10 lines of the R1 file.
head -n 10 stacks_demultiplex/raw/JA14436-ddRAD-Pool-4_S7_R1_001.fastq

### First attempt at a pool 4 barcode file (inline barcode, sample name); cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool4.txt

ACGGT	YB_CAB001
ACTGG	YB_CAB006
ACTTC	YB_CAB016
ATACG	YB_CAB022
ATGAG	YB_MI018

### Demultiplex pool 4: same options as the other pools, only -b differs.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool4.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null


### NOTE(review): this rewrites UT_barcodes_pool4.txt. The rows below are the
### same barcode/sample pairs as the pool 2 table, and no process_radtags
### output is recorded for pool 4 in these notes.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool4.txt

ATTAC	RN_NM019
CATAT	RN_SAB001
CGAAT	RN_SAB002
CGGCT	RN_SAB006
CGGTA	RN_SAB017
CGTAC	RN_SAB022
CGTCG	RN_SAB029
CTGAT	RN_SAB030
GCATG	RN_SAB031
AACCA	RN_SD001
CGATC	RN_SD002
TCGAT	RN_SD003
TGCAT	RN_SD004
CAACC	RN_SD005
GGTTG	RN_SD006
AAGGA	RN_UT001



### Pool 5: replace the reads in raw/ with this pool's files and decompress them.
rm stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-5_S8_R* stacks_demultiplex/raw/
gzip -d stacks_demultiplex/raw/*gz
### Look at the first 10 lines of the R1 file.
head -n 10 stacks_demultiplex/raw/JA14436-ddRAD-Pool-5_S8_R1_001.fastq

### Pool 5 barcode file (inline barcode, sample name); cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool5.txt
ACGGT	YB_CAB001
ACTGG	YB_CAB006
ACTTC	YB_CAB016
ATACG	YB_CAB022
ATGAG	YB_MI018

### Demultiplex pool 5: same options as the other pools, only -b differs.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool5.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null

*****
39351960 total sequences
27497484 barcode not found drops (69.9%)
   17633 low quality read drops (0.0%)
 2031629 RAD cutsite not found drops (5.2%)
 9805214 retained reads (24.9%)
*****

### Pool 6: replace the reads in raw/ with this pool's files and decompress them.
rm stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-6_S9_R* stacks_demultiplex/raw/
gzip -d stacks_demultiplex/raw/*gz
### Look at the first 10 lines of the R1 file.
head -n 10 stacks_demultiplex/raw/JA14436-ddRAD-Pool-6_S9_R1_001.fastq

### Pool 6 barcode file (inline barcode, sample name); cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool6.txt

ATTAC	YB_MI019
CATAT	YB_MI020
CGAAT	YB_MI021
CGGCT	YB_MI022
CGGTA	YB_MI023
CGTAC	YB_ND001
CGTCG	YB_ND002
CTGAT	YB_ND003
GCATG	YB_ND004
AACCA	YB_ND005
CGATC	YB_ND006
TCGAT	YB_ND007
TGCAT	YB_SK002
CAACC	YB_SK003
GGTTG	YB_SK007
AAGGA	YB_SK008

### Demultiplex pool 6: same options as the other pools, only -b differs.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool6.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null

*****
51930260 total sequences
  444468 barcode not found drops (0.9%)
   83267 low quality read drops (0.2%)
 8726894 RAD cutsite not found drops (16.8%)
42675631 retained reads (82.2%)
*****

### Pool 7: replace the reads in raw/ with this pool's files and decompress them.
rm stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-7_S10_R* stacks_demultiplex/raw/
gzip -d stacks_demultiplex/raw/*gz
### Look at the first 10 lines of the R1 file.
head -n 10 stacks_demultiplex/raw/JA14436-ddRAD-Pool-7_S10_R1_001.fastq

### Pool 7 barcode file (inline barcode, sample name); cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool7.txt

AGCTA	YB_SK009
ACACA	YB_SK012
AATTA	YB_SK013
ACGGT	YB_SK014
ACTGG	RN_SEBC009
ACTTC	RNYSEBC001
ATACG	RNYSEBC002
ATGAG	RNYSEBC003
ATTAC	RNxYSAB001
CATAT	RNxYSAB002
CGAAT	RNxYSAB003
CGGCT	RNxYSAB004
CGGTA	RNxYSAB005
CGTAC	RNxYCAB025
CGTCG	RNxYCAB026
CTGAT	RNxYCAB033

### Demultiplex pool 7: same options as the other pools, only -b differs.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool7.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null

*****
78945918 total sequences
  642428 barcode not found drops (0.8%)
  141994 low quality read drops (0.2%)
13500193 RAD cutsite not found drops (17.1%)
64661303 retained reads (81.9%)
*****

### Pool 8: replace the reads in raw/ with this pool's files and decompress them.
rm stacks_demultiplex/raw/*
cp final/JA14436-ddRAD-Pool-8_S11_R* stacks_demultiplex/raw/
gzip -d stacks_demultiplex/raw/*gz
### Look at the first 10 lines of the R1 file.
head -n 10 stacks_demultiplex/raw/JA14436-ddRAD-Pool-8_S11_R1_001.fastq

### Pool 8 barcode file (inline barcode, sample name); cat reads the rows below from stdin.
cat > stacks_demultiplex/barcodes/UT_barcodes_pool8.txt

AGCTA	RNxYCAB034
ACACA	RNxYCAB035
AATTA	RNxYCAB036
ACGGT	RNxYCAB037
ACTGG	RNxYCAB038
ACTTC	RNxYCAB039
ATACG	RNxYCAB040
ATGAG	RNxYCAB041

### Demultiplex pool 8: same options as the other pools, only -b differs.
process_radtags \
  -T 16 \
  -p ~/UT/stacks_demultiplex/raw \
  -i fastq \
  -o ~/UT/stacks_demultiplex/samples \
  -b ~/UT/stacks_demultiplex/barcodes/UT_barcodes_pool8.txt \
  --renz_1 nlaIII --renz_2 mluCI \
  -r -q -P --inline_null

*****
55523368 total sequences
  527714 barcode not found drops (1.0%)
  106409 low quality read drops (0.2%)
 9364199 RAD cutsite not found drops (16.9%)
45525046 retained reads (82.0%)
*****

### Okay dokay. Let's make the popmap

### Directory that holds all population maps used below.
mkdir stacks_demultiplex/popmaps
### Population map: sample name and population label, one sample per line.
### The labels used in the rows below are RNSA, YBSA and HYSA.
### cat is given no input file, so it reads the rows below from stdin.
cat > stacks_demultiplex/popmaps/popmap_UT.txt

RN_NEOR003	RNSA
RN_CAB005	RNSA
RN_CO001	RNSA
RN_CAB007	RNSA
RN_CO002	RNSA
RN_CO004	RNSA
RN_CO007	RNSA
RN_CAB006	RNSA
RN_NM017	RNSA
RN_NM018	RNSA
RN_CAB003	RNSA
RN_NEOR004	RNSA
RN_NEOR002	RNSA
RN_CAB004	RNSA
RN_NEOR007	RNSA
RN_NEOR008	RNSA
RN_NM019	RNSA
RN_SAB001	RNSA
RN_SAB002	RNSA
RN_SAB006	RNSA
RN_SAB017	RNSA
RN_SAB022	RNSA
RN_SAB029	RNSA
RN_SAB030	RNSA
RN_SAB031	RNSA
RN_SD001	RNSA
RN_SD002	RNSA
RN_SD003	RNSA
RN_SD004	RNSA
RN_SD005	RNSA
RN_SD006	RNSA
RN_UT001	RNSA
RN_UT002	RNSA
RN_UT003	RNSA
RN_UT005	RNSA
RN_NEWA004	RNSA
RN_NEWA005	RNSA
RN_MT003	RNSA
RN_MT004	RNSA
YB_CAB001	YBSA
YB_CAB006	YBSA
YB_CAB016	YBSA
YB_CAB022	YBSA
YB_MI018	YBSA
YB_MI019	YBSA
YB_MI020	YBSA
YB_MI021	YBSA
YB_MI022	YBSA
YB_MI023	YBSA
YB_ND001	YBSA
YB_ND002	YBSA
YB_ND003	YBSA
YB_ND004	YBSA
YB_ND005	YBSA
YB_ND006	YBSA
YB_ND007	YBSA
YB_SK002	YBSA
YB_SK003	YBSA
YB_SK007	YBSA
YB_SK008	YBSA
YB_SK009	YBSA
YB_SK012	YBSA
YB_SK013	YBSA
YB_SK014	YBSA
RN_SEBC009	RNSA
RNYSEBC001	HYSA
RNYSEBC002	HYSA
RNYSEBC003	HYSA
RNxYSAB001	HYSA
RNxYSAB002	HYSA
RNxYSAB003	HYSA
RNxYSAB004	HYSA
RNxYSAB005	HYSA
RNxYCAB025	HYSA
RNxYCAB026	HYSA
RNxYCAB033	HYSA
RNxYCAB034	HYSA
RNxYCAB035	HYSA
RNxYCAB036	HYSA
RNxYCAB037	HYSA
RNxYCAB038	HYSA
RNxYCAB039	HYSA
RNxYCAB040	HYSA
RNxYCAB041	HYSA

### make output directory
mkdir stacks_demultiplex/denovomap_UT

### WHAT I AM GOING TO DOOOO (on mank02)
### Open a screen session named denovo_map.pl, then launch the pipeline in it.
screen -S denovo_map.pl
### NOTE: the populations stage of this command is the one that reported
### "unrecognized option '--fst_correction'" (see the notes further down).
denovo_map.pl \
  --samples stacks_demultiplex/samples \
  --popmap stacks_demultiplex/popmaps/popmap_UT.txt \
  -X "ustacks: -m 10" \
  -o stacks_demultiplex/denovomap_UT \
  -T 16 \
  -X "populations: --min-maf 0.05 --fstats --fst_correction p_value --vcf --structure --plink --hzar --genepop"

### --samples [path] — specify a path to the directory of samples (samples will be read from population map).
### --popmap [path] — path to a population map file (format is "[name] TAB [pop]", one sample per line).
### -X — additional options for specific pipeline components, e.g. -X "populations: --min-maf 0.05".
### "ustacks: -m 10" -m — Minimum depth of coverage required to create a stack (default 3)
### -o — output path to write results.
### -T  — the number of threads/CPUs to use (default: 1).
### --min-maf [float] — specify a minimum minor allele frequency required to process a nucleotide site at a locus (0 < min_maf < 0.5).
### --fstats — enable SNP and haplotype-based F statistics.
### --fst_correction — specify a correction to be applied to Fst values: 'p_value', 'bonferroni_win', or 'bonferroni_gen'. Default: off.
### --vcf — output SNPs and haplotypes in Variant Call Format (VCF).
### --genepop — output results in GenePop format.
### --structure — output results in Structure format.
### --plink — output genotypes in PLINK format.
### --hzar — output genotypes in Hybrid Zone Analysis using R (HZAR) format.

*****
Genotyped 82048 loci:
  effective per-sample coverage: mean=31.9x, stdev=10.4x, min=19.2x, max=70.3x
  mean number of sites per locus: 146.3
  a consistent phasing was found for 12988 of out 15689 (82.8%) diploid loci needing phasing
*****

****/Linux/bin/populations: unrecognized option '--fst_correction'
### heck, I think there is a difference between this version and the manual I'm looking at, --fst_correction doesn't have the 'p_value' option?
### --fst-correction: specify a p-value correction to be applied to Fst values based on a Fisher's exact test. Default: off.
### I'll try it without the fst_correction because I don't know what value to use, and I don't think we need that?

### removed from the populations options: --fst_correction p_value

### Run populations on its own against the existing denovo_map output,
### with the same options as before minus --fst_correction.
populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT \
  -M stacks_demultiplex/popmaps/popmap_UT.txt \
  -t 16 \
  --min-maf 0.05 \
  --fstats --vcf --structure --plink --hzar --genepop


# Remove sites with more than 60% missing genotypes (note: --max-missing is the minimum fraction of genotyped samples, i.e. 1 minus the missing fraction you have in mind, so 60% missing -> 0.4):

### --max-missing 0.4 keeps sites genotyped in at least 40% of individuals.
### --out is a prefix; vcftools adds its own suffix for the recoded VCF.
vcftools \
  --vcf stacks_demultiplex/denovomap_UT/populations.snps.vcf \
  --max-missing 0.4 \
  --recode --recode-INFO-all \
  --out stacks_demultiplex/denovomap_UT/populations.snps.maxmiss60.vcf.idx

*****
Parameters as interpreted:
	--vcf stacks_demultiplex/denovomap_UT/populations.snps.vcf
	--recode-INFO-all
	--max-missing 0.4
	--out stacks_demultiplex/denovomap_UT/populations.snps.maxmiss60.vcf.idx
	--recode

After filtering, kept 84 out of 84 Individuals
Outputting VCF file...
After filtering, kept 127 out of a possible 35622 Sites
*****

### Looser filter: --max-missing 0.2 keeps sites genotyped in at least 20% of
### individuals (i.e. drops sites with more than 80% missing genotypes).
vcftools \
  --vcf stacks_demultiplex/denovomap_UT/populations.snps.vcf \
  --max-missing 0.2 \
  --recode --recode-INFO-all \
  --out stacks_demultiplex/denovomap_UT/populations.snps.maxmiss80.vcf.idx

*****
Parameters as interpreted:
	--vcf stacks_demultiplex/denovomap_UT/populations.snps.vcf
	--recode-INFO-all
	--max-missing 0.2
	--out stacks_demultiplex/denovomap_UT/populations.snps.maxmiss80.vcf.idx
	--recode

After filtering, kept 84 out of 84 Individuals
Outputting VCF file...
After filtering, kept 3166 out of a possible 35622 Sites
*****

### give AC VCF files, from her CC scratch/LN_filtered_vcfs/
### The remote glob is quoted so the local shell passes it through untouched
### (an unquoted * is an error in zsh or under nullglob when nothing matches locally).
rsync -av 'natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/populations.snps.maxmiss*' .

### i wonder if this has something to do with the different popmap groups not mapping, because it looked like mean number of sites per locus was high (146). I'll try it with just one pop.

### Population map that puts every sample in a single population ("Sap").
### cat is given no input file, so it reads the rows below from stdin.
cat > stacks_demultiplex/popmaps/popmap_UT_onepop.txt

RN_NEOR003	Sap
RN_CAB005	Sap
RN_CO001	Sap
RN_CAB007	Sap
RN_CO002	Sap
RN_CO004	Sap
RN_CO007	Sap
RN_CAB006	Sap
RN_NM017	Sap
RN_NM018	Sap
RN_CAB003	Sap
RN_NEOR004	Sap
RN_NEOR002	Sap
RN_CAB004	Sap
RN_NEOR007	Sap
RN_NEOR008	Sap
RN_NM019	Sap
RN_SAB001	Sap
RN_SAB002	Sap
RN_SAB006	Sap
RN_SAB017	Sap
RN_SAB022	Sap
RN_SAB029	Sap
RN_SAB030	Sap
RN_SAB031	Sap
RN_SD001	Sap
RN_SD002	Sap
RN_SD003	Sap
RN_SD004	Sap
RN_SD005	Sap
RN_SD006	Sap
RN_UT001	Sap
RN_UT002	Sap
RN_UT003	Sap
RN_UT005	Sap
RN_NEWA004	Sap
RN_NEWA005	Sap
RN_MT003	Sap
RN_MT004	Sap
YB_CAB001	Sap
YB_CAB006	Sap
YB_CAB016	Sap
YB_CAB022	Sap
YB_MI018	Sap
YB_MI019	Sap
YB_MI020	Sap
YB_MI021	Sap
YB_MI022	Sap
YB_MI023	Sap
YB_ND001	Sap
YB_ND002	Sap
YB_ND003	Sap
YB_ND004	Sap
YB_ND005	Sap
YB_ND006	Sap
YB_ND007	Sap
YB_SK002	Sap
YB_SK003	Sap
YB_SK007	Sap
YB_SK008	Sap
YB_SK009	Sap
YB_SK012	Sap
YB_SK013	Sap
YB_SK014	Sap
RN_SEBC009	Sap
RNYSEBC001	Sap
RNYSEBC002	Sap
RNYSEBC003	Sap
RNxYSAB001	Sap
RNxYSAB002	Sap
RNxYSAB003	Sap
RNxYSAB004	Sap
RNxYSAB005	Sap
RNxYCAB025	Sap
RNxYCAB026	Sap
RNxYCAB033	Sap
RNxYCAB034	Sap
RNxYCAB035	Sap
RNxYCAB036	Sap
RNxYCAB037	Sap
RNxYCAB038	Sap
RNxYCAB039	Sap
RNxYCAB040	Sap
RNxYCAB041	Sap

### Single-population run: same populations options as before, but with the
### one-population map and its own output directory.
mkdir stacks_demultiplex/denovomap_UT/onepop

populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop \
  -M stacks_demultiplex/popmaps/popmap_UT_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --fstats --vcf --structure --plink --hzar --genepop

# Remove sites with more than 60% missing genotypes (note: --max-missing is the minimum fraction of genotyped samples, i.e. 1 minus the missing fraction you have in mind, so 60% missing -> 0.4):

### Same 0.4 filter as before, applied to the single-population VCF.
vcftools \
  --vcf stacks_demultiplex/denovomap_UT/onepop/populations.snps.vcf \
  --max-missing 0.4 \
  --recode --recode-INFO-all \
  --out stacks_demultiplex/denovomap_UT/onepop/populations.snps.maxmiss60.vcf.idx

### okay that didn't help but it showed me the loci average 8.2 samples per locus, I'm going to add this filter
### -R,--min-samples-overall [float] — minimum percentage of individuals across populations required to process a locus.

### One population, loci required in at least 60% of individuals overall.
mkdir stacks_demultiplex/denovomap_UT/onepop/min60

populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/min60 \
  -M stacks_demultiplex/popmaps/popmap_UT_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 60 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 81959 loci that did not pass sample/population constraints from 82048 loci.
Kept 89 loci, composed of 13118 sites; 94 of those sites were filtered, 31 variant sites remained.
Mean genotyped sites per locus: 147.39bp (stderr 0.25).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Sap: 59.581 samples per locus; pi: 0.36077; all/variant/polymorphic sites: 13118/31/31; private alleles: 0
*****

### One population, loci required in at least 40% of individuals overall.
mkdir stacks_demultiplex/denovomap_UT/onepop/min40

populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/min40 \
  -M stacks_demultiplex/popmaps/popmap_UT_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 40 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 81701 loci that did not pass sample/population constraints from 82048 loci.
Kept 347 loci, composed of 51163 sites; 306 of those sites were filtered, 127 variant sites remained.
Mean genotyped sites per locus: 147.44bp (stderr 0.15).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Sap: 45.307 samples per locus; pi: 0.35837; all/variant/polymorphic sites: 51163/127/127; private alleles: 0
*****

### One population, loci required in at least 20% of individuals overall.
mkdir stacks_demultiplex/denovomap_UT/onepop/min20

populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/min20 \
  -M stacks_demultiplex/popmaps/popmap_UT_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 20 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 74900 loci that did not pass sample/population constraints from 82048 loci.
Kept 7148 loci, composed of 1047983 sites; 4002 of those sites were filtered, 3187 variant sites remained.
Mean genotyped sites per locus: 146.61bp (stderr 0.02).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Sap: 21.542 samples per locus; pi: 0.26893; all/variant/polymorphic sites: 1047983/3187/3187; private alleles: 0
*****

### give AC VCF files, from her CC scratch/LN_filtered_vcfs/
### No trailing slash on the sources: rsync then creates min60/, min40/ and
### min20/ under the current directory. With a trailing slash all three copies
### land directly in "." and the identically named populations.* files from
### each run overwrite one another.
rsync -av natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min60 .
rsync -av natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min40 .
rsync -av natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min20 .


### Trying it by removing all samples whose .snps.tsv files are < 1MB

### One-population map restricted to the samples kept after the file-size cut
### (65 rows below, all labelled "Sap"). cat reads the rows from stdin.
cat > stacks_demultiplex/popmaps/popmap_UT_onepop_bigfiles.txt

RN_SD005	Sap
RN_CO004	Sap
RN_UT003	Sap
YB_MI023	Sap
YB_SK007	Sap
RNxYSAB002	Sap
RN_UT005	Sap
RNxYCAB036	Sap
RN_UT002	Sap
RNxYCAB025	Sap
RNxYCAB038	Sap
RN_NEOR007	Sap
RNYSEBC002	Sap
RN_UT001	Sap
RN_NEOR008	Sap
RNxYCAB034	Sap
RN_CAB005	Sap
RNxYCAB040	Sap
RNxYSAB005	Sap
RNxYCAB037	Sap
YB_SK002	Sap
RNxYSAB004	Sap
YB_ND002	Sap
RNxYCAB039	Sap
RN_SD004	Sap
YB_MI018	Sap
RNxYCAB033	Sap
YB_MI020	Sap
YB_ND003	Sap
RN_CAB004	Sap
RN_NEOR003	Sap
RN_MT004	Sap
RN_NEOR004	Sap
RN_SAB029	Sap
YB_ND007	Sap
RN_SAB031	Sap
RN_CAB003	Sap
RN_SD006	Sap
RN_SAB017	Sap
RN_SD002	Sap
RN_CO002	Sap
RN_SAB006	Sap
RN_NM019	Sap
RN_CO007	Sap
YB_ND004	Sap
RN_SAB022	Sap
RN_SAB030	Sap
RN_NEOR002	Sap
RNxYCAB026	Sap
YB_SK012	Sap
RN_SAB002	Sap
RNxYCAB035	Sap
RN_SEBC009	Sap
YB_MI019	Sap
RN_NEWA005	Sap
RN_MT003	Sap
YB_MI021	Sap
YB_SK013	Sap
RNxYSAB001	Sap
RNxYSAB003	Sap
RN_CO001	Sap
RN_SD001	Sap
RN_NEWA004	Sap
YB_SK009	Sap
YB_SK003	Sap

### -p is needed here: the parent onepop/bigfiles has not been created yet,
### so a plain mkdir of bigfiles/min60 fails.
mkdir -p stacks_demultiplex/denovomap_UT/onepop/bigfiles/min60

### One population, reduced sample set, loci required in at least 60% of individuals.
populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/bigfiles/min60 \
  -M stacks_demultiplex/popmaps/popmap_UT_onepop_bigfiles.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 60 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 81865 loci that did not pass sample/population constraints from 82048 loci.
Kept 183 loci, composed of 27015 sites; 181 of those sites were filtered, 72 variant sites remained.
Mean genotyped sites per locus: 147.62bp (stderr 0.21).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Sap: 47.333 samples per locus; pi: 0.36532; all/variant/polymorphic sites: 27015/72/72; private alleles: 0
*****

### checking if I lose fewer alleles when I look at alleles within pools

### Population map that uses the sequencing pool (Pool1 ... Pool8) as the
### population label for each sample. cat reads the rows below from stdin.
cat > stacks_demultiplex/popmaps/popmap_UT_poolpops.txt

RN_NEOR003	Pool1
RN_CAB005	Pool1
RN_CO001	Pool1
RN_CAB007	Pool1
RN_CO002	Pool1
RN_CO004	Pool1
RN_CO007	Pool1
RN_CAB006	Pool1
RN_NM017	Pool1
RN_NM018	Pool1
RN_CAB003	Pool1
RN_NEOR004	Pool1
RN_NEOR002	Pool1
RN_CAB004	Pool1
RN_NEOR007	Pool1
RN_NEOR008	Pool1
RN_NM019	Pool2
RN_SAB001	Pool2
RN_SAB002	Pool2
RN_SAB006	Pool2
RN_SAB017	Pool2
RN_SAB022	Pool2
RN_SAB029	Pool2
RN_SAB030	Pool2
RN_SAB031	Pool2
RN_SD001	Pool2
RN_SD002	Pool2
RN_SD003	Pool2
RN_SD004	Pool2
RN_SD005	Pool2
RN_SD006	Pool2
RN_UT001	Pool2
RN_UT002	Pool3
RN_UT003	Pool3
RN_UT005	Pool3
RN_NEWA004	Pool3
RN_NEWA005	Pool3
RN_MT003	Pool3
RN_MT004	Pool3
YB_CAB001	Pool5
YB_CAB006	Pool5
YB_CAB016	Pool5
YB_CAB022	Pool5
YB_MI018	Pool5
YB_MI019	Pool6
YB_MI020	Pool6
YB_MI021	Pool6
YB_MI022	Pool6
YB_MI023	Pool6
YB_ND001	Pool6
YB_ND002	Pool6
YB_ND003	Pool6
YB_ND004	Pool6
YB_ND005	Pool6
YB_ND006	Pool6
YB_ND007	Pool6
YB_SK002	Pool6
YB_SK003	Pool6
YB_SK007	Pool6
YB_SK008	Pool6
YB_SK009	Pool7
YB_SK012	Pool7
YB_SK013	Pool7
YB_SK014	Pool7
RN_SEBC009	Pool7
RNYSEBC001	Pool7
RNYSEBC002	Pool7
RNYSEBC003	Pool7
RNxYSAB001	Pool7
RNxYSAB002	Pool7
RNxYSAB003	Pool7
RNxYSAB004	Pool7
RNxYSAB005	Pool7
RNxYCAB025	Pool7
RNxYCAB026	Pool7
RNxYCAB033	Pool7
RNxYCAB034	Pool8
RNxYCAB035	Pool8
RNxYCAB036	Pool8
RNxYCAB037	Pool8
RNxYCAB038	Pool8
RNxYCAB039	Pool8
RNxYCAB040	Pool8
RNxYCAB041	Pool8

### Per-pool run. Output now goes to the poolpop directory created here; the
### earlier command pointed -O at onepop/bigfiles/min60, which left poolpop
### empty and wrote over the bigfiles results.
mkdir -p stacks_demultiplex/denovomap_UT/poolpop

populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/poolpop \
  -M stacks_demultiplex/popmaps/popmap_UT_poolpops.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-per-pop 60 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 76302 loci that did not pass sample/population constraints from 82048 loci.
Kept 5746 loci, composed of 842244 sites; 364 of those sites were filtered, 1979 variant sites remained.
Mean genotyped sites per locus: 146.57bp (stderr 0.03).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  Pool1: 11.51 samples per locus; pi: 0.38551; all/variant/polymorphic sites: 18895/49/46; private alleles: 0
  Pool2: 11.898 samples per locus; pi: 0.37315; all/variant/polymorphic sites: 19140/49/47; private alleles: 3
  Pool3: 5.2919 samples per locus; pi: 0.29859; all/variant/polymorphic sites: 790209/1747/1686; private alleles: 17
  Pool5: 3.5385 samples per locus; pi: 0.30412; all/variant/polymorphic sites: 9297/26/15; private alleles: 0
  Pool6: 11.057 samples per locus; pi: 0.31123; all/variant/polymorphic sites: 11214/35/25; private alleles: 0
  Pool7: 11.431 samples per locus; pi: 0.38845; all/variant/polymorphic sites: 21974/58/55; private alleles: 2
  Pool8: 5.2513 samples per locus; pi: 0.32053; all/variant/polymorphic sites: 107497/378/359; private alleles: 59

47.33/84 (all) = 0.56 (note: the 47.33 comes from the bigfiles run, whose popmap has 65 samples, so 47.33/65 = 0.73)
1 11.51/16 = 0.72
2 11.898/16 = 0.74
3 5.2919/7 = 0.7557
5 3.5385/5 = 0.7077
6 11.057/16 = 0.691
7 11.431/16 = 0.714
8 5.2513/8 = 0.656

So all together has a much lower mean # samples/locus than per pool, I think maybe the pools are not super comparable and that's where a lot of the drop out is coming from.
*****

### let's turn those vcfs into 012NA files so I can run the PCAs and see if the patterns hold with lower quality data

# Convert to tab file in 012NA format (run two commands below):
### For each threshold (60, 40, 20 min): write the 012 genotype matrix with
### vcftools, then replace the -1 missing-genotype code with NA.
for pct in 60 40 20; do
  out_dir=stacks_demultiplex/denovomap_UT/onepop/min${pct}
  vcftools --vcf "${out_dir}/populations.snps.vcf" --012 --out "${out_dir}/populations.snps.min${pct}.tab"

  sed 's/-1/NA/g' "${out_dir}/populations.snps.min${pct}.tab.012" > "${out_dir}/populations.snps.min${pct}.tab.012NA"
done

### Download the .012* files for each threshold. The remote globs are quoted
### so the local shell passes them through untouched (an unquoted * is an
### error in zsh or under nullglob when nothing matches locally).
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min60/populations.snps.min60.tab.012*' /Users/libbynatola/Desktop/UofLsnps/
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min40/populations.snps.min40.tab.012*' /Users/libbynatola/Desktop/UofLsnps/
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min20/populations.snps.min20.tab.012*' /Users/libbynatola/Desktop/UofLsnps/

### should I have used the paired flag?

### Second pipeline run, this time with --paired and the one-population map.
### NOTE(review): -o is the same directory as the first denovo_map.pl run,
### so this run writes into the earlier output directory.
screen -S denovo_map.pl
denovo_map.pl \
  --samples stacks_demultiplex/samples \
  --popmap stacks_demultiplex/popmaps/popmap_UT_onepop.txt \
  --paired \
  -X "ustacks: -m 10" \
  -o stacks_demultiplex/denovomap_UT \
  -T 16 \
  -X "populations: --min-maf 0.05 --fstats --min-samples-per-pop 40 --vcf --structure --plink --hzar --genepop"

### Fetch the VCF written by the paired run into the current directory.
rsync --archive --verbose natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/populations.snps.vcf .

### Not sure if I ever reran the filtering after I reran ustacks with paired flag
### also have to fix the popmap file

### Corrected three-population map; this overwrites the popmap_UT.txt written
### earlier. Compared with that version, the rows below differ at least for
### YB_CAB016 and YB_CAB022 (now HYSA) and RNxYCAB026 (now YBSA).
### cat is given no input file, so it reads the rows below from stdin.
cat > stacks_demultiplex/popmaps/popmap_UT.txt

RN_NEOR003	RNSA
RN_CAB005	RNSA
RN_CO001	RNSA
RN_CAB007	RNSA
RN_CO002	RNSA
RN_CO004	RNSA
RN_CO007	RNSA
RN_CAB006	RNSA
RN_NM017	RNSA
RN_NM018	RNSA
RN_CAB003	RNSA
RN_NEOR004	RNSA
RN_NEOR002	RNSA
RN_CAB004	RNSA
RN_NEOR007	RNSA
RN_NEOR008	RNSA
RN_NM019	RNSA
RN_SAB001	RNSA
RN_SAB002	RNSA
RN_SAB006	RNSA
RN_SAB017	RNSA
RN_SAB022	RNSA
RN_SAB029	RNSA
RN_SAB030	RNSA
RN_SAB031	RNSA
RN_SD001	RNSA
RN_SD002	RNSA
RN_SD003	RNSA
RN_SD004	RNSA
RN_SD005	RNSA
RN_SD006	RNSA
RN_UT001	RNSA
RN_UT002	RNSA
RN_UT003	RNSA
RN_UT005	RNSA
RN_NEWA004	RNSA
RN_NEWA005	RNSA
RN_MT003	RNSA
RN_MT004	RNSA
YB_CAB001	YBSA
YB_CAB006	YBSA
YB_CAB016	HYSA
YB_CAB022	HYSA
YB_MI018	YBSA
YB_MI019	YBSA
YB_MI020	YBSA
YB_MI021	YBSA
YB_MI022	YBSA
YB_MI023	YBSA
YB_ND001	YBSA
YB_ND002	YBSA
YB_ND003	YBSA
YB_ND004	YBSA
YB_ND005	YBSA
YB_ND006	YBSA
YB_ND007	YBSA
YB_SK002	YBSA
YB_SK003	YBSA
YB_SK007	YBSA
YB_SK008	YBSA
YB_SK009	YBSA
YB_SK012	YBSA
YB_SK013	YBSA
YB_SK014	YBSA
RN_SEBC009	RNSA
RNYSEBC001	HYSA
RNYSEBC002	HYSA
RNYSEBC003	HYSA
RNxYSAB001	HYSA
RNxYSAB002	HYSA
RNxYSAB003	HYSA
RNxYSAB004	HYSA
RNxYSAB005	HYSA
RNxYCAB025	HYSA
RNxYCAB026	YBSA
RNxYCAB033	HYSA
RNxYCAB034	HYSA
RNxYCAB035	HYSA
RNxYCAB036	HYSA
RNxYCAB037	HYSA
RNxYCAB038	HYSA
RNxYCAB039	HYSA
RNxYCAB040	HYSA
RNxYCAB041	HYSA


### Three-population map, loci required in at least 60% of individuals overall.
### NOTE(review): -O reuses onepop/min60 from the earlier single-population
### run, so files with the same names there are presumably overwritten.
populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/min60 \
  -M stacks_demultiplex/popmaps/popmap_UT.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 60 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 80736 loci that did not pass sample/population constraints from 80819 loci.
Kept 83 loci, composed of 26953 sites; 252 of those sites were filtered, 42 variant sites remained.
Number of loci with PE contig: 83.00 (100.0%);
  Mean length of loci: 314.73bp (stderr 4.72);
Number of loci with SE/PE overlap: 14.00 (16.9%);
  Mean length of overlapping loci: 314.29bp (stderr 8.77); mean overlap: 24.86bp (stderr 0.91);
Mean genotyped sites per locus: 316.57bp (stderr 4.69).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  RNSA: 29.238 samples per locus; pi: 0.33695; all/variant/polymorphic sites: 26275/42/40; private alleles: 0
  YBSA: 15.119 samples per locus; pi: 0.30282; all/variant/polymorphic sites: 26275/42/38; private alleles: 0
  HYSA: 13.667 samples per locus; pi: 0.36746; all/variant/polymorphic sites: 26275/42/42; private alleles: 0

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  RNSA-YBSA: mean Fst: 0.13321; mean Phi_st: 0.14267; mean Fst': 0.10728; mean Dxy: 0.0018181
  RNSA-HYSA: mean Fst: 0.02279; mean Phi_st: 0.0037068; mean Fst': -0.0048693; mean Dxy: 0.0015801
  YBSA-HYSA: mean Fst: 0.080634; mean Phi_st: 0.077066; mean Fst': 0.05214; mean Dxy: 0.0016639
*****


### Three-population map, loci required in at least 40% of individuals overall.
### NOTE(review): -O reuses onepop/min40 from the earlier single-population
### run, so files with the same names there are presumably overwritten.
populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/min40 \
  -M stacks_demultiplex/popmaps/popmap_UT.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 40 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 80493 loci that did not pass sample/population constraints from 80819 loci.
Kept 326 loci, composed of 106333 sites; 918 of those sites were filtered, 205 variant sites remained.
Number of loci with PE contig: 326.00 (100.0%);
  Mean length of loci: 316.17bp (stderr 2.21);
Number of loci with SE/PE overlap: 53.00 (16.3%);
  Mean length of overlapping loci: 321.43bp (stderr 3.97); mean overlap: 26.70bp (stderr 0.45);
Mean genotyped sites per locus: 317.77bp (stderr 2.19).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  RNSA: 22.776 samples per locus; pi: 0.30641; all/variant/polymorphic sites: 103594/205/186; private alleles: 1
  YBSA: 9.5317 samples per locus; pi: 0.31291; all/variant/polymorphic sites: 103594/205/181; private alleles: 3
  HYSA: 9.9024 samples per locus; pi: 0.34458; all/variant/polymorphic sites: 103594/205/190; private alleles: 0

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  RNSA-YBSA: mean Fst: 0.13737; mean Phi_st: 0.15121; mean Fst': 0.13081; mean Dxy: 0.0024678
  RNSA-HYSA: mean Fst: 0.028332; mean Phi_st: 0.012288; mean Fst': 0.00045702; mean Dxy: 0.0020349
  YBSA-HYSA: mean Fst: 0.10076; mean Phi_st: 0.085447; mean Fst': 0.081936; mean Dxy: 0.0023574
  *****

### Three-population map, loci required in at least 20% of individuals overall.
### NOTE(review): -O reuses onepop/min20 from the earlier single-population
### run, so files with the same names there are presumably overwritten.
populations \
  -P stacks_demultiplex/denovomap_UT \
  -O stacks_demultiplex/denovomap_UT/onepop/min20 \
  -M stacks_demultiplex/popmaps/popmap_UT.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 20 \
  --fstats --vcf --structure --plink --hzar --genepop

*****
Removed 73743 loci that did not pass sample/population constraints from 80819 loci.
Kept 7076 loci, composed of 2232493 sites; 11353 of those sites were filtered, 9357 variant sites remained.
Number of loci with PE contig: 7076.00 (100.0%);
  Mean length of loci: 305.50bp (stderr 0.24);
Number of loci with SE/PE overlap: 206.00 (2.9%);
  Mean length of overlapping loci: 308.81bp (stderr 0.72); mean overlap: 18.43bp (stderr 0.18);
Mean genotyped sites per locus: 305.79bp (stderr 0.24).

Population summary statistics (more detail in populations.sumstats_summary.tsv):
  RNSA: 12.115 samples per locus; pi: 0.21459; all/variant/polymorphic sites: 2163747/9357/7262; private alleles: 532
  YBSA: 3.5636 samples per locus; pi: 0.31036; all/variant/polymorphic sites: 2118500/9180/6016; private alleles: 567
  HYSA: 5.6557 samples per locus; pi: 0.28812; all/variant/polymorphic sites: 2163447/9354/7541; private alleles: 254

Population pair divergence statistics (more in populations.fst_summary.tsv and populations.phistats_summary.tsv):
  RNSA-YBSA: mean Fst: 0.17785; mean Phi_st: 0.26542; mean Fst': 0.27022; mean Dxy: 0.00269
  RNSA-HYSA: mean Fst: 0.051887; mean Phi_st: 0.041213; mean Fst': 0.02263; mean Dxy: 0.0019255
  YBSA-HYSA: mean Fst: 0.14282; mean Phi_st: 0.13566; mean Fst': 0.16369; mean Dxy: 0.0026473
*****

### give AC VCF files, from her CC scratch/LN_filtered_vcfs/
### No trailing slash on the source: each run is copied into its own
### ./min60, ./min40 and ./min20 directory. With a trailing slash the contents
### of all three would be merged into "." and the identically named
### populations.* files from one run would overwrite those of the previous one.
rsync -av natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min60 .
rsync -av natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min40 .
rsync -av natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_UT/onepop/min20 .







###### CORNELL FILES #######
### Ashley already demultiplexed these with the following code:
### NOTE: the -b path was recorded without its leading "/" (home/aijc911/...),
### which only resolves when run from "/"; written here as an absolute path
### like the -p and -o paths on the same command.
process_radtags \
  -p /home/aijc911/projects/def-tburg/aijc911/ \
  -i gzfastq \
  -b /home/aijc911/scratch/cornellprocrad/cornellbarcodes.txt \
  -o /home/aijc911/projects/def-tburg/aijc911/cornellprocrad/ \
  -q -r --inline_null \
  -e pstI \
  -y fastq

### I transferred her files from CC to the zoology cluster because CC IS THE DEVIL
### (the working directory "./" here is /home/aijc911/project/aijc911/cornellprocrad, btw)
rsync --archive --verbose ./* natola@zoology.ubc.ca:flex/UT/cornell

### make the Cornell output directory
mkdir stacks_demultiplex/denovomap_C

### put away AC's popmap file
mv sapsC_popmap.txt stacks_demultiplex/popmaps/

### Now I can use her script (on mank03): only the paths change and the thread
### count is doubled, otherwise identical. Also the same settings as the UT set above.
screen -S denovo_map.pl_Cornell
denovo_map.pl \
  --samples cornell/ \
  --popmap stacks_demultiplex/popmaps/sapsC_popmap.txt \
  -X "ustacks: -m 10" \
  -o stacks_demultiplex/denovomap_C \
  -T 16 \
  -X "populations: --min-maf 0.05 --fstats --vcf --structure --plink --hzar --genepop"

rsync --archive --verbose natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/ .

### Looking at `stacks-dist-extract populations.log.distribs samples_per_loc_postfilters`: it didn't filter out loci that were not well sampled. Ack! I forgot to add the --min-samples-overall flag, dagnabit

### Re-run populations on the Cornell catalog, this time with --min-samples-overall (60)
populations \
  -P stacks_demultiplex/denovomap_C \
  -O stacks_demultiplex/denovomap_C/min60/ \
  -M stacks_demultiplex/popmaps/sapsC_popmap.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 60 \
  --fstats --vcf --structure --plink --hzar --genepop

stacks-dist-extract \
  stacks_demultiplex/denovomap_C/min60/populations.log.distribs \
  samples_per_loc_postfilters

### Same again with --min-samples-overall 40
populations \
  -P stacks_demultiplex/denovomap_C \
  -O stacks_demultiplex/denovomap_C/min40/ \
  -M stacks_demultiplex/popmaps/sapsC_popmap.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 40 \
  --fstats --vcf --structure --plink --hzar --genepop

stacks-dist-extract \
  stacks_demultiplex/denovomap_C/min40/populations.log.distribs \
  samples_per_loc_postfilters
rsync --archive --verbose natola@zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/min40 .

### want to try one pop
### Write the one-population popmap (sample<TAB>population, every sample in "Sap").
### A quoted heredoc is used instead of a bare `cat > file` so the step can be
### replayed non-interactively and nothing but the sample lines ends up in the file.
mkdir -p stacks_demultiplex/popmaps
cat > stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt <<'EOF'
YBSA_NWBC001	Sap
YBSA_NWBC002	Sap
YBSA_NWBC003	Sap
YBSA_NWBC004	Sap
YBSA_NWBC005	Sap
YBSA_CAB002	Sap
YBSA_CAB005	Sap
YBSA_CAB013	Sap
YBSA_CAB021	Sap
YBSA_CAB022	Sap
YBSA_CAB025	Sap
YBSA_CAB027	Sap
YBSA_CAB028	Sap
YBSA_SK001	Sap
YBSA_SK004	Sap
YBSA_SK010	Sap
YBSA_SK011	Sap
YBSA_NC003	Sap
YBSA_NC004	Sap
RNSAXYBSA_CAB056	Sap
RNSA_CAB002	Sap
RNSA_CAB003	Sap
RNSA_CAB004	Sap
RNSA_UT001	Sap
RNSA_UT002	Sap
RNSA_UT004	Sap
RNSA_UT005	Sap
RNSA_UT007	Sap
RNSA_SAB003	Sap
RNSA_SAB016	Sap
RNSA_SAB023	Sap
RNSA_SAB024	Sap
RNSA_SAB026	Sap
RNSA_CO001	Sap
RNSA_CO002	Sap
RNSA_CO003	Sap
RNSA_CO005	Sap
RNSA_CO006	Sap
RNSA_NEOR001	Sap
RNSA_NEOR005	Sap
RNSA_NEOR006	Sap
RNSA_NEOR007	Sap
RNSA_NEOR008	Sap
RNSA_NEOR009	Sap
RNSA_MT001	Sap
RNSA_MT002	Sap
RNSA_WA001	Sap
RNSA_WA003	Sap
RNSA_WA004	Sap
RNSA_WA006	Sap
RNSA_NM003	Sap
RNSA_NM008	Sap
RNSA_WY003	Sap
RNSAXYBSA_CAB001	Sap
RNSAXYBSA_CAB002	Sap
RNSAXYBSA_CAB005	Sap
RNSAXYBSA_CAB009	Sap
RNSAXYBSA_CAB011	Sap
RNSAXYBSA_CAB014	Sap
RNSAXYBSA_CAB018	Sap
RNSAXYBSA_CAB024	Sap
RNSAXYBSA_CAB030	Sap
RNSAXYBSA_CAB023	Sap
EOF

### populations with the one-pop popmap; output goes to min60/onepop
populations \
  -P stacks_demultiplex/denovomap_C \
  -O stacks_demultiplex/denovomap_C/min60/onepop \
  -M stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 60 \
  --fstats --vcf --structure --plink --hzar --genepop

### Inspect the log of the run above: it was written to min60/onepop, not min60/
### (the min60/ log belongs to the earlier run with the original popmap).
stacks-dist-extract stacks_demultiplex/denovomap_C/min60/onepop/populations.log.distribs samples_per_loc_postfilters

### NOTE(review): this converts the VCF in min60/ (original popmap run), not the
### min60/onepop VCF produced above -- confirm which one was intended.
vcftools --vcf stacks_demultiplex/denovomap_C/min60/populations.snps.vcf --012 --out stacks_demultiplex/denovomap_C/min60/populations.snps.cornell.min60.tab


# Convert to tab file in 012NA format (run two commands below):
### 60 min
vcftools \
  --vcf stacks_demultiplex/denovomap_C/min60/populations.snps.vcf \
  --012 \
  --out stacks_demultiplex/denovomap_C/min60/populations.snps.cornell.min60.tab

### replace every -1 in the .012 matrix with NA
sed 's/-1/NA/g' \
  stacks_demultiplex/denovomap_C/min60/populations.snps.cornell.min60.tab.012 \
  > stacks_demultiplex/denovomap_C/min60/populations.snps.cornell.min60.tab.012NA

### 40 min
vcftools \
  --vcf stacks_demultiplex/denovomap_C/min40/populations.snps.vcf \
  --012 \
  --out stacks_demultiplex/denovomap_C/min40/populations.snps.cornell.min40.tab

sed 's/-1/NA/g' \
  stacks_demultiplex/denovomap_C/min40/populations.snps.cornell.min40.tab.012 \
  > stacks_demultiplex/denovomap_C/min40/populations.snps.cornell.min40.tab.012NA


### Remote specs are single-quoted so the * is expanded on the remote host
### rather than being globbed (or rejected as "no matches") by the local shell.
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/min40/populations.snps.cornell.min40.tab*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/min60/populations.snps.cornell.min60.tab*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/


### the PCA makes no sense. There are 3 groups but they don't correspond to anything. I'm going to try again with the whole pipeline run on the one pop...

### Re-run the pipeline with the one-population popmap.
### NOTE(review): this reuses the screen session name of the first Cornell run.
screen -S denovo_map.pl_Cornell
denovo_map.pl --samples cornell/ --popmap stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt -X "ustacks: -m 10" -o stacks_demultiplex/denovomap_C/onepop -T 16 -X "populations: --min-maf 0.05 --fstats --min-samples-overall 60 --vcf --structure --plink --hzar --genepop"
### The two steps below point -P at denovomap_C/ (not denovomap_C/onepop) --
### presumably cstacks/tsv2bam were re-run by hand on the existing output there; TODO confirm.
cstacks -P stacks_demultiplex/denovomap_C/ -M stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt -p 16
tsv2bam -P stacks_demultiplex/denovomap_C/ -M stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt -t 16
### NOTE(review): the next line uses placeholder-looking paths (./stacks/, ./popmap,
### ./samples) not used anywhere else in this section -- looks like a usage
### template rather than a command that was actually run; confirm before replaying.
tsv2bam -P ./stacks/ -M ./popmap -R ./samples -t 8


### populations with the one-pop popmap on denovomap_C/; output goes to denovomap_C/onepop
populations \
  -P stacks_demultiplex/denovomap_C/ \
  -O stacks_demultiplex/denovomap_C/onepop \
  -M stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 60 \
  --fstats --vcf --structure --plink --hzar --genepop

### 60 min
vcftools \
  --vcf stacks_demultiplex/denovomap_C/onepop/populations.snps.vcf \
  --012 \
  --out stacks_demultiplex/denovomap_C/onepop/populations.snps.cornell.min60.onepop.tab

### replace every -1 in the .012 matrix with NA (sed reads the file directly, no cat needed)
sed 's/-1/NA/g' \
  stacks_demultiplex/denovomap_C/onepop/populations.snps.cornell.min60.onepop.tab.012 \
  > stacks_demultiplex/denovomap_C/onepop/populations.snps.cornell.min60.onepop.tab.012NA

### remote spec is quoted so the * is expanded on the remote host, not by the local shell
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/onepop/populations.snps.cornell.min60.onepop.tab.*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/

#### Super filter
### mkdir -p so the step can be replayed when the directory already exists
mkdir -p stacks_demultiplex/denovomap_C/onepop/90
populations \
  -P stacks_demultiplex/denovomap_C/ \
  -O stacks_demultiplex/denovomap_C/onepop/90 \
  -M stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 90 \
  --fstats --vcf --structure --plink --hzar --genepop

vcftools \
  --vcf stacks_demultiplex/denovomap_C/onepop/90/populations.snps.vcf \
  --012 \
  --out stacks_demultiplex/denovomap_C/onepop/90/populations.snps.cornell.min90.onepop.tab

### replace every -1 in the .012 matrix with NA (sed reads the file directly, no cat needed)
sed 's/-1/NA/g' \
  stacks_demultiplex/denovomap_C/onepop/90/populations.snps.cornell.min90.onepop.tab.012 \
  > stacks_demultiplex/denovomap_C/onepop/90/populations.snps.cornell.min90.onepop.tab.012NA

### remote spec is quoted so the * is expanded on the remote host, not by the local shell
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/onepop/90/populations.snps.cornell.min90.onepop.tab.*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/


### Super weak filter
### mkdir -p so the step can be replayed when the directory already exists
mkdir -p stacks_demultiplex/denovomap_C/onepop/15
populations \
  -P stacks_demultiplex/denovomap_C/ \
  -O stacks_demultiplex/denovomap_C/onepop/15 \
  -M stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt \
  -t 16 \
  --min-maf 0.05 \
  --min-samples-overall 15 \
  --fstats --vcf --structure --plink --hzar --genepop

vcftools \
  --vcf stacks_demultiplex/denovomap_C/onepop/15/populations.snps.vcf \
  --012 \
  --out stacks_demultiplex/denovomap_C/onepop/15/populations.snps.cornell.min15.onepop.tab

### replace every -1 in the .012 matrix with NA (sed reads the file directly, no cat needed)
sed 's/-1/NA/g' \
  stacks_demultiplex/denovomap_C/onepop/15/populations.snps.cornell.min15.onepop.tab.012 \
  > stacks_demultiplex/denovomap_C/onepop/15/populations.snps.cornell.min15.onepop.tab.012NA

### remote spec is quoted so the * is expanded on the remote host, not by the local shell
scp 'natola@files.zoology.ubc.ca:flex/UT/stacks_demultiplex/denovomap_C/onepop/15/populations.snps.cornell.min15.onepop.tab.*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/


### Same one-pop run, but with "ustacks: -m 5"; output in denovomap_C/-m5
mkdir stacks_demultiplex/denovomap_C/-m5
screen -S -m5
denovo_map.pl \
  --samples cornell/ \
  --popmap stacks_demultiplex/popmaps/sapsC_popmap_onepop.txt \
  -X "ustacks: -m 5" \
  -o stacks_demultiplex/denovomap_C/-m5 \
  -T 16 \
  -X "populations: --min-maf 0.05 --fstats --min-samples-overall 60 --vcf --structure --plink --hzar --genepop"




#### AC's tassel files
vcftools --vcf cornell_tassel/40ind_80snpfiltered.vcf --012 --out cornell_tassel/40ind_80snpfiltered.tab

### The tassel VCF header is formatted wrong, so the command above gives this error:
###   Error: ID required in FORMAT field description: T,1,String,"Genotype"
### The FORMAT header line of this file vs. that of a functional VCF file:
##FORMAT=GT,1,String,"Genotype"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">

### Went in and changed it in nano to the second one, filename cornell_tassel/40ind_80snpfiltered_v2.vcf, and it worked

vcftools --vcf cornell_tassel/40ind_80snpfiltered_v2.vcf --012 --out cornell_tassel/40ind_80snpfiltered.tab

### replace every -1 in the .012 matrix with NA (sed reads the file directly, no cat needed)
sed 's/-1/NA/g' cornell_tassel/40ind_80snpfiltered.tab.012 > cornell_tassel/40ind_80snpfiltered.tab.012NA

### remote spec is quoted so the * is expanded on the remote host, not by the local shell
scp 'natola@files.zoology.ubc.ca:flex/UT/cornell_tassel/40ind_80snpfiltered.tab.*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/









### Convert the final UTexas VCF to 012 format, then to 012NA
vcftools --vcf utexas4010FINAL.vcf --012 --out utexas4010FINAL.tab

### replace every -1 in the .012 matrix with NA (sed reads the file directly, no cat needed)
sed 's/-1/NA/g' utexas4010FINAL.tab.012 > utexas4010FINAL.tab.012NA

### remote spec is quoted so the * is expanded on the remote host, not by the local shell
scp 'natola@files.zoology.ubc.ca:flex/UT/utexas4010FINAL.tab.*' /Users/libbynatola/Desktop/UofL_GBS/UofLsnps/

###### 7 April 2021 ######

### get avg depth/sample
### Each step runs in its own subshell so that both relative paths are resolved
### from the same starting directory (previously the second cd was attempted
### from inside the first directory). The "&&" stops depth_summary.txt from
### being written in the wrong place if the cd fails. The caller's working
### directory is left unchanged.
(
  cd UT/stacks_demultiplex/denovomap_UT/onepop/min40 &&
    vcftools --vcf populations.snps.vcf --depth -c > depth_summary.txt
)

(
  cd UT/cornell_tassel &&
    vcftools --vcf 40ind_80snpfiltered_v2.vcf --depth -c > depth_summary.txt
)