# First ...
# 1. Download all 1000G from ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
# 2. Concatenate all autosomal biallelic SNPs into a single vcf.gz
# 3. Subset concatenated chromosomes to only sites that are on a microarray platform

# Mirror the entire 20130502 release: variant calls and all associated data.
# Recursive wget lands the files under a directory tree named after the host.
wget -r --level=1 ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
# Move it all into a folder called 'ftp', then drop the host-named tree.
# The tree is deleted only when the move succeeded, so a failed mv (full disk,
# nothing downloaded) cannot wipe out the only copy of the download.
mkdir -p ftp
mv ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/* ftp/ \
  && rm -rf ftp.1000genomes.ebi.ac.uk/

# CONCATENATE ALL AUTOSOMES
# /tmp is too small to perform the sort. Make a tmp dir here.
mkdir -p tmp
# concat all autosomes, keep only biallelic SNPs, sort, write out to file (takes most of a day).
# pipefail is switched on inside a subshell (so the setting does not leak into
# the rest of the script): without it the pipeline reports only the status of
# the final sort, and a failure in concat or view would go unnoticed.
# The index is built only when the whole pipeline succeeded, so a truncated
# VCF is never indexed as though it were complete.
(
  set -o pipefail
  bcftools concat -a ftp/ALL.chr[0-9]*.vcf.gz \
    | bcftools view -v snps -m2 -M2 \
    | bcftools sort --temp-dir ./tmp/ -Oz -o 1000g.concat.autosomes.vcf.gz
) && tabix 1000g.concat.autosomes.vcf.gz

# Next ...
# 1. Download, unzip, and dos2unix format the Illumina GSA manifest
# 2. Create a tab-delimited file ready for bcftools --regions-file

# Fetch the GSA-24 v3.0 A1 manifest archive straight from Illumina, unpack it,
# strip the incompatible line endings from the CSV with dos2unix, and delete
# the archive once it is no longer needed.
manifest_zip=GSA-24v3-0-A1-manifest-file-csv.zip
wget "https://support.illumina.com/content/dam/illumina-support/documents/downloads/productfiles/global-screening-array-24/v3-0/${manifest_zip}"
unzip "${manifest_zip}"
dos2unix GSA-24v3-0_A1.csv
rm "${manifest_zip}"

# Parse that manifest
#######################################
# Print the unique autosomal "chr<TAB>pos" pairs of an Illumina manifest CSV,
# ordered numerically by chromosome and then by position.
# Arguments: $1 - path to the manifest CSV (line endings already converted)
# Outputs:   tab-delimited chr/pos rows on stdout
#######################################
gsa_autosomal_sites() {
  # awk: -F, indicates fields separated by commas; OFS="\t" makes the output tab delimited.
  #   /Controls/ {exit}  the last 20 or so lines are unformatted "control" sequences;
  #                      stop at the first line that contains "Controls".
  #   NR > 8             skips the first 8 line header; columns 10 and 11 contain chr/position.
  #   $10 ~ /^[1-9]/     keeps the autosomes (chromosome names that start with 1-9).
  # sort: one numeric key per column. A single -k1,2 key combined with -n only
  # compares the leading number (the chromosome) and leaves positions in byte order.
  awk -F, -v OFS="\t" '/Controls/ {exit} NR > 8 && $10 ~ /^[1-9]/ {print $10, $11}' "$1" \
    | sort -t $'\t' -k1,1n -k2,2n \
    | uniq
}

gsa_autosomal_sites GSA-24v3-0_A1.csv > GSA-24v3-0_A1.csv.autosomes.chr-pos.tsv

# "Reliable" sites: the AKT pruned site list from https://github.com/Illumina/akt/tree/master/data
# Both the VCF and its .tbi index are fetched and saved under a reliable.grch37.* name.
for suffix in "" .tbi; do
  wget "https://raw.githubusercontent.com/Illumina/akt/master/data/wgs.grch37.vcf.gz${suffix}" -O "reliable.grch37.vcf.gz${suffix}"
done
# Write the CHROM and POS of every record as a two-column tab-delimited file.
bcftools query -f '%CHROM\t%POS\n' reliable.grch37.vcf.gz > reliable.grch37.chr-pos.tsv

# Concatenate GSA autosomes and reliable sites, get autosomes, sort, write to tsv for bcftools view -R
#######################################
# Merge tab-delimited chr/pos lists into one de-duplicated autosomal list,
# ordered numerically by chromosome and then by position.
# Arguments: $@ - chr/pos TSV files to merge
# Outputs:   tab-delimited chr/pos rows on stdout
#######################################
merge_autosomal_sites() {
  # grep keeps rows whose chromosome starts with 1-9 (the autosomes).
  # sort uses one numeric key per column: a single -k1,2 key combined with -n
  # only compares the leading number (the chromosome) and leaves positions in
  # byte order.
  cat -- "$@" \
    | grep "^[1-9]" \
    | sort -t $'\t' -k1,1n -k2,2n \
    | uniq
}

merge_autosomal_sites GSA-24v3-0_A1.csv.autosomes.chr-pos.tsv reliable.grch37.chr-pos.tsv > GSA-reliable.chr-pos.tsv

# Next ...
# 1. Download the sex-specific recombination maps
# 2. Assemble map file for ped-sim

# Fetch and unpack the refined b37 genetic map archive, following the ped-sim
# documentation: https://github.com/williamslab/ped-sim#map-file
map_archive=Refined_genetic_map_b37.tar.gz
wget "https://github.com/cbherer/Bherer_etal_SexualDimorphismRecombination/raw/master/${map_archive}"
tar -xvzf "${map_archive}"
# Assemble the ped-sim map file: a header line followed by the rows of every
# autosome, all written through a single redirect.
{
  printf "#chr\tpos\tmale_cM\tfemale_cM\n"
  for autosome in {1..22}; do
    # paste puts the male and female rows side by side. Skipping the first
    # line, awk keeps rows where fields 2 and 6 agree, strips a leading "chr"
    # from field 1, and prints fields 1, 2, 4 and 8.
    # NOTE(review): assumes four columns per input file, so that fields 2/6 are
    # the male/female positions and 4/8 the male/female cM values - confirm.
    paste "Refined_genetic_map_b37/male_chr${autosome}.txt" "Refined_genetic_map_b37/female_chr${autosome}.txt" \
      | awk -v OFS="\t" 'NR > 1 && $2 == $6 { sub(/^chr/, "", $1); print $1, $2, $4, $8 }'
  done
} > refined_mf.simmap

# clean up: removes both the extracted directory and the downloaded tarball
rm -rf Refined_genetic_map_b37*


# Next ...
# 1. Get ped-sim interference file
# Crossover interference parameters shipped with ped-sim; passed to ped-sim via --intf.
interference_tsv=nu_p_campbell.tsv
wget "https://raw.githubusercontent.com/williamslab/ped-sim/master/interfere/${interference_tsv}"


# Next ...
# 1. Subset the autosomal 1000G data to GSA+reliable sites for unrelated GBR samples
# 2. Index the output file
# NOTE: the contents of GBR.txt includes sample IDs for all unrelated GBR samples
# Restrict to the GSA+reliable sites (-R), the samples listed in GBR.txt (-S)
# and SNPs only, then sort and write the bgzipped result.
# -Ou streams uncompressed BCF between the two bcftools processes, avoiding a
# pointless compress/decompress round trip inside the pipe.
# pipefail is switched on inside a subshell so a failure of `bcftools view` is
# not hidden behind the exit status of `bcftools sort`; the index is built only
# when the whole pipeline succeeded.
(
  set -o pipefail
  bcftools view -R GSA-reliable.chr-pos.tsv -S GBR.txt -v snps -Ou 1000g.concat.autosomes.vcf.gz \
    | bcftools sort -Oz -o 1000g.concat.autosomes.GBR.vcf.gz
) && tabix 1000g.concat.autosomes.GBR.vcf.gz

# Next ...
# 1. Run ped-sim
# 2. Zip and index result file
# NOTE: the contents of the pedigree definition includes 5 generations with half-siblings

# ped-sim options, collected in an array so each one can carry its own note.
ped_sim_args=(
  --seed 090322                          # fixed seed
  --nogz                                 # output is compressed below with bgzip instead
  --err_rate 0                           # error and missingness rates are all set to 0
  --miss_rate 0
  --err_hom_rate 0
  -d 5GHS.def                            # pedigree definition
  -m refined_mf.simmap                   # sex-specific genetic map
  --intf nu_p_campbell.tsv               # interference file
  --fam
  -i 1000g.concat.autosomes.GBR.vcf.gz   # input genotypes
  -o simulation                          # output prefix
)
ped-sim "${ped_sim_args[@]}"

# Compress the simulated VCF with bgzip and index it with tabix.
bgzip simulation.vcf
tabix simulation.vcf.gz
