# Downloaded Brassica oleracae reference genome from:
ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/plants/Brassica_oleracea/BOL_v1.0/Primary_Assembly/unplaced_scaffolds/FASTA/

# Download the raw data of this study and make to folders, say lane1 and lane2.

# Place the parameters_lane1.conf and parameters_lane2.conf in the corresponding folder and adjust the .conf files to your requirements (location of reference genomes, barcode files, tools, etc.).

# Then start the pipeline and move libraries with a too low number of reads:
# For details of the pipeline please look in the .conf file and the gbs_pipline.sh script
cd lane1
bash ../scripts/gbs_pipeline.sh . -c parameters_lane1.conf
mkdir mapping_low_cov
cd mapping_lane1
mv prepro_BA96.bam* prepro_BA53.bam* prepro_BA27.bam* prepro_BA29.bam* prepro_BA19.bam* ../mapping_low_cov/

cd ../lane2/
bash ../scripts/gbs_pipeline.sh . -c parameters_lane2.conf
mkdir mapping_low_cov
cd mapping_lane2
mv prepro_BA04.bam* prepro_BA94.bam* prepro_BA86.bam* prepro_BA05.bam* prepro_BA03.bam* prepro_BA46.bam* prepro_BA62.bam* prepro_BA78.bam* prepro_BA27.bam* prepro_BA77.bam* prepro_BA30.bam* prepro_BA70.bam* prepro_BA61.bam* ../mapping_low_cov/


# SNP calling:
cd ../
mkdir SNPcalling; cd SNPcalling
samtools mpileup -f B_oleracea_scaf.fa -D -g -C 50 -S -A -E ../lane1/mapping_lane1/*bam ../lane2/mapping_lane2/*bam| bcftools view -bvcg - > out.bcf
bcftools view {} | vcfutils.pl varFilter -D 10000 > vs_Bolec.vcf

# Filter vcf: minimum coverage 30 reads, min 10 reads with variant nucleotide (-e 0 means no missing values allowed)
python filterVCF.py vs_Bolec.vcf 10varc30e1.vcf 10 0 -c 30 -e 1
python filterVCF.py vs_Bolec.vcf 10varc30e0.vcf 10 0 -c 30 -e 0


# Adjust genotype values and missing data (set missing data genotype to ./.):
python adjust_vcf_genotypes.py 10varc30e0.vcf adjusted_10varc30e0.vcf
python adjust_vcf_genotypes.py 10varc30e1.vcf adjusted_10varc30e1.vcf

# => Get SNPs in different file types:
python vcf2various.py adjusted_10varc30e1.vcf adjusted_10varc30e1.SNP 2SNP 10 0 -c 30 -e 1 
python vcf2various.py adjusted_10varc30e1.vcf adjusted_10varc30e1 2structure 10 0 -c 30 -e 1
python vcf2various.py adjusted_10varc30e1.vcf adjusted_10varc30e1.distMat 2distMatrix 10 0 -c 30 -e 1 
python vcf2various.py adjusted_10varc30e0.vcf adjusted_10varc30e0.SNP 2SNP 10 0 -c 30 -e 0 
python vcf2various.py adjusted_10varc30e0.vcf adjusted_10varc30e0 2structure 10 0 -c 30 -e 0 
python vcf2various.py adjusted_10varc30e0.vcf adjusted_10varc30e0.distMat 2distMatrix 10 0 -c 30 -e 0 

# Add locus name and population ID to .stru and .distMat for use in DAPC and/or STRUCTURE
awk '{print $1}' adjusted_10varc30e1.SNP | sed ':a;N;$!ba;s/\n/, /g' | sed 's/SNP-id, //g'> locus_names_e1
sed 's/,//g' locus_names_e1 > locus_names_e1_NoComma 	# input for stru_with*LOCUS
cat locus_names_e1_NoComma adjusted_10varc30e1.stru > adjusted_10varc30e1_LOCUS.stru
awk '{print $1}' adjusted_10varc30e0.SNP | sed ':a;N;$!ba;s/\n/, /g' | sed 's/SNP-id, //g'> locus_names_e0
sed 's/,//g' locus_names_e0 > locus_names_e0_NoComma 	# input for stru_with*LOCUS
cat locus_names_e0_NoComma adjusted_10varc30e0.stru > adjusted_10varc30e0_LOCUS.stru

# Make bed
vcftools --vcf adjusted_10varc30e0.vcf --plink --out  adjusted_10varc30e0
plink --file adjusted_10varc30e0 --make-bed --out adjusted_10varc30e0.ped

