## build the Arthropoda CO1 reference database from BOLD, GBOL and GenBank sequences
## taxalogue is run twice with identical options: first "download", then "classify"
for subcommand in download classify; do
  bundle exec ruby taxalogue.rb --taxon Arthropoda "$subcommand" --all
done

## replace spaces with underscores; the result is written to the *_uc.fas file
ruby underscore_fasta.rb \
  Arthropoda_derep_all_output.fas \
  > Arthropoda_derep_all_output_uc.fas

## drop sequences containing stop codons and fix reverse-complemented sequences
## --genetic_code 5: presumably NCBI translation table 5 (invertebrate mitochondrial) - TODO confirm in the script
## --filter_info names a TSV file (STPf.tsv); presumably a report of what was filtered - TODO confirm
ruby stop_codon_filter_and_rc_correcton.rb \
  --input Arthropoda_derep_all_output_uc.fas \
  --output Arthropoda_derep_all_output_uc_STPf.fas \
  --genetic_code 5 \
  --filter_info STPf.tsv

## strip gaps from the sequences; the result is written to the *_dg.fas file
ruby degap_fasta.rb \
  Arthropoda_derep_all_output_uc_STPf.fas \
  > Arthropoda_derep_all_output_uc_STPf_dg.fas

## remove possible contaminants
## the Homo sapiens and Wolbachia sequences are concatenated into one contaminant file
cat \
  Homo_sapiens_output.fas \
  Wolbachia_output.fas \
  > contaminants.fas
## NOTE(review): this script presumably searches the database against contaminants.fas and
## writes the *_CONTf.fas file read by the next step - confirm in the script; if it only
## submits a cluster job, wait for that job to finish before continuing
./vsearch_usearch_global_qsub.sh

## remove sequences with more than 3 Ns
## the second argument (Nf3_discarded.fas) presumably receives the discarded sequences - TODO confirm
ruby filter_Ns.rb \
  Arthropoda_derep_all_output_uc_STPf_dg_CONTf.fas \
  Nf3_discarded.fas \
  3 \
  > Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3.fas

## remove sequences that are shorter than 400 bp or longer than 1569 bp
## (whether the bounds themselves are kept depends on length_filter.rb - TODO confirm)
ruby length_filter.rb \
  Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3.fas \
  400 \
  1569 \
  STPf_dg_CONTf_Nf3_Lf400_1569.tsv \
  > Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3_Lf400_1569.fas

## build a consensus sequence from a subsample of the db, then filter the db against that consensus
# retain only sequences that have the standard Folmer region length of 658 bp
# note: the input is the N-filtered file (*_Nf3.fas), not the 400-1569 bp length-filtered one
ruby length_filter.rb \
  Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3.fas \
  658 \
  658 \
  STPf_dg_CONTf_Nf3_Lf658.tsv \
  > Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3_Lf658.fas

# get 10k random sequences from the ref db
# The seed is pinned to 6912, the value recorded in the output file name (presumably the
# seed of the original run), so that re-running this step draws the same subsample.
# Drawing a fresh $RANDOM here would make the "rs6912" label wrong, and a draw of 0
# would make vsearch fall back to a non-reproducible pseudo-random seed.
rand_num=6912
# the output file name is derived from the seed, so the label always matches the seed used
/home/nnoll/installed_programs/vsearch-2.14.1-linux-x86_64/bin/vsearch \
  --fastx_subsample Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3_Lf658.fas \
  --sample_size 10000 \
  --randseed "$rand_num" \
  --fastaout "ss10000_rs${rand_num}_STPf_dg_CONTf_Nf3_Lf658.fas"

# cluster the subsampled sequences with usearch (-cluster_fast, identity threshold -id 0.0)
# consensus sequences go to the CONS_* file and centroids to the CENTROID_* file;
# -sizeout is set, and masking is disabled for both query and database
usearch11 \
  -cluster_fast ss10000_rs6912_STPf_dg_CONTf_Nf3_Lf658.fas \
  -id 0.0 \
  -consout CONS_ss10000_rs6912_STPf_dg_CONTf_Nf3_Lf658.fas \
  -centroids CENTROID_ss10000_rs6912_STPf_dg_CONTf_Nf3_Lf658.fas \
  -sizeout \
  -dbmask none \
  -qmask none

# get cluster alignments
# the centroid file is searched against itself (same file as query and --db);
# hits are written as a blast6 table (consensus_clusters.b6) and as alignments (consensus_clusters.al)
/home/nnoll/installed_programs/vsearch-2.14.1-linux-x86_64/bin/vsearch \
  --usearch_global CENTROID_ss10000_rs6912_STPf_dg_CONTf_Nf3_Lf658.fas \
  --db CENTROID_ss10000_rs6912_STPf_dg_CONTf_Nf3_Lf658.fas \
  --maxaccepts 0 \
  --maxrejects 0 \
  --id 0.0 \
  --blast6out consensus_clusters.b6 \
  -alnout consensus_clusters.al \
  -dbmask none \
  -qmask none

# picked the cluster with the highest abundance from CONS_ss10000_rs6912_STPf_dg_CONTf_Nf3_Lf658.fas, which was a consensus sequence of the Folmer region, and saved it to Arthropoda_CO1_CONSENSUS.fas
# the sequences were uppercased beforehand
# the other clusters represented different regions within the CO1 gene

# match the reference database against the consensus sequence and keep only sequences
# that match at least 50% of the consensus sequence (the threshold is set inside the script)
./CONS_vsearch_usearch_global_qsub.sh
# rename the matched sequences so the file name carries the _ConsM step suffix
mv \
  CONS_matched.fas \
  Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3_Lf400_1569_ConsM.fas

## some sequences had incomplete taxon information (e.g. the order was missing);
## for those, the taxon info was removed until class level
ruby remove_lower_than_missing_taxon_info.rb \
  Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3_Lf400_1569_ConsM.fas \
  > Arthropoda_derep_all_output_uc_STPf_dg_CONTf_Nf3_Lf400_1569_ConsM_TaxR.fas
