Papillomavirus Pan-Genome Analysis Pipeline Description

## by Jingtao Chen


01_retrieve_genome_from_NCBI
## With the accession ID, efetch (v16.2) can be used to retrieve the genome from NCBI, and the FASTA file is saved under the strain name.

# Download the FASTA record of one accession and store it gzip-compressed
# under the strain name. Expects accession_id, work_dir and strain_name to be
# set; expansions are quoted so values containing spaces cannot split.
efetch -db sequences \
	-id "${accession_id}" \
	-format fasta \
	| gzip -c > "${work_dir}/${strain_name}.fa.gz"

02_run_retrieve_gff3_from_NCBI
## With the accession ID, we can retrieve the genome annotation file from NCBI and save it under the strain name.
# The URL must be inside real (unescaped) double quotes: with \" the quotes
# become literal characters and the bare '&' makes the shell background the
# command and drop the report/id parameters.
wget -O "${sample_name}.gff3" \
	"https://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?db=nuccore&report=gff3&id=${acc_id}"

03_01_gff3_annotation
##Since the quality of gff3 files obtained from NCBI was uneven, we used PuMA (v1.2.2) to annotate all PV genomes, and some strains that could not be annotated were processed manually.

# Annotate one genome with PuMA inside its Docker image. The current directory
# is mounted at /in_out, so the input FASTA is read from it and the -o path
# /in_out/puma_out corresponds to ./puma_out on the host.
docker run --rm --privileged=true -v "$(pwd)/:/in_out" \
    kvdlab/puma:1.2.2 run_puma.py \
    -i "/in_out/${strain_name}.fa" \
    -o /in_out/puma_out \
    -d /app/puma/data_dir

03_02_gff_to_cds_extract.sh
## The coding sequence (CDS) of each strain is required for curated data processing and analysis. Thus, gffread (v0.9.12) is used to extract each CDS through genome and corresponding gff3 files.
## raw_fastas & raw_gff3 → all_CDS

# Write the CDS sequences (-x) of one strain, using its genome FASTA (-g) and
# its GFF3 annotation. prefix_gff3 / prefix_fasta are appended after the strain
# name, i.e. they act as file-name suffixes. The strain variable is spelled
# strain_name, matching steps 01 and 03_01.
gffread "${gff_dir}/${strain_name}${prefix_gff3}" \
	-g "${seq_dir}/${strain_name}${prefix_fasta}" \
	-x "${cds_dir}/${strain_name}.cds.fa"

**Note:** To improve the efficiency and accuracy of the subsequent analyses, and considering that PV DNA is circular, we extracted the L1 sequence of each PV with gffread (v0.9.12) according to the corresponding GFF file, and then rotated the genome so that L1 lies at its end (the sequence that originally followed L1 is moved to the start). As a result, every PV genome sequence ends with the L1 gene.

03_03_genome_reset_preparation_step1
## raw_fasta → remove "\n" → step1_fasta

03_04_genome_reset_preparation_step2
## all_CDS → L1_CDS
## step1_fasta & L1_CDS → reset_fasta(step2_files, without "\n")
## "aaaL1bbb" → "bbbaaaL1"

03_05_gff3_annotation_reset
## After processing the raw genome, the corresponding gff3 files need to be updated and curated.


04_windowmasker
## We used WindowMasker (v1.0.0) to mask out highly repetitive and low complexity DNA sequences within a genome using the sequence of the genome itself.

# step1: prepare the *.ustat counts file for windowmasker
windowmasker -checkdup true \
    -mk_counts \
    -in "${genome_dir}/${strain}.fa" \
    -out "${strain}.masking_library.ustat" \
    -mem 65000

# step2: mask out highly repetitive and low complexity DNA sequences
# (reads the counts file written by step1; output is a FASTA file)
windowmasker -ustat "${strain}.masking_library.ustat" \
    -in "${genome_dir}/${strain}.fa" \
    -out "${strain}.softmask.fa" \
    -outfmt fasta \
    -dust true

05_generate_the_guide_tree_for_phylogenetic_analysis
## For the phylogenetic analysis, we need to generate a guide tree from the alignment fasta of curated genome, and we used MAFFT (V7.310) to align the genome and IQ-TREE (v2.0.6) to produce the guide tree.

# step1: genome alignment with MAFFT
mafft --auto \
   --thread "${threads}" \
   --preservecase "${prefix}.noref.softmask.fa" > "${prefix}.mafft.fa"

# step2: alignment trimming by clipkit (v1.3.0)
clipkit "${prefix}.mafft.fa" \
   -m gappy -g 0.9 \
   -o "${prefix}.mafft.clip.fa"

# step3: guide tree generated by iqtree
# (outputs are named after --prefix, i.e. ${prefix}.clip.iqtree.*)
iqtree2 -s "./${prefix}.mafft.clip.fa" \
        --seed "${seed}" \
        -m MFP \
        -B 1000 --bnni \
        -alrt 1000 \
        -T "${threads}" \
        --prefix "${prefix}.clip.iqtree" \
        --safe

06_phylogenetic_tree
## With the guide tree generated in step 05 (MAFFT V7.310 alignment followed by IQ-TREE), we used Cactus (v2.2.0) to align the curated genomes with the curated reference genome hom_sap_HPV16, and IQ-TREE (v2.0.6) to produce the phylogenetic tree.

# step1: prepare the input seqFile for cactus alignment
python3 "${scripts_dir}/01_generate_cactus_input_seqFile.py" \
	-strains "${strains}" \
	-ref_strains "${ref_strains}" \
	-genome_dir "${genome_dir}" \
	-guide_tree "${guide_tree}" \
	-out_seqFile "${seqFile}"

# step2: cactus alignment (job store in ./jobstore, result in ${prefix}.hal)
cactus ./jobstore "${seqFile}" --maxCores 32 "${prefix}.hal"

# step3: hal2maf, with hom_sap_HPV16 as the reference genome
hal2maf --onlyOrthologs \
	--noDupes \
	--maxRefGap 2000 \
	--refGenome hom_sap_HPV16 \
	"${prefix}.hal" "${prefix}.maf"

# step4: maf2fa
python3 "${scripts_dir}/02_maf2fa_for_noDupes.py" \
	-maf "${prefix}.maf" \
	-out_fa_QC "${prefix}.fa.QC" \
	-out_fa_info "${prefix}.fa.info" \
	-out_fa "${prefix}.fa"

# step5: clipkit
clipkit "${prefix}.fa" -m gappy -o "${prefix}.clip.fa"

# step6: phylogenetic tree generated by iqtree
# NOTE(review): --prefix is the same expression as in step 05; use a different
# $prefix here if the guide-tree outputs must be kept — confirm.
iqtree2 -s "./${prefix}.clip.fa" \
        --seed "${seed}" \
        -m MFP \
        -B 1000 --bnni \
        -alrt 1000 \
        -T "${threads}" \
        --prefix "${prefix}.clip.iqtree" \
        --safe

# step7: phylogenetic tree figure generated by ggtree.R (v3.8.2)
```{r}
library(treeio)
library(ggplot2)
library(ggtree)
library(ggtreeExtra)
library(ggnewscale)
library(openxlsx)

# Working directory holding the example tree and annotation files.
setwd("/path/to/Program_example/Phylogenetic_Tree")
# Colour ramp for lengths. NOTE(review): not referenced elsewhere in this chunk.
Length_colors = c("white", "pink", "red")

# Named palettes: each name is a category label, each value its colour.
# Risk class -> colour.
Risk_colors = c("HighRisk" = "red", "LowRisk" = "#B0E2FF", "ProbablyHighRisk" = "#efdf00", "UnClassified" = 
                  "grey", "NonHuman" = "#57e559" )

# Disease group -> colour.
Disease_colors = c("Cancer" = "red", "PrecancerousLesion" = "orange", "BenignDisease" = "#FEEBCD", 
                   "ImmuneDisorder" = "#8B636C", "Healthy" = "#1874CD", "NA" = "grey", "NonHuman" = "#57e559")

# Disease type -> colour.
Disease_Type_colors = c("OPSCC" = "#8e3af4", "CSCC" = "red","CIN"="orange", "BenignDisease" = "#FEEBCD", 
                        "ImmuneDisorder" = "#8B636C", "Healthy" = "#1874CD", "NA" = "grey", "NonHuman" = 
                          "#57e559")

# Continent of origin -> colour.
Geographical_colors = c("Africa" = "#cc8e12", "Asia" = "red", "NAmerica" = "#8e3af4","SAmerica" = 
                                 "#8B636C", "Australia" = "#FEEBCD", "Europe" = "#FFAEB9", "NA" = "grey", 
                               "Oceania" = "#3FA9F5", "NonHuman" = "#57e559")

# Reduced genus palette. NOTE(review): not referenced elsewhere in this chunk.
Genus_colors = c("Beta" = "#ddd53e", "Alpha" = "#cc8e12", "Mu" = "#925bea","Nu"="#FFC125", "Gamma" = "#167275", 
                 "Omikron" = "#0dbc21", "NonPrimate" = "grey")

# Full genus palette; the last entry is spelled "Unclassified" (lower-case c).
Genus_color2 = c("Alpha" = "#cc8e12","Beta" =  "#ddd53e","Gamma" =  "#167275","Mu" =  "#925bea","Nu" =  "#FFC125","Chi" =  "#B9D3EE","Delta" =  "#FFC0CB","Epsilon" =  "#2F4F4F","Eta" =  "#ce2523","Iota" =  "#8B636C","Kappa" =  "#3FA9F5","Lambda" =  "#2E8B57","Omega" =  "#FEEBCD","Omikron" =  "#57e559" ,"Phi" = "#f9a270","Pi" = "#22547f","Psi" =  "#db5e92","Rho" =  "#edd05e","Sigma" =  "#DEB887","Tau" =  "#0dbc21","Theta" = "#280f7a","Upsilon" =  "#6373ed","Xi" =  "#B3EE3A","Zeta" =  "#FF6347","Dyoomikron" = "#391c82","Dyopi" = "#2baeb5","Dyodelta" = "#8249aa","Dyoxi" =  "#ed1299","Dyolambda" = "#246b93","Dyoeta" = "#d66551","Dyoomega" = "#09f9f5","Dyophi" = "#e86502","Dyosigma" = "#39ba30","Dyotau" = "#d561dd","Dyopsi" = "#99db27","Dyoupsilon" = "#e07233","Dyokappa" = "#ff523f","Dyonu" = "#8AD6E4FF","Dyotheta" = "#E5A8A0FF","Dyochi" = "#B6EEE2FF","Dyorho" = "#CAB8E5FF","Dyoiota" = "#C6EDC5FF","Dyomu" = "#A1BDE6FF","Dyoepsilon" = "#EBD4ABFF","Dyozeta" = "#A0CDE2FF","Treieta" = "#CFB192FF","Treisdelta" = "#87C4B8FF","Treisepsilon" = "#E6BBC7FF","Treiszeta" = "#B2C69AFF","Treisiota" = "black","Unclassified" =  "grey")


# Tissue tropism -> colour.
Tissue_colors = c("Skin" = "#fe5000", "Mucosa" = "#1f78b4","Skin/Mucosa" = "#efdf00","NA" = "#b2df8a")
# Body site -> colour.
# NOTE(review): this palette names the first site "Anus", whereas the PCA
# palette of step 11 names it "Crissum" — confirm which label the data uses.
Site_colors = c("Anus" = "#3FA9F5", "Oropharynx" = "red","Epidermis" = "#cc8e12","NonHuman" = "#57e559" ,"NA" = "grey","Genitalia" =  "#8e3af4")

# read tree & annot file
# read.newick(node.label = "support") parses the internal node labels as
# numeric values and stores them in a `support` column, which the geom_point2()
# layer of p1 maps to colour; a tree read with read.tree() has no such column.
# NOTE(review): this assumes every node label holds a single number, i.e. the
# treefile already went through step 07 — confirm for your input.
tree = read.newick("phylogenetic_tree_example.treefile", node.label = "support")
data1 = read.xlsx("example_data.xlsx", na.strings = NULL)

# The genus palette and the legend breaks spell the catch-all class
# "Unclassified"; accept the "UnClassified" spelling in the sheet as well, so
# those strains do not silently turn into NA when the factor is built.
data1$Genus_Original[data1$Genus_Original %in% "UnClassified"] <- "Unclassified"
data1$Genus_Original <- factor(
  data1$Genus_Original,
  levels = c("Alpha","Beta","Gamma","Mu","Nu","Chi","Delta","Epsilon","Eta","Iota",
             "Kappa","Lambda","Omega","Omikron","Phi","Pi","Psi","Rho","Sigma","Tau",
             "Theta","Upsilon","Xi","Zeta","Dyoomikron","Dyopi","Dyodelta","Dyoxi",
             "Dyolambda","Dyoeta","Dyoomega","Dyophi","Dyosigma","Dyotau","Dyopsi",
             "Dyoupsilon","Dyokappa","Dyonu","Dyotheta","Dyochi","Dyorho","Dyoiota",
             "Dyomu","Dyoepsilon","Dyozeta","Treieta","Treisdelta","Treisepsilon",
             "Treiszeta","Treisiota","Unclassified"))

# highlight_node
# Tree node numbers of the 70 highlighted clades, in clade order. Entry k is
# exported as the variable Ck (C1 ... C70).
clade_nodes <- c(
  3550, 3569, 3591, 3626, 3655, 3635, 4222, 4107, 4129, 4161,  # C1  - C10
  4090, 3669, 4032, 4049, 4020, 3718, 3685, 3736, 3747, 3811,  # C11 - C20
  3799, 3765, 4015, 3936, 4011, 3947, 3967, 3924, 3902, 3894,  # C21 - C30
  3847, 3866, 4230, 7092, 4247, 4317, 4265, 4256, 4279, 4275,  # C31 - C40
  4304, 4298, 4285, 4759, 4736, 4717, 4706, 4609, 4336, 7081,  # C41 - C50
  4966, 4943, 4896, 4847, 4831, 4784, 5211, 5180, 5153, 5126,  # C51 - C60
  4978, 7045, 7063, 6870, 6662, 6846, 6697, 6625, 6195, 5242   # C61 - C70
)
for (k in seq_along(clade_nodes)) {
  assign(paste0("C", k), clade_nodes[k])
}


# Base tree: rectangular layout, thin black branches.
base_tree <- ggtree(tree, layout = "rectangular", size = 0.2, color = "black")

# Internal nodes that carry a support value get a small dot whose colour is the
# support bin: (-1,50], (50,70], (70,90], (90,100].
support_dots <- geom_point2(
  aes(subset = !isTip & !(is.na(support)),
      color = cut(support, c(-1, 50, 70, 90, 100))),
  shape = 16,
  size = 0.1
)

# Every internal node is labelled with its node number.
node_numbers <- geom_text2(aes(subset = !isTip, label = node),
                           hjust = -0.3, size = 3, color = "black")

# Tip labels, not aligned.
tip_names <- geom_tiplab(size = 2, color = "black", linetype = "dotted",
                         align = FALSE, hjust = -0.02)

p1 <- base_tree + support_dots + node_numbers + tip_names

# Collapse the 70 highlighted clades (nodes C1 ... C70) one after another in
# mode 'max', each with its own fill colour (entry k belongs to clade Ck).
clade_fills <- c(
  "#CB7C77FF", "#68D359FF", "#6B42C8FF", "#C9D73DFF", "#C555CBFF",  # C1  - C5
  "#AED688FF", "#502E71FF", "#C49A3FFF", "#6A7DC9FF", "#D7652DFF",  # C6  - C10
  "#7CD5C8FF", "#C5383CFF", "#507D41FF", "#CF4C8BFF", "#5D8D9CFF",  # C11 - C15
  "#722E41FF", "#C8B693FF", "#33333CFF", "#C6A5CCFF", "#674C2AFF",  # C16 - C20
  "#8AD6E4FF", "#E5A8A0FF", "#B6EEE2FF", "#CAB8E5FF", "#C6EDC5FF",  # C21 - C25
  "#A1BDE6FF", "#EBD4ABFF", "#A0CDE2FF", "#CFB192FF", "#87C4B8FF",  # C26 - C30
  "#E6BBC7FF", "#B2C69AFF", "#E2C5E0FF", "#A3C0A6FF", "#B5B4C8FF",  # C31 - C35
  "#DBE7C5FF", "#A4BEB8FF", "#F0DCD0FF", "#CDE4E9FF", "#D0BCADFF",  # C36 - C40
  "#4E79A7FF", "#FFBE7DFF", "#B6992DFF", "#A0CBE8FF", "#59A14FFF",  # C41 - C45
  "#F1CE63FF", "#BAB0ACFF", "#8CD17DFF", "#FF9D9AFF", "#86BCB6FF",  # C46 - C50
  "#E15759FF", "#499894FF", "#79706EFF", "#F28E2BFF", "#D37295FF",  # C51 - C55
  "#2CA030FF", "#B07AA1FF", "#FFAA0EFF", "#9D7660FF", "#D7B5A6FF",  # C56 - C60
  "#1F83B4FF", "#12A2A8FF", "#FABFD2FF", "#78A641FF", "#BCBD22FF",  # C61 - C65
  "#FFBF50FF", "#D4A6C8FF", "#FF7F0EFF", "#C7519CFF", "#D63A3AFF"   # C66 - C70
)
# The first clade uses alpha 0.6, every following clade 0.7.
clade_alphas <- c(0.6, rep(0.7, length(clade_fills) - 1))

for (k in seq_along(clade_fills)) {
  p1 <- collapse(p1, get(paste0("C", k)), 'max',
                 fill = clade_fills[k], alpha = clade_alphas[k])
}



# `color8` (host-species palette) is used below but is not defined in this
# chunk; define a fallback only when the caller has not supplied one.
# NOTE(review): the fallback reuses the six species colours of the PCA figure
# (step 11), assigned in legend-break order — confirm against the intended figure.
if (!exists("color8")) {
  color8 <- c("Homo_Sapiens" = "red",
              "Primate_Old_World_Monkey" = "orange",
              "Primate_New_World_Monkey" = "#FFFF00",
              "NonPrimate_Mammalia" = "#A2CD5A",
              "Reptilia" = "#3FA9F5",
              "Aves" = "#8e3af4")
}

# Ring 02: host species of each strain, drawn as tiles next to the tree.
# The colour scale added at the end is the legend (01) for the bootstrap dots
# created in p1.
p2 <- p1 + 
  geom_fruit(
    mapping=aes(y=Strain_Name,fill=Species_3),
    data=data1,
    geom=geom_tile,
    width=0.05,
    offset = 0.05) + 
  scale_fill_manual(name="02_PVs_Species",
                    breaks=c("Homo_Sapiens","Primate_Old_World_Monkey","Primate_New_World_Monkey",
                             "NonPrimate_Mammalia","Reptilia","Aves"),
                    values = color8,
                    guide=guide_legend(override.aes = list(size = 20),
                                       title.theme = element_text(face="bold", size=40),
                                       label.theme = element_text(face="bold",size=40), 
                                       keywidth = 12, keyheight = 12,
                                       order = 2))+
  new_scale_fill()+
  scale_color_manual(name="01_Bootstrap Percentage(BP)",
                     values = c("#B0B0B0","#FFB90F","pink", "red"),
                     guide=guide_legend(override.aes = list(size = 20),
                                        title.theme = element_text(face="bold", size=40),
                                        label.theme =element_text(face="bold",size=40), 
                                        keywidth = 12, keyheight = 12,
                                        order=1))

# Legend styling shared by the two rings below; only the legend position
# (`legend_order`) differs between them.
ring_legend <- function(legend_order) {
  guide_legend(override.aes = list(size = 20),
               title.theme = element_text(face = "bold", size = 40),
               label.theme = element_text(face = "bold", size = 40),
               keywidth = 12, keyheight = 12,
               order = legend_order)
}

# Ring 03: risk class of each strain.
risk_ring <- geom_fruit(data = data1,
                        geom = geom_tile,
                        mapping = aes(y = Strain_Name, fill = Risk),
                        width = 0.05,
                        offset = 0.05)
risk_scale <- scale_fill_manual(name = "03_Risk",
                                breaks = c("HighRisk", "ProbablyHighRisk", "LowRisk",
                                           "UnClassified"),
                                values = Risk_colors,
                                guide = ring_legend(3))
p3 <- p2 + new_scale_fill() + risk_ring + risk_scale

# Ring 04: disease group of each strain.
disease_ring <- geom_fruit(data = data1,
                           geom = geom_tile,
                           mapping = aes(y = Strain_Name, fill = Disease),
                           width = 0.05,
                           offset = 0.05)
disease_scale <- scale_fill_manual(name = "04_Disease",
                                   breaks = c("Cancer", "PrecancerousLesion", "BenignDisease",
                                              "ImmuneDisorder", "Healthy", "NA", "NonHuman"),
                                   values = Disease_colors,
                                   guide = ring_legend(4))
p4 <- p3 + new_scale_fill() + disease_ring + disease_scale

# Ring 05: disease type of each strain.
# Legend order follows the number in the legend name (05); the original reused
# order = 4 for several legends, which leaves their relative order undefined.
p5 <- p4 +
  new_scale_fill() +
  geom_fruit(
    data = data1,
    geom = geom_tile,
    mapping = aes(y = Strain_Name, fill = Disease_Type),
    width = 0.05,
    offset = 0.05) +
  scale_fill_manual(name = "05_Disease_Type",
                    breaks = c("OPSCC", "CSCC", "CIN", "BenignDisease", "ImmuneDisorder", "Healthy", "NA",
                               "NonHuman"),
                    values = Disease_Type_colors,
                    guide = guide_legend(override.aes = list(size = 20),
                                         title.theme = element_text(face = "bold", size = 40),
                                         label.theme = element_text(face = "bold", size = 40),
                                         keywidth = 12, keyheight = 12,
                                         order = 5))

# Ring 06: continent of origin of each strain.
# NOTE(review): Geographical_colors also defines "Australia", which is not
# among the legend breaks — confirm whether it should be listed.
p6 <- p5 +
  new_scale_fill() +
  geom_fruit(
    data = data1,
    geom = geom_tile,
    mapping = aes(y = Strain_Name, fill = Geography_Continent),
    width = 0.05,
    offset = 0.05) +
  scale_fill_manual(name = "06_Geographical_origin",
                    breaks = c("Africa", "Asia", "NAmerica", "SAmerica", "Europe", "NA", "Oceania", "NonHuman"),
                    values = Geographical_colors,
                    guide = guide_legend(override.aes = list(size = 20),
                                         title.theme = element_text(face = "bold", size = 40),
                                         label.theme = element_text(face = "bold", size = 40),
                                         keywidth = 12, keyheight = 12,
                                         order = 6))
  
# Heatmap 07: genomic length.
# NOTE(review): data2 and data3 are not created in this chunk; they must exist
# before this point (presumably data frames whose row names are the tip labels,
# as gheatmap() expects) — confirm where they are loaded.
p6 <-  p6 +  new_scale_fill()
p7 <-  gheatmap(p6,data2,width = 0.025,offset=0.6,
               low = "#B0B0B0",high = "black",color = "NA",
               legend_title = "07_3548PVs_genomic_length",
               colnames = TRUE,colnames_position = "bottom",font.size = 20)+
               theme(legend.key.size = unit(5, 'cm'),
               legend.text=element_text(size=40),
               plot.title = element_text(size=40, hjust = 0.5, vjust = 1.5))

# Heatmap 08: SNP count.
# Start the new fill scale from p7 (not p6); otherwise the genomic-length
# heatmap built just above is thrown away and never reaches the final figure.
p7 <-  p7 +  new_scale_fill()
p8 <-  gheatmap(p7,data3,width = 0.025,offset=0.7,
               low = "grey",high = "black",color = "NA",
               legend_title = "08_3548PVs_snp_count",
               colnames = TRUE,colnames_position = "bottom",font.size = 20)+
               theme(legend.key.size = unit(5, 'cm'),
               legend.text=element_text(size=40),
               plot.title = element_text(size=40, hjust = 0.5, vjust = 1.5))

# `color5` (genus palette) is used below but is not defined in this chunk.
# Fall back to Genus_color2, which has a colour for every legend break listed
# here, unless the caller has already supplied color5.
if (!exists("color5")) {
  color5 <- Genus_color2
}

# Ring 09: genus of each strain. Legend order follows the legend number (09)
# instead of reusing order = 4.
p9 <- p8 +
  new_scale_fill() +
  geom_fruit(
    data = data1,
    geom = geom_tile,
    mapping = aes(y = Strain_Name, fill = Genus_Original),
    width = 0.05,
    offset = 0.05) +
  scale_fill_manual(name = "09_Genus",
                    breaks=c("Alpha","Beta","Gamma","Mu","Nu","Chi","Delta","Epsilon","Eta","Iota","Kappa","Lambda","Omega","Omikron","Phi","Pi","Psi","Rho","Sigma","Tau","Theta","Upsilon","Xi","Zeta","Unclassified","Dyoomikron","Dyopi","Dyodelta","Dyoxi","Dyolambda","Dyoeta","Dyoomega","Dyophi","Dyosigma","Dyotau","Dyopsi","Dyoupsilon","Dyokappa","Dyonu","Dyotheta","Dyochi","Dyorho","Dyoiota","Dyomu","Dyoepsilon","Dyozeta","Treieta","Treisdelta","Treisepsilon","Treiszeta","Treisiota"),
                    values = color5,
                    guide = guide_legend(override.aes = list(size = 20),
                                         title.theme = element_text(face = "bold", size = 40),
                                         label.theme = element_text(face = "bold", size = 40),
                                         keywidth = 12, keyheight = 12,
                                         order = 9))

# Ring 10: tissue tropism. Legend order follows the legend number (10) instead
# of reusing order = 4.
p10 <- p9 +
  new_scale_fill() +
  geom_fruit(
    data = data1,
    geom = geom_tile,
    mapping = aes(y = Strain_Name, fill = Tissue),
    width = 0.05,
    offset = 0.05) +
  scale_fill_manual(name = "10_Tissue",
                    breaks = c("Skin", "Mucosa","NA","Skin/Mucosa"),
                    values = Tissue_colors,
                    guide = guide_legend(override.aes = list(size = 20),
                                         title.theme = element_text(face = "bold", size = 40),
                                         label.theme = element_text(face = "bold", size = 40),
                                         keywidth = 12, keyheight = 12,
                                         order = 10))

# Ring 11: body site.
# The legend breaks are taken from the palette names so legend and colours
# cannot disagree: the hard-coded list started with "Crissum", a label that
# Site_colors does not contain (its first entry is "Anus"); the remaining
# entries and their order are unchanged.
# NOTE(review): confirm which of the two labels the Body_site column uses.
p11 <- p10 +
  new_scale_fill() +
  geom_fruit(
    data = data1,
    geom = geom_tile,
    mapping = aes(y = Strain_Name, fill = Body_site),
    width = 0.05,
    offset = 0.05) +
  scale_fill_manual(name = "11_Body_site",
                    breaks = names(Site_colors),
                    values = Site_colors,
                    guide = guide_legend(override.aes = list(size = 20),
                                         title.theme = element_text(face = "bold", size = 40),
                                         label.theme = element_text(face = "bold", size = 40),
                                         keywidth = 12, keyheight = 12,
                                         order = 11))

# Write the finished figure; limitsize = FALSE allows the 32 x 50 page.
ggsave(p11,filename = "phylogenetic_tree_example.pdf", width = 32.0, height = 50.0, limitsize = FALSE)
```


07_treefile_filter_and_reroot
## Considering the evolutionary characteristics of the PV host species, we rerooted the phylogenetic tree at the avian and reptile PVs with newick_utils (v1.6), after first reducing each node label in the tree file to a single bootstrap value.

# The node labels hold two support values separated by '/'. The two sed
# commands below are alternatives: both write to the same output file, so run
# only the one you need.

# remove the first bootstrap value
sed -E 's/\)[^\/]*\//\)/g' "${input_tree}" > "${output_tree}"

# or remove the second bootstrap value
sed 's/\/[^:]*:/:/g' "${input_tree}" > "${output_tree}"

# reroot the phylogenetic tree
# (replace the quoted placeholder with the label(s) to root the tree on)
nw_reroot "${output_tree}" "the root of phylogenetic tree" > "${output_reroot_tree}"

08_01_remove_gap
## Using hom_sap_HPV16 as the reference, we need an alignment without gaps in the reference genome to call the SNVs of the curated data, so we used trimAl (v1.4.1) to remove the alignment columns that correspond to gaps in hom_sap_HPV16.

# extract reference sequence from the alignment
# (despite its name, seqtk_dir is used as the path of the seqtk executable)
echo "Extract reference sequence from the alignment ..."
echo ""
echo "hom_sap_HPV16" > ref.txt    ##reference_strain
"${seqtk_dir}" subseq "${input_fa}" ref.txt > "${outdir}/${prefix}.reference.fa"
rm -- ref.txt

# step1: extract gap regions from the alignment
"${python_dir}/python3" "${scripts_dir}/03_extra_gap_regions.py" \
    -fa "${outdir}/${prefix}.reference.fa" \
    --output "${outdir}/gap_regions.txt"

# step2: remove gap alignments
# The column list is substituted unquoted on purpose, exactly as before, so it
# is word-split the same way if the file holds more than one token.
# shellcheck disable=SC2046
trimal -in "${input_fa}" -out "${outdir}/${prefix}.rmgap.fa" \
    -selectcols { $(cat "${outdir}/gap_regions.txt") }

08_02_SNV_calling
## SNV calling from the gap-removed alignment FASTA of the curated data with snp-sites (v2.5.1)

# step1: SNV calling from curated genome
snp-sites -mv \
	-o "${prefix}" \
	"${input_fa}"

# step2: Sort with strain order of phylogenetic tree
# NOTE(review): step1 writes with $prefix while this step reads
# ${prefix2}.vcf — confirm that prefix2 points at the VCF produced by step1.
python "${scripts_dir}/04_sort_snpsites_output_vcf.py" \
    -tree_order "${tree_order}" \
    -vcf_in "${prefix2}.vcf" \
    -vcf_out "${prefix2}.sort.vcf"

09_CDS_alignment
## The alignment for coding sequence can better reflect the amino acid mutations caused by SNVs in the coding region.
# Codon-aware alignment of one gene's CDS set with MACSE; writes the
# nucleotide (-out_NT) and amino-acid (-out_AA) alignments.
# The stray trailing double quote of the original command is removed: it opened
# an unterminated string and made the shell wait for more input.
java -jar "${MACSE_dir}/macse.jar" \
   -prog alignSequences \
   -gc_def 1 \
   -seq "${cds}.checked.fa" \
   -out_NT "${gene}.macse_NT.aln.fa" \
   -out_AA "${gene}.macse_AA.aln.fa"

10_Genetic_Background_Analysis (ADMIXTURE)
## For 3548 curated PVs, we used ADMIXTURE (v1.3.0) to generate maximum likelihood estimation of individual ancestries from multilocus SNVs and define 70 clades of phylogenetic tree.

# step1: generate *.ped & *.map file
plink --double-id \
   --vcf "${snp_vcf}" \
   --recode 12 \
   --out "${prefix}"

# step2: search for best K value
# Runs ADMIXTURE with cross-validation for K = 1..50 and keeps each run's
# console output in log<K>.out.
for K in $(seq 1 50)
do
  admixture --cv "${prefix}.ped" "${K}" -j30 | tee "log${K}.out"
done

# step3: generate figure to pick K value by plotQ.R
```{r}
library(pophelper)
library(dplyr)
library(pals)

setwd('/path/to/Program_example/ADMIXTURE')
# Cluster colours handed to plotQ (one entry per ancestry component).
color = c("#ed1299", "#246b93", "#cc8e12", "#d66551", "#ddd53e", "#98FB98", "#8249aa", "#f7aa5d", "#B9D3EE", "#FFC0CB", "#931635", "#373bbf", "#8B636C", "#AFEEEE", "#A8A8A8", "#2E8B57", "#FEEBCD", "#7149af" ,"#57e559" ,"#8e3af4" ,"#f9a270" ,"#22547f", "#db5e92", "#edd05e", "#6f25e8", "#0dbc21", "#280f7a", "#6373ed", "#5b910f" ,"#7b34c1" ,"#0cf29a" ,"#d80fc1", "#dd27ce", "#07a301", "#167275", "#391c82", "#2baeb5","#925bea", "#63ff4f", "#09f9f5","#e86502", "#9ed84e", "#39ba30", "#6ad157", "#d561dd", "#99db27", "#e07233", "#ff523f", "#ce2523","#FFC125")

# Absolute paths of the ADMIXTURE *Q files in ./input. The "input" directory
# has to be part of the path: the original prefixed only getwd(), so the files
# were looked up in the working directory instead of in ./input.
q_names <- list.files("./input", pattern = "Q$")
files <- file.path(getwd(), "input", q_names)
tbls <- readQ(files)

plotQ(tbls,imgoutput="join",clustercol=color, showlegend=TRUE,returndata=FALSE,returnplot=FALSE,
      outputfilename="ADMIXTURE_example", imgtype='pdf')
```

11_Principal_Component_Analysis (PCA)
## SmartPCA (v3.0) was used to run the PCA and explore the spatial relationship among the 3548 curated PVs with respect to different factors (risk, geography, genus, etc.).

# step1: vcf file filtering by plink
plink --vcf "${vcf_f}" \
    --geno "${missing_data_cutoff}" \
    --maf 0.01 \
    --recode \
    --double-id \
    --out "${PCA_prefix}"

# step2: run convertf with its parameter file to prepare the smartpca input
convertf -p "${PCA_prefix}.mask.convertf.par"

# step3: run smartpca (console output is kept as the log file)
smartpca -p "${PCA_prefix}.smartpca.par" > "${PCA_prefix}.smartpca.${flag}.log"

# step4: generate figure by ggplot.R
```{r}
library(tidyverse)
library(ggplot2)
library(ggnewscale)
library(ggstar)

# Working directory holding the example PCA inputs.
setwd("/path/to/Program_example/PCA")
# Named palettes: each name is a category label, each value its colour.
Risk_colors = c("HighRisk" = "red", "LowRisk" = "#B0E2FF", "ProbablyHighRisk" = "orange", "UnClassified" = "grey", "NonHuman" = "#57e559") ##Risk_colors

# NOTE(review): although called Disease_colors, the names are disease types
# (OPSCC, CIN, CSCC, ...).
Disease_colors = c("OPSCC" = "red", "CIN" = "#8e3af4", "BenignDisease" = "#FEEBCD", "ImmuneDisorder" = "#8B636C", "Healthy" = "#B0E2FF", "NA" = "grey", "CSCC" = "orange","NonHuman" = "#57e559") ##Disease_colors

Geographical_colors = c("Africa" = "#cc8e12", "Asia" = "red", "NAmerica" = "#8e3af4", "SAmerica" ="#8B636C", "Australia" = "#FEEBCD", "Europe" = "#FFAEB9", "NA" = "grey","Oceania" = "#3FA9F5", "NonHuman" = "#57e559") ##Geographical_origin_colors

# Unnamed: six colours without category names.
Species_colors = c("red","orange","#FFFF00","#A2CD5A","#3FA9F5",'#8e3af4') ##Species_color

Type_colors = c("Type35" = "#8B636C", "Type16" = "red", "Type18" = "#8e3af4","Type52"="#ddd53e", "Type58" = "orange", "Type6" = "#FFAEB9", "Other_HPV_Types" = "grey", "Type11" = "#3FA9F5", "NonHuman" = "#57e559") ##Type_colors

Genus_colors = c("Beta" = "#ddd53e", "Alpha" = "#cc8e12", "Mu" = "#925bea","Nu"="#FFC125", "Gamma" = "#167275", "Dyoomikron" = "#391c82", "NonPrimate" = "#57e559", "Primate_PVs_UnClassified" = "grey") ##Genus_colors


# NOTE(review): contains both "Skin & Mucosa" and "Skin/Mucosa" — confirm that
# the data really uses two distinct labels.
Tissue_colors = c("Skin" = "#fe5000", "Mucosa" = "#1f78b4","Skin & Mucosa" = "#efdf00","Skin/Mucosa" = "#b2df8a") ##Tissue_colors

Site_colors = c("Crissum" = "#3FA9F5", "Oropharynx" = "red","Epidermis" = "#cc8e12","NonHuman" = "#57e559" ,"NA" = "grey","Genitalia" =  "#8e3af4") ##body_site_colors



# read.xlsx() comes from openxlsx, which the chunk header does not attach.
library(openxlsx)

# Eigenvectors (one row per strain) and eigenvalues written by the PCA step.
prefix <- "PCA_example"
pca <- read.table(paste0(prefix,".eigenvec"))
eigenval <- scan(paste0(prefix,".eigenval"))

# First column is the strain name; the remaining columns are named PC1, PC2, ...
names(pca)[1] <- "Strain_Name"
names(pca)[2:ncol(pca)] <- paste0("PC", 1:(ncol(pca)-1))

# Per-strain annotation sheet, keyed by Strain_Name.
annot <- read.xlsx("PCA_data_example.xlsx", na.strings = NULL)

# Attach one annotation column to the PCA table (left join on Strain_Name, so
# every strain of `pca` is kept).
with_annotation <- function(column) {
  pca %>%
    left_join(annot[c("Strain_Name", column)], by="Strain_Name")
}

pca_species   <- with_annotation("Species_3")
pca_risk      <- with_annotation("Risk")
pca_disease   <- with_annotation("Disease_Type")
pca_geography <- with_annotation("Geographic")
pca_type      <- with_annotation("Type_PCA")
pca_genus     <- with_annotation("Genus_PCA")
pca_tissue    <- with_annotation("Tissue_PCA")
pca_site      <- with_annotation("Body_site")

# first convert to percentage variance explained
# One row per eigenvalue. The PC index is derived from `eigenval` itself rather
# than hard-coded as 1:3548, so the table also builds for data sets with a
# different number of eigenvalues (data.frame() errors on unequal lengths).
pve <- data.frame(PC = seq_along(eigenval), pve = eigenval/sum(eigenval)*100)


# Draw one PC1-vs-PC2 scatter plot coloured by a single annotation column.
#   df       data frame with PC1, PC2 and the annotation column
#   column   name (string) of the annotation column mapped to point colour
#   palette  colour vector handed to scale_color_manual()
# Axis titles show the percentage of variance explained, read from `pve`.
pca_scatter <- function(df, column, palette) {
  ggplot(df, aes(PC1, PC2, col = .data[[column]])) +
    geom_point(size = 4, alpha = 0.7, shape = 16) +
    theme_classic() +
    scale_color_manual(values = palette) +
    coord_equal() +
    xlab(paste0("PC1 (", signif(pve$pve[1], 3), "%)")) +
    ylab(paste0("PC2 (", signif(pve$pve[2], 3), "%)")) +
    labs(colour = column) +   # legend title = plain column name
    theme(axis.title = element_text(size = 42), axis.text = element_text(size = 35),
          legend.title = element_text(size = 42), legend.text = element_text(size = 32))
}

# The species palette is defined as `Species_colors`; R is case-sensitive, so
# the original reference to `species_colors` failed with "object not found".
p1 <- pca_scatter(pca_species,   "Species_3",    Species_colors)
p2 <- pca_scatter(pca_risk,      "Risk",         Risk_colors)
p3 <- pca_scatter(pca_disease,   "Disease_Type", Disease_colors)
p4 <- pca_scatter(pca_geography, "Geographic",   Geographical_colors)
p5 <- pca_scatter(pca_type,      "Type_PCA",     Type_colors)
p6 <- pca_scatter(pca_genus,     "Genus_PCA",    Genus_colors)
p7 <- pca_scatter(pca_tissue,    "Tissue_PCA",   Tissue_colors)
p8 <- pca_scatter(pca_site,      "Body_site",    Site_colors)


# Placing ggplot objects side by side with `|` is provided by patchwork, which
# the chunk header does not attach.
library(patchwork)

# All eight PCA panels in a single row.
p9 <- p1|p2|p3|p4|p5|p6|p7|p8


ggsave(p9,filename = "PCA_example.pdf", width =60.0, height = 35.0, limitsize = FALSE)
```

12_SNV_enrichment_in_clades
## For known SNVs, we need to figure out the enrichment of each SNV in different clades to discover the clade-specific SNVs.
# Enrichment of each SNV per clade; results go to the output directory.
python "${scripts_dir}/09_run_clade_snp_enrich.py" \
	-vcf "${PVs_vcf_file}" \
	-clade "${PVs_clade_file}" \
	-outdir "${output_dir}"

13_Risk_Disease_related_SNV_enrichment
## For known SNVs, we need to figure out the enrichment of each SNV in different risk groups and disease groups to discover the risk-specific and disease-specific SNVs.

# disease related SNV
# (the disease grouping file is passed through the script's -clade option)
python "${scripts_dir}/10_run_disease_snp_enrich.py" \
	-vcf "${PVs_vcf_file}" \
	-clade "${disease_clade_file}" \
	-outdir "${output_dir}"

# risk related SNV
# (the risk grouping file is passed through the script's -clade option)
python "${scripts_dir}/11_run_risk_snp_enrich.py" \
	-vcf "${PVs_vcf_file}" \
	-clade "${risk_clade_file}" \
	-outdir "${output_dir}"


14_SNV_distribution_in_curated_genome
## For risk-specific and disease-specific SNVs discovered above, we want to understand the distribution of specific SNV in the curated genome.
# R.code
```{r}
library(tidyverse)
library(hrbrthemes)
library(viridis)
library(plotly)
library(openxlsx)
library(RColorBrewer)
library(gplots)
library(ComplexHeatmap)

setwd("/path/to/Program_example/SNV_distribution")

# Colour of each nucleotide; "-" is drawn grey.
Nt_color = c('A' = "#2E8B57", 'T' = "red", "C" = "dodgerblue", "G" = "purple", "-" = "grey")

# The first column of the sheet becomes the row names; the remaining columns
# form the character matrix that is plotted.
data <- read.xlsx('SNV_distribution.xlsx')
data <- as.data.frame(data)
rownames(data) <- data[, 1]
data[, 1] <- NULL
data <- as.matrix(data)

pdf("SNV_distribution.pdf", height=2, width=50)

# Rows and columns keep the order of the sheet (no clustering).
p = Heatmap(data,
            name="SNP_Distribution",
            col=Nt_color,
            column_names_rot = 45,
            show_row_names = TRUE,
            row_names_side = "right",
            show_column_names = FALSE,
            column_names_gp=gpar(fontsize=10),
            row_names_gp=gpar(fontsize=10),
            cluster_rows=FALSE,
            cluster_columns=FALSE
            )

# Draw explicitly: a bare `p` is only rendered by top-level autoprint, so the
# PDF would stay empty when this code is run through source() or a function.
draw(p)
dev.off()
```

15_SNV_annotation
##We used Ensembl's Variant Effect Predictor (VEP; ref. 17) to annotate the variants, using hom_sap_HPV16 as the reference.
# Annotate the variants against a custom reference: VEP takes the gene models
# through --gff and the genome sequence through --fasta (--ref_gff is not a VEP
# option). VEP's documentation asks for the GFF to be sorted, bgzip-compressed
# and tabix-indexed.
# NOTE(review): the output is named *.vcf but --vcf is not passed, so VEP
# writes its default tabular format — add --vcf if VCF output is intended.
vep -i "${input_vcf}" \
    --gff ref_genome.gff.gz \
    --fasta ref_genome.fa.gz \
    -o "${output}.vep.vcf" \
    --force_overwrite


16_Linkage_disequilibrium_analysis
##For the known SNVs, we used LDBlockShow (v1.40) to generate linkage disequilibrium (LD) heatmap and annotation results from VCF file to explore the linkage relationship among prognosis-related SNVs.

# step1: prepare the input vcf file for LDBlockShow
python3 -u "${scripts_dir}/12_prepare_LDBlockShow_input_vcf.py" \
    -vcf "${vcf}" \
    -clade "${clade}" \
    -disease "${disease}" \
    -snps "${snps}" \
    -output "${output}" \
    -output2 "${output2}"

# step2: generate linkage disequilibrium (LD) heatmap from VCF file
# Replace the -Region placeholder with the span of the reference genome,
# e.g. HPV16:1:7906.
# The continuation backslash must be the last character of a line: in the
# original, "\" was followed by spaces and a comment on the -Region line, which
# ended the command there and left -OutPng/-SeleVar/-BlockType to run as a
# separate (failing) command.
LDBlockShow \
    -InVCF "${output2}" \
    -InGFF "${gff}" \
    -ShowGWASSpeSNP \
    -MAF 0.01 \
    -Miss 0.25 \
    -OutPut "${output_dir}" \
    -Region "the_length_of_reference_genome" \
    -OutPng \
    -SeleVar 2 \
    -BlockType 1

17_Divergence_time_estimation
##For the divergence time estimation of PVs, we used BEAST 2 as a method to estimate rooted time-measured phylogenetic tree of curated 600PVs using relaxed molecular clock models.

# step1: prepare the input XML file for BEAST with BEAUti
# BEAUti is a graphical user-interface (GUI) application for generating BEAST XML files (Windows, macOS; details in https://www.beast2.org/beauti/)
## parameter setting (details in input_example.xml)
Gamma Category Count: 4
Subst Model: HKY
Clock Model: Relaxed Clock Log Normal
Starting tree: Newick Tree (phylogenetic tree file of 600PVs)
MRCA prior: 6 groups
Chain Length: "200,000,000"
Store Every: "20,000"
Subtree Slide: "0"
Wilson Balding: "0"
Narrow Exchange: "0"
Wide Exchange: "0"

# step2: generate divergence time estimation with BEAST 2 (v2.5.2)
# Runs the XML prepared in step1; the BEAGLE flags request the SSE and GPU
# implementations in double precision.
beast -seed "${seed}" \
 -beagle_SSE \
 -beagle_GPU \
 -beagle_double \
 -loglevel info \
 -prefix "${prefix}" \
 -statefile "${state_file}" \
 -threads "${threads}" \
 "${input_xml_file}"


18_Divergence_time_annotation
##For the "*.trees" file produced by BEAST 2, we used the program "TreeAnnotator" to summarize the information onto the target tree.

# Summarise the BEAST *.trees sample onto the given target tree.
# -lowMem is a switch without a value; the two trailing arguments are the
# input trees file and the output file.
treeannotator -burnin 10 \
    -target "${target_tree_file}" \
    -lowMem \
    "${input_tree_file}" \
    "${output_file}"
