metaWRAP for Cholera
Creators
Description
Prefetch SRA Files
bash
# Download one SRA run by accession; replace the SRRXXXXX placeholder with the
# real SRR number.
prefetch SRRXXXXX
Move .sra Files to a Single Directory
bash
# Gather every .sra file found below the current directory into one folder.
sra_dir=/data/chengz/Cohort/CFS-1/CFS-1-fastq
find . -type f -name '*.sra' -exec mv {} "$sra_dir" \;
Check Folder Size
bash
# Print one summarized (-s), human-readable (-h) size per entry in the current
# directory.
du -sh *
Convert .sra Files to FASTQ Format
bash
# Convert the .sra files in the current directory to FASTQ using 80 threads,
# showing progress. --split-3 writes paired mates to separate _1/_2 files.
fasterq-dump --split-3 --threads 80 --progress *.sra
Quality Control with MetaWRAP
bash
# Run metaWRAP read QC on one paired-end sample from inside the metawrap
# conda environment.
source activate metawrap
metawrap read_qc \
  -1 FLJ2_1.fq \
  -2 FLJ2_2.fq \
  -o READ_QC/FLJ2 \
  -t 50 \
  --skip-bmtagger
--skip-bmtagger: Skips host contamination removal during QC (host reads are removed separately with Bowtie2; see "Remove Host Contamination" below).
Input files (-1 and -2) should follow the sample_1 / sample_2 naming convention (e.g. FLJ2_1.fq and FLJ2_2.fq).
-t 50: Use 50 threads.
Merge FASTQ Files
bash
# Pool the forward (R1) and reverse (R2) reads of all samples into a single
# pair of files under ALL/.
mkdir ALL
for mate in R1 R2; do
  cat *_"${mate}".fastq > "ALL/ALL_READS_${mate}.fastq"
done
Remove Host Contamination
Paired-End Sequencing
bash
# Host decontamination for paired-end libraries.
# Pairs that do not align concordantly to the human index are written to
# <accession>.fastq.1.gz / <accession>.fastq.2.gz in the output directory.
# Replace "..." with the remaining SRR accessions.
host_index=/mnt/database/humanindex/humangenome
clean_dir=/data/chengz/Cohort1/BangladeshControl/Fastq
for acc in SRR26886987 SRR26886988 ...; do
  bowtie2 -p 30 -x "$host_index" \
    -1 "${acc}_1.fastq.gz" -2 "${acc}_2.fastq.gz" \
    -S "${acc}.sam" \
    --un-conc-gz "${clean_dir}/${acc}.fastq.gz"
done
Single-End Sequencing
bash
# Host decontamination for single-end libraries.
# Reads that fail to align to the human index are kept as the cleaned output.
# Unpaired input (-U) needs --un-gz: --un-conc-gz only applies to paired-end
# reads, so with -U it would never write the unaligned reads.
# Replace "..." with the remaining SRR accessions.
for i in SRR26267438 SRR26267439 ...; do
  bowtie2 -p 30 -x /mnt/database/humanindex/humangenome \
    -U "${i}.fastq.gz" -S "${i}.sam" \
    --un-gz "/data/chengz/Cohort1/IndianControl/Fastq/Output/${i}.fastq.gz"
done
Assembly with MetaWRAP
Single-End Assembly
bash
# Assemble one single-end library with MEGAHIT on 96 threads, keeping only
# contigs of at least 800 bp; results go to ASSEMBLY/.
megahit \
  --read SRR15408397_1.fastq \
  --min-contig-len 800 \
  --num-cpu-threads 96 \
  --out-dir ASSEMBLY
Paired-End Assembly
bash
# Assemble every paired-end sample found in the current directory.
# The sample name is the forward-read filename without its .fastq.1.gz suffix.
# NOTE(review): inputs here are gzipped; confirm that this metaWRAP version
# accepts compressed FASTQ.
for fwd in *.fastq.1.gz; do
  sample=${fwd%.fastq.1.gz}
  metawrap assembly \
    -1 "${sample}.fastq.1.gz" \
    -2 "${sample}.fastq.2.gz" \
    -t 50 -l 800 \
    -o "ASSEMBLY/${sample}"
done
Binning with MetaWRAP
Prepare Filenames
bash
# Rename files ending in .fastq.1.gz / .fastq.2.gz to the
# <sample>_1.fastq.gz / <sample>_2.fastq.gz naming convention.
# Arguments: $1 - directory holding the files (default: input_dir)
# Only the trailing suffix is rewritten, so directory names are never altered,
# and all paths are quoted so names containing spaces are handled.
rename_mates() {
  local dir=${1:-input_dir}
  local file mate
  for file in "$dir"/*; do
    for mate in 1 2; do
      if [[ $file == *.fastq.${mate}.gz ]]; then
        mv -- "$file" "${file%.fastq.${mate}.gz}_${mate}.fastq.gz"
      fi
    done
  done
}
rename_mates input_dir
Binning
bash
# Bin each sample's assembly with MetaBAT2, MaxBin2 and CONCOCT.
# The samples array must already hold the sample names.
# NOTE(review): the reads passed here are uncompressed <sample>_1.fastq /
# <sample>_2.fastq; decompress the .fastq.gz files first if needed.
for sample in "${samples[@]}"; do
  assembly="ASSEMBLY/${sample}/final_assembly.fasta"
  metawrap binning \
    --metabat2 --maxbin2 --concoct \
    -a "$assembly" \
    -o "${sample}_BINNING" \
    -t 40 \
    "${sample}_1.fastq" "${sample}_2.fastq"
done
Bin Refinement
bash
# Consolidate the three bin sets of every SRR*_BINNING directory.
# -c 50 / -x 10: minimum completeness 50%, maximum contamination 10%.
for binning_dir in SRR*_BINNING; do
  metawrap bin_refinement \
    -A "${binning_dir}/metabat2_bins/" \
    -B "${binning_dir}/maxbin2_bins/" \
    -C "${binning_dir}/concoct_bins/" \
    -c 50 -x 10 \
    -t 30 \
    -o "${binning_dir}_REBIN"
done
Extract and Rename Refined Bins
bash
# Collect the refined bins from every sample into one directory.
# metaWRAP names bins bin.1.fa, bin.2.fa, ... inside each sample's output, so a
# plain copy would overwrite same-named bins from other samples. Each copy is
# therefore prefixed with its top-level (SRR*) directory name.
out_dir=/path/to/Output
find SRR*/metawrap_50_10_bins/ -type f -name "*.fa" -print0 |
  while IFS= read -r -d '' bin; do
    cp -- "$bin" "${out_dir}/${bin%%/*}_${bin##*/}"
  done
Remove Redundant Bins with dRep
Taxonomic Classification with GTDB-Tk
bash
# Classify all bins (*.fa) with GTDB-Tk from inside the gtdbtk conda
# environment. The job runs in the background under nohup.
source activate gtdbtk
nohup gtdbtk classify_wf \
  --genome_dir /data/chengz/Cohort1/TotalBin/BinALL \
  --out_dir /data/chengz/Cohort1/TotalBin/gtdbtkALL \
  --extension fa \
  --skip_ani_screen \
  --cpus 16 &
Files
metaWRAP.txt
Files
(2.9 kB)
Name | Size | Download all |
---|---|---|
md5:46a2edc692148c2e0fa6e1722b1dd1d5
|
2.9 kB | Preview Download |
Additional details
References
- Chen S, Zhou Y, Chen Y, Gu J. fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics. 2018; 34(17): i884-i890
- Uritskiy GV, DiRuggiero J, Taylor J. MetaWRAP-a flexible pipeline for genome-resolved metagenomic data analysis. Microbiome. 2018; 6(1):158-170
- Chaumeil PA, Mussig AJ, Hugenholtz P, Parks DH. GTDB-Tk v2: memory friendly classification with the genome taxonomy database. Bioinformatics. 2022; 38(23):5315-5316
- Hyatt D, Chen GL, Locascio PF, Land ML, Larimer FW, Hauser LJ. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010; 11: 119-129