# Stop on errors. Show the commands as these are executed. set -uex # The reference genome. ACC=AF086833 # Select sequencing run. SRR=SRR1972739 # The directory that stores the reference file. REF=ref # GenBank format stores all information. GBK=$REF/$ACC.gb # The FASTA format contains the sequence only. FASTA=$REF/$ACC.fa # The GFF format contains the intervals only. GFF=$REF/$ACC.gff # The alignment file. BAM=$SRR.bam # The resulting VCF file. VCF=$SRR.vcf # The logfile of the run. LOG=runlog.txt # Make the directory for the reference. mkdir -p $REF # Fetch the sequence from NCBI. efetch -db nuccore -format gb -id $ACC > $GBK # Convert genbank to fasta. cat $GBK | seqret -filter -osformat2 fasta > $FASTA # Convert genbank to GFF. Keep only the first 59 lines... (see later why) cat $GBK | seqret -filter -feature -osformat2 gff | head -59 > $GFF # # Above is demonstrates the ridiculous side of bioinformatics, where you have to # bend backwards to do something that should just work in the first place. # # The seqret transformation also adds the sequence information # into the GFF file at the end which is correct but usually not present in most GFF files. # For that reason most tools expect the GFF to be tabular all the way # an will fail on this on this valid GFF file. # # So we need to keep only the first N lines up to the point where the # word FASTA appears in the GFF file. You can find that line by: # # cat -n $GFF | grep ##FASTA # # This shows 59 hence why we keep only the first 59 lines. # Generate the bwa index. bwa index $FASTA 2>> $LOG # Index the reference genome samtools faidx $FASTA # Obtain the sequencing data. fastq-dump -X 10000 --split-files $SRR 2>> $LOG 1>> $LOG # Align to reference. bwa mem $FASTA ${SRR}_1.fastq ${SRR}_2.fastq 2>> $LOG | samtools sort > $BAM 2>> $LOG # Index the BAM file. samtools index $BAM # Coverage at every single base according to samtools samtools depth $BAM > $SRR-samtools-coverage.txt # Coverage over just a region samtools depth -r AF086833:1000-1005 $BAM > $SRR-samtools-coverage-query.txt # Generates a histogram of how many bases # are covered at a given depth. # Reports both per chromosome and genome wide. # In this case they are the same since we have only one chromosome. bedtools genomecov -ibam $BAM > $SRR-bedtools-histogram.txt # We will now compute coverages over the intervals. # Adds a number at the end (column 10) that indicates how many reads cover that feature. bedtools multicov -bams $SRR.bam -bed $GFF > $SRR-multicov-coverage.txt # Adds a number at the end that indicates how many reads on the same strand cover that feature. bedtools multicov -s -bams $BAM -bed $GFF > $SRR-multicov-coverage-stranded.txt