# Stop on errors. set -uexo pipefail # The reference genome. ACC=CP000253 # Select sequencing run. SRR=SRR397558 # How many sequencing reads to operate on initially. N=250000 # The directory that stores the reference sequence. mkdir -p ref # The file that stores the reference genome in GENBANK format. GB=ref/${ACC}.gb # The file that stores the reference genome in FASTA format. REF=ref/${ACC}.fa # The file that store the annotation in GFF format. GFF=ref/${ACC}.gff # Download the reference accession number as GENBANK. # We will then format as FASTA and GFF # We need to rename the sequence to match the locus of the GFF format. efetch -db nuccore -id $ACC -format gb > $GB # Reformat GenBank to FASTA. cat $GB | readseq -p -f fasta > $REF 2>> log.txt # Reformat GenBank to GFF. Keep only lines that match the word CDS. cat $GB | readseq -p -f gff 2>> log.txt | grep CDS > $GFF # Index the reference with BWA bwa index $REF 1>> log.txt 2>> log.txt # Index the referene with samtools to visualize in IGV. samtools faidx $REF # The directory that stores the sequencing reads. mkdir -p reads # Download the sequencing data. fastq-dump -X $N --origfmt --split-files -O reads $SRR # Generate a report on the sequencing data size. echo "*** Sequence statistics" seqkit stat reads/${SRR}_?.fastq echo "*** " # Simulate reads from the data. # We do this to contrast "non-stranded" sequencing data to "stranded" library prep # data so that you can better understand what "stranded" RNA-Seq looks like. # Shortcut names of pairs for the simulation. SIMR1=reads/sim1.fastq SIMR2=reads/sim2.fastq # Simulate reads from the reference. wgsim -e 0 -r 0 -R 0 -X 0 -N $N $REF ${SIMR1} ${SIMR2} 2>> log.txt # Align the simulated data. # First we align only one of the pairs. With that establish whether there is a directionality to the data bwa mem $REF $SIMR1 2>> log.txt | samtools sort > sim1.bam 2>> log.txt # Index the simulated alignment file. samtools index sim1.bam # We now take the real data. # Shortcut names of real sequencing data pairs. R1=reads/${SRR}_1.fastq R2=reads/${SRR}_2.fastq # The name of alignment file. BAM=real1.bam # Align the first in pair. bwa mem $REF $R1 2>> log.txt | samtools sort > $BAM 2>> log.txt # Index the real1.bam file. samtools index real1.bam # Investigate, visualize and compare real1.bam to the sim1.bam. # What can you tell about it. # # Lets find anti-sense transcripts. # # From the view you can note how the first in pair aligns # against the sense of the original transcript. Hence, antisense transcript # will map in the same (!) orientation as the feature. # We'll demonstrate the forward strand first. # Create a feature file that contain the features on the forward strand only. # The seventh column of the GFF file is the strand. cat $GFF | awk '$7 == "+" { print $0 }' > forward_cds.gff # Intersect the bam file with the forward facing features. bedtools intersect -a $BAM -b forward_cds.gff > forward_overlap.bam # The reads that support sense transcription. samtools view -b -F 4 -f 16 forward_overlap.bam > forward_sense.bam # The reads that support anti sense transcription. samtools view -b -F 4 -F 16 forward_overlap.bam > forward_antisense.bam # Create an index for every new bam file in the directory. ls -1 forward_*.bam | xargs -n 1 samtools index # Produce a coverage files that let us see coverages when not zoomed in. bedtools genomecov -ibam forward_sense.bam -bg > forward_sense.bedgraph bedtools genomecov -ibam forward_antisense.bam -bg > forward_antisense.bedgraph # The full solution is now to generalize to paired end alignments. # Select so that both pairs are in the correct orientation.