# Stop on errors.
set -uexo pipefail

# The reference genome.
ACC=CP000253

# Select sequencing run.
SRR=SRR397558

# How many sequencing reads to operate on initially.
N=250000

# The directory that stores the reference sequence.
mkdir -p ref

# The file that stores the reference genome in GENBANK format.
GB=ref/${ACC}.gb

# The file that stores the reference genome in FASTA format.
REF=ref/${ACC}.fa

# The file that store the annotation in GFF format.
GFF=ref/${ACC}.gff

# Download the reference accession number as GENBANK.
# We will then format as FASTA and GFF

# We need to rename the sequence to match the locus of the GFF format.
efetch -db nuccore -id $ACC -format gb > $GB

# Reformat GenBank to FASTA.
cat $GB | readseq -p -f fasta > $REF 2>> log.txt

# Reformat GenBank to GFF. Keep only lines that match the word CDS.
cat $GB | readseq -p -f gff 2>> log.txt | grep CDS > $GFF

# Index the reference with BWA
bwa index $REF 1>> log.txt 2>> log.txt

# Index the referene with samtools to visualize in IGV.
samtools faidx $REF

# The directory that stores the sequencing reads.
mkdir -p reads

# Download the sequencing data.
fastq-dump -X $N --origfmt --split-files -O reads $SRR

# Generate a report on the sequencing data size.
echo "*** Sequence statistics"
seqkit stat reads/${SRR}_?.fastq
echo "*** "

# Simulate reads from the data.
# We do this to contrast "non-stranded" sequencing data to "stranded" library prep
# data so that you can better understand what "stranded" RNA-Seq looks like.

# Shortcut names of pairs for the simulation.
SIMR1=reads/sim1.fastq
SIMR2=reads/sim2.fastq

# Simulate reads from the reference.
wgsim -e 0 -r 0 -R 0 -X 0 -N $N $REF ${SIMR1} ${SIMR2} 2>> log.txt

# Align the simulated data.
# First we align only one of the pairs. With that establish whether there is a directionality to the data
bwa mem $REF $SIMR1 2>> log.txt | samtools sort > sim1.bam 2>> log.txt

# Index the simulated alignment file.
samtools index sim1.bam

# We now take the real data.

# Shortcut names of real sequencing data pairs.
R1=reads/${SRR}_1.fastq
R2=reads/${SRR}_2.fastq

# The name of alignment file.
BAM=real1.bam

# Align the first in pair.
bwa mem $REF  $R1 2>> log.txt | samtools sort > $BAM 2>> log.txt

# Index the real1.bam file.
samtools index real1.bam

# Investigate, visualize and compare real1.bam  to the sim1.bam.
# What can you tell about it.

#
# Lets find anti-sense transcripts.
#
# From the view you can note how the first in pair aligns
# against the sense of the original transcript. Hence, antisense transcript
# will map in the same (!) orientation as the feature.

# We'll demonstrate the forward strand first.

# Create a feature file that contain the features on the forward strand only.
# The seventh column of the GFF file is the strand.
cat $GFF | awk '$7 == "+" { print $0 }' > forward_cds.gff

# Intersect the bam file with the forward facing features.
bedtools intersect -a $BAM -b forward_cds.gff > forward_overlap.bam

# The reads that support sense transcription.
samtools view -b -F 4 -f 16 forward_overlap.bam > forward_sense.bam

# The reads that support anti sense transcription.
samtools view -b -F 4 -F 16 forward_overlap.bam > forward_antisense.bam

# Create an index for every new bam file in the directory.
ls -1 forward_*.bam | xargs -n 1 samtools index

# Produce a coverage files that let us see coverages when not zoomed in.
bedtools genomecov -ibam forward_sense.bam  -bg > forward_sense.bedgraph
bedtools genomecov -ibam forward_antisense.bam  -bg > forward_antisense.bedgraph

# The full solution is now to generalize to paired end alignments.
# Select so that both pairs are in the correct orientation.