#
# The script assumes that reference data have been already installed with
#
# http://data.biostarhandbook.com/redo/zika/zika-references.sh
#

# Stop on errors.
set -uex

# Set the project name.
PROJECT=PRJNA313294

# Shortcut to annotations
GTF=~/refs/Homo_sapiens.GRCh38.96.chr.gtf

# Full run information.
RUNINFO=runinfo.all.csv

# Run info for single end reads.
SINGLE=runinfo.single.csv

# Run information for paired end reads.
PAIRED=runinfo.paired.csv

# How many reads per sample to unpack.
READNUM=1000000

# The Ensembl transcript file
# The file may be downloaded with
# http://data.biostarhandbook.com/redo/zika/zika-references.sh
REF=~/refs/Homo_sapiens.GRCh38.cdna.all.fa
IDX=~/refs/Homo_sapiens.GRCh38.cdna.all.idx

# Downloading the run information.
esearch -db sra -query $PROJECT | efetch -format runinfo > $RUNINFO

# Separate SRR numbers for the single end files.
cat $RUNINFO | grep  SRR.*SINGLE | cut -f 1 -d , > $SINGLE

# Separate SRR numbers for the paired end files.
cat $RUNINFO | grep  SRR.*PAIRED | cut -f 1 -d , > $PAIRED

# Download and unpack single end reads.
cat $SINGLE  | parallel fastq-dump -X $READNUM -O reads >> log.txt

# Download and unpack paired end reads
cat $PAIRED  | parallel fastq-dump -X $READNUM --split-files -O reads >> log.txt

# Create the kallisto index (this task only needs to be done once!)
# Comment it out for subsequent runs.
kallisto index -i $IDX $REF

# Store results files here.
mkdir -p results

# Run kallisto on single end reads.
cat $SINGLE | parallel kallisto quant -i $IDX -o results/{} --single -l 187 -s 70 reads/{}.fastq 2>> log.txt

# Run kallisto on paired end reads.
cat $PAIRED | parallel kallisto quant -i $IDX -o results/{} -l 187 -s 70 reads/{}_1.fastq reads/{}_2.fastq 2>> log.txt

# Rename the files by the sample.
cat $SINGLE $PAIRED | parallel "cut -f 1,4 results/{}/abundance.tsv > results/{}.txt"

# These are the control samples.
#MOCK samples are SRR3191542 SRR3191543 SRR3194428 SRR3194429
paste results/SRR3191542.txt results/SRR3191543.txt results/SRR3194428.txt results/SRR3194429.txt | cut -f 1,2,4,6,8 > mock.txt

# The infected ZIKV samples.
# ZIKV SRR3191544 SRR3191545 SRR3194430 SRR3194431
paste results/SRR3191544.txt results/SRR3191545.txt results/SRR3194430.txt results/SRR3194431.txt | cut -f 1,2,4,6,8 > zikv.txt

# Combine the mock and zikv.
# The header need to be edited by hand to reflect the true sample names.
paste mock.txt zikv.txt | cut -f 1-5,7-11 > counts.txt

# Filter the file for low expression levels.
cat counts.txt | awk ' ($2+$3+$4+$5+$6+$7+$8+$9) > 25 { print $0 } ' > over25.txt

# Reformat the data to have integers instead of float numbers.
cat over25.txt | awk ' { printf("%s\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\n", $1, $2, $3, $4, $5, $6, $7, $8, $9) } ' > valid.txt

# Get the script that runs the deseq 1 method.
wget -q -nc  http://data.biostarhandbook.com/rnaseq/code/deseq1.r

# Perform the differential expression study.
cat valid.txt | Rscript deseq1.r 4x4 > results.txt 2>> log.txt

# Download the script to draw the heatmaps (requires the glplots package).
wget -q -nc http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r

# Generate heatmap from the deseq1 normalized matrix.
cat norm-matrix-deseq1.txt | Rscript draw-heatmap.r > heatmap-norm-matrix-deseq1.pdf  2>> log.txt