# # The script assumes that reference data have been already installed with # # http://data.biostarhandbook.com/redo/zika/zika-references.sh # # Stop on errors. set -uex # Set the project name. PROJECT=PRJNA313294 # Shortcut to annotations GTF=~/refs/Homo_sapiens.GRCh38.96.chr.gtf # Full run information. RUNINFO=runinfo.all.csv # Run info for single end reads. SINGLE=runinfo.single.csv # Run information for paired end reads. PAIRED=runinfo.paired.csv # How many reads per sample to unpack. READNUM=1000000 # The Ensembl transcript file # The file may be downloaded with # http://data.biostarhandbook.com/redo/zika/zika-references.sh REF=~/refs/Homo_sapiens.GRCh38.cdna.all.fa IDX=~/refs/Homo_sapiens.GRCh38.cdna.all.idx # Downloading the run information. esearch -db sra -query $PROJECT | efetch -format runinfo > $RUNINFO # Separate SRR numbers for the single end files. cat $RUNINFO | grep SRR.*SINGLE | cut -f 1 -d , > $SINGLE # Separate SRR numbers for the paired end files. cat $RUNINFO | grep SRR.*PAIRED | cut -f 1 -d , > $PAIRED # Download and unpack single end reads. cat $SINGLE | parallel fastq-dump -X $READNUM -O reads >> log.txt # Download and unpack paired end reads cat $PAIRED | parallel fastq-dump -X $READNUM --split-files -O reads >> log.txt # Create the kallisto index (this task only needs to be done once!) # Comment it out for subsequent runs. kallisto index -i $IDX $REF # Store results files here. mkdir -p results # Run kallisto on single end reads. cat $SINGLE | parallel kallisto quant -i $IDX -o results/{} --single -l 187 -s 70 reads/{}.fastq 2>> log.txt # Run kallisto on paired end reads. cat $PAIRED | parallel kallisto quant -i $IDX -o results/{} -l 187 -s 70 reads/{}_1.fastq reads/{}_2.fastq 2>> log.txt # Rename the files by the sample. cat $SINGLE $PAIRED | parallel "cut -f 1,4 results/{}/abundance.tsv > results/{}.txt" # These are the control samples. #MOCK samples are SRR3191542 SRR3191543 SRR3194428 SRR3194429 paste results/SRR3191542.txt results/SRR3191543.txt results/SRR3194428.txt results/SRR3194429.txt | cut -f 1,2,4,6,8 > mock.txt # The infected ZIKV samples. # ZIKV SRR3191544 SRR3191545 SRR3194430 SRR3194431 paste results/SRR3191544.txt results/SRR3191545.txt results/SRR3194430.txt results/SRR3194431.txt | cut -f 1,2,4,6,8 > zikv.txt # Combine the mock and zikv. # The header need to be edited by hand to reflect the true sample names. paste mock.txt zikv.txt | cut -f 1-5,7-11 > counts.txt # Filter the file for low expression levels. cat counts.txt | awk ' ($2+$3+$4+$5+$6+$7+$8+$9) > 25 { print $0 } ' > over25.txt # Reformat the data to have integers instead of float numbers. cat over25.txt | awk ' { printf("%s\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\t%4.0f\n", $1, $2, $3, $4, $5, $6, $7, $8, $9) } ' > valid.txt # Get the script that runs the deseq 1 method. wget -q -nc http://data.biostarhandbook.com/rnaseq/code/deseq1.r # Perform the differential expression study. cat valid.txt | Rscript deseq1.r 4x4 > results.txt 2>> log.txt # Download the script to draw the heatmaps (requires the glplots package). wget -q -nc http://data.biostarhandbook.com/rnaseq/code/draw-heatmap.r # Generate heatmap from the deseq1 normalized matrix. cat norm-matrix-deseq1.txt | Rscript draw-heatmap.r > heatmap-norm-matrix-deseq1.pdf 2>> log.txt