set -uex

# Input file that contains the accession numbers.
ACC=FILENAME

# Use taxonomy-specific files to build the custom database.
TAXDIR=~/refs/taxonomy

# Files used in generating the Centrifuge index.
TABLE=$TAXDIR/table.txt
NODES=$TAXDIR/nodes.dmp
NAMES=$TAXDIR/names.dmp

# Make the taxonomy directory.
mkdir -p $TAXDIR

#
# Prepare the taxonomy data for Centrifuge.
# This needs to be done separately and only once.
#

# Download the taxonomy files from NCBI.
# (cd $TAXDIR && wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz)
# (cd $TAXDIR && wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz)

# Uncompress the taxonomy files.
# (cd $TAXDIR && tar -xzvf $TAXDIR/taxdump.tar.gz)
# (cd $TAXDIR && gunzip nucl_gb.accession2taxid.gz)

# Create the conversion table (accession to taxid mapping).
# cut -f 2,3 $TAXDIR/nucl_gb.accession2taxid > $TABLE

# How many reads to simulate.
N=1000

# Length of the reads.
L=100

# Read error rate.
E=0.02

# Store data in these locations.
mkdir -p input output code

# The FASTA file storing the sequences for the accession numbers.
FASTA=input/reference.fa

# The FASTQ files storing the reads simulated from the reference.
R1=input/r1.fq
R2=input/r2.fq

# The accession taxonomy file.
TAXONOMY=input/taxonomy.txt

# The Centrifuge classification report.
REPORT=output/centrifuge-report.txt

# The Kraken-style report.
KREPORT=output/kraken-report.txt

# The Centrifuge output.
OUTPUT=output/centrifuge-output.txt

# The Centrifuge accuracy report.
ACCURACY=accuracy-centrifuge.txt

# Copy the accession file into the input folder to aid reproducibility.
cp $ACC input/accessions.txt

# Get the sequences from NCBI.
epost -db nuccore -input $ACC | efetch -format fasta > $FASTA

# Identify the taxonomy for each accession number.
epost -db nuccore -input $ACC | esummary | xtract -pattern DocumentSummary -element Caption,TaxId,Title > $TAXONOMY

# The prefix under which the Centrifuge index is stored.
INDEX=index/db

# Make the index directory.
mkdir -p index

# Build the Centrifuge index.
centrifuge-build -p 2 --conversion-table $TABLE --taxonomy-tree $NODES --name-table $NAMES $FASTA $INDEX >> runlog.txt

# Install the plac command-line parser (used by the validator script).
pip install plac -q

# Clone the read simulator. Must use wgsim2.
git clone --depth=1 https://github.com/ialbert/wgsim2.git 2> log.txt

# Compile the read simulator.
(cd wgsim2 && make 2> log.txt)

# Simulate reads with errors in fixed N mode.
wgsim2/wgsim2 -f -r 0 -R 0 -N $N -e $E -1 $L -2 $L $FASTA $R1 $R2 > log.txt

# Run Centrifuge.
centrifuge -x $INDEX -1 $R1 -2 $R2 -S $OUTPUT --report-file $REPORT

# Generate a Kraken-style report (written to a separate file so that it
# does not overwrite the Centrifuge report above).
centrifuge-kreport -x $INDEX $OUTPUT > $KREPORT

# The location of the code that validates the reads.
URL=https://raw.githubusercontent.com/biostars/biocode/master/scripts/classify/validate.py

# Get the code.
curl $URL > code/validate.py

# Run the validator.
python code/validate.py -f $OUTPUT -t $TAXONOMY -c $N > $ACCURACY
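
#
# Optional sanity check (an addition to the original protocol, a minimal
# sketch): every accession in the input file should have produced one row
# in the taxonomy file; a mismatch means some accessions did not resolve
# to a taxid.
#
echo "Accessions: $(wc -l < $ACC)  Taxonomy rows: $(wc -l < $TAXONOMY)"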
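
#
# Optional: show the resulting reports (another addition, not part of the
# original pipeline). Prints the first lines of the Kraken-style report
# and of the accuracy summary produced by the validator.
#
head $KREPORT $ACCURACY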