set -uex

# Input file that contains the accession numbers.
ACC=FILENAME

# Use taxonomy-specific files to build the custom database.
TAXDIR=~/refs/taxonomy

# Files used in generating the Centrifuge index.
TABLE=$TAXDIR/table.txt
NODES=$TAXDIR/nodes.dmp
NAMES=$TAXDIR/names.dmp

# Make the taxonomy directory.
mkdir -p $TAXDIR

#
# Prepare the taxonomy data for Centrifuge.
# This needs to be done separately and only once.
#

# Download the taxonomy files from NCBI.
# (cd $TAXDIR && wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz)
# (cd $TAXDIR && wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/nucl_gb.accession2taxid.gz)

# Uncompress the taxonomy files.
# (cd $TAXDIR && tar -xzvf $TAXDIR/taxdump.tar.gz)
# (cd $TAXDIR && gunzip nucl_gb.accession2taxid.gz)

# Create the conversion table (accession to taxid mapping).
# cut -f 2,3 $TAXDIR/nucl_gb.accession2taxid > $TABLE

# How many reads to simulate.
N=1000

# Length of the reads.
L=100

# Read error rate.
E=0.02

# Store data in these locations.
mkdir -p input output code

# The FASTA file storing the sequences for the accession numbers.
FASTA=input/reference.fa

# The FASTQ files storing the reads simulated from the reference.
R1=input/r1.fq
R2=input/r2.fq

# The accession taxonomy file.
TAXONOMY=input/taxonomy.txt

# The Centrifuge classification report.
REPORT=output/centrifuge-report.txt

# The Kraken-style report.
KREPORT=output/kraken-report.txt

# The Centrifuge output.
OUTPUT=output/centrifuge-output.txt

# The Centrifuge accuracy report.
ACCURACY=accuracy-centrifuge.txt

# Copy the accession file into the input folder to aid reproducibility.
cp $ACC input/accessions.txt

# Get the sequences from NCBI.
epost -db nuccore -input $ACC | efetch -format fasta > $FASTA

# Identify the taxonomy for each accession number.
epost -db nuccore -input $ACC | esummary | xtract -pattern DocumentSummary -element Caption,TaxId,Title > $TAXONOMY

# The prefix under which the Centrifuge index is stored.
INDEX=index/db

# Make the index directory.
mkdir -p index

# Build the Centrifuge index.
centrifuge-build -p 2 --conversion-table $TABLE --taxonomy-tree $NODES --name-table $NAMES $FASTA $INDEX >> runlog.txt

# Install the plac command-line parser (used by the validator script).
pip install plac -q

# Clone the read simulator. Must use wgsim2.
git clone --depth=1 https://github.com/ialbert/wgsim2.git 2> log.txt

# Compile the read simulator.
(cd wgsim2 && make 2> log.txt)

# Simulate reads with errors in fixed N mode.
wgsim2/wgsim2 -f -r 0 -R 0 -N $N -e $E -1 $L -2 $L $FASTA $R1 $R2 > log.txt

# Run Centrifuge.
centrifuge -x $INDEX -1 $R1 -2 $R2 -S $OUTPUT --report-file $REPORT

# Generate a Kraken-style report (written to a separate file so that it
# does not overwrite the Centrifuge report above).
centrifuge-kreport -x $INDEX $OUTPUT > $KREPORT

# The location of the code that validates the reads.
URL=https://raw.githubusercontent.com/biostars/biocode/master/scripts/classify/validate.py

# Get the code.
curl $URL > code/validate.py

# Run the validator.
python code/validate.py -f $OUTPUT -t $TAXONOMY -c $N > $ACCURACY
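
#
# Optional sanity check (an addition to the original protocol, a minimal
# sketch): every accession in the input file should have produced one row
# in the taxonomy file; a mismatch means some accessions did not resolve
# to a taxid.
#
echo "Accessions: $(wc -l < $ACC)  Taxonomy rows: $(wc -l < $TAXONOMY)"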
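
#
# Optional: show the resulting reports (another addition, not part of the
# original pipeline). Prints the first lines of the Kraken-style report
# and of the accuracy summary produced by the validator.
#
head $KREPORT $ACCURACY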