#!/usr/bin/env bash # Stop script on any error. set -ue # If you set the so called BLASTDB variable # blast will find the files there automatically. # This is handy as it makes for shorter commands. # This will be the directory we store BLAST databases in. export BLASTDB=~/refs/blastdb # Make the BLAST database directory. mkdir -p $BLASTDB # Temporarily switch to the BLASTDB directory and run the blast database updater. # It will only update the data if it is incomplete or has changed. (cd $BLASTDB && update_blastdb.pl --decompress 16SMicrobial) # # By default the blastdbcmd produces FASTA output # # When we know the accession number we can query for one specific entry. echo "" echo "*** The first ten lines of accession number NR_026093" blastdbcmd -db 16SMicrobial -entry NR_026093 | head # We can limit the sequence length. echo "" echo "*** Limit to the first 20 bases:" blastdbcmd -db 16SMicrobial -entry NR_026093 -range 1-20 # We can also extract all entries from the database. echo "" echo "*** Dump all sequences of the database into file all.fa" blastdbcmd -db 16SMicrobial -entry all > all.fa # We can also format the output using a format string. # # See blastdbcmd -help for the -outfmt parameter # echo "" echo "*** Format the output using a format string: accession number and sequence length" blastdbcmd -db 16SMicrobial -entry all -range 1-20 -outfmt "%a %l" > acc.txt # Show the first ten lines. cat acc.txt | head # How many genes are there echo "" echo "*** How many database sequences in the database" blastdbcmd -db 16SMicrobial -entry all -range 1-20 -outfmt "%a" | wc -l # Since these are 16SMicrobial genes that are highly conserved # let's find out the variety we can see within 20 bases. # What are the top ten most common gene starts and how many of each? echo "" echo "*** Top 10 common gene sequence starts:" blastdbcmd -db 16SMicrobial -entry all -range 1-20 -outfmt "%s" | sort | uniq -c | sort -rn > common.txt # Show the first ten common starts. cat common.txt | head # # How to we build a new blast database from sequences. # # Use the all.fa file to create a new blast database. # # Create a directory that will store the blast database. mkdir -p xdb # Generate the new blast database echo "" echo "*** Building the blast database" makeblastdb -dbtype nucl -in all.fa -parse_seqids -out xdb/my16S # Demonstrates how the new database has the same content" echo "" echo "*** Top 10 common gene sequence starts using the newly created database:" blastdbcmd -db xdb/my16S -entry all -range 1-20 -outfmt "%s" | sort | uniq -c | sort -rn | head