Download Sequences from GenBank, Keeping Only Codons

02 Oct 2009
Posted by Jeet Sukumaran

The following script takes a space separated list of GenBank numbers as input, and then uses BioPython to download the corresponding sequences from GenBank, strips off all non-coding nucleotides, gives the sequences sensible names, and assembles them into a FASTA file.

It is pretty basic, and does not do a lot of fancy error checking, and is probably a little too specific to be useful for most people. I can imagine extending it in a number of ways that would make it much more useful in a number of different contexts. For example, parsing the GenBank data into multiple FASTA files on a gene-by-gene basis. Nonetheless, despite its dubious general utility, I'm still posting it here. This is partly so that other people can use it as a starting point to develop scripts more applicable for their own work, but also so that I myself do not have to wade through several (very useful!) BioPython tutorials if I ever want to do this again.

#! /usr/bin/env python
 
###############################################################################
##  get-codons.py
##
##  Copyright 2009 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################
 
"""
Downloads nucleotide sequences from GenBank, trimming out non-coding DNA.
"""
 
from optparse import OptionGroup
from optparse import OptionParser
import os
import sys
import textwrap
 
import StringIO
 
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
 
_prog_usage = '%prog [options] <GENBANK ACCESSION NUMBERS>'
_prog_version = 'GET-CODONS Version 1.0'
_prog_description = 'Downloads nucleotide sequences from GenBank, trimming out non-coding DNA.'
_prog_author = 'Jeet Sukumaran'
_prog_copyright = 'Copyright (C) 2009 Jeet Sukumaran.'
 
class Gene(object):
 
    def __init__(self, seq_record, feature):
        self.gb_num = seq_record.id
        self.type = feature.type
        self.start = feature.location.start.position
        self.end = feature.location.end.position
        self.seq = seq_record.seq[self.start:self.end]
        try:        
            self.name = feature.qualifiers['gene'][0]
        except KeyError:
            self.name = None
        try:            
            self.codon_start = feature.qualifiers['codon_start'][0]
        except KeyError:
            self.codon_start = None
        try:            
            self.translation = feature.qualifiers['translation'][0]
        except KeyError:
            self.translation = None
 
    def __len__(self):
        return len(self.seq)
 
    def chars(self):
        return str(self.seq)
 
def main():
    """
    Main CLI handler.
    """
 
    parser = OptionParser(usage=_prog_usage, 
        add_help_option=True, 
        version=_prog_version, 
        description=_prog_description)    
 
    parser.add_option('-o', '--output',
        action='store_const',
        dest='output',
        metavar='PREFIX',
        help='prefix for output file names')
 
    parser.add_option('-w', '--wrap',
        action='store_true',
        dest='wrap',
        default=False,
        help='wrap sequences (slows things down ...)')          
 
    (opts, args) = parser.parse_args()
 
    if len(args) == 0:
        args = sys.stdin.read()
        args = [a for a in args.split("\n") if a]
 
    if opts.output is None:
        out = sys.stdout
    else:
        out = open(opts.output, "w")
 
    gb_recs = Entrez.efetch(db="nucleotide", rettype="gb", id=",".join(args))
 
    for seq_record in SeqIO.parse(gb_recs, "genbank"):
        dp = seq_record.description.split(' ')
        label = seq_record.id + "_" + dp[0] + "_" + dp[1]      
        sys.stderr.write("[%s: %s]\n" % (seq_record.id, seq_record.description))
        sys.stderr.write("%10s    % 7s    % 7s    % 8s\n" \
                    % ("Gene",
                       "Start",
                       "End",
                       "Length"))
        codons = StringIO.StringIO()
        for f in seq_record.features:
            if f.type == "CDS":
                g = Gene(seq_record, f)
                sys.stderr.write("%10s    % 7d    % 7d    % 8d\n" \
                    % (g.name,
                       g.start,
                       g.end,
                       len(g)))
                codons.write(g.chars())
 
        out.write(">%s\n" % label)
        codons = codons.getvalue()
        if opts.wrap:
            out.write("%s\n\n" % textwrap.fill(codons, width=76))
        else:
            out.write("%s\n\n" % codons)
 
if __name__ == '__main__':
    main()

Post new comment

The content of this field is kept private and will not be shown publicly.
CAPTCHA
This question is for testing whether you are a biological visitor and to prevent automated spam submissions.
Image CAPTCHA
Enter the characters shown in the image.