Download Sequences from GenBank, Keeping Only Codons

The following script takes a space separated list of GenBank numbers as input, and then uses BioPython to download the corresponding sequences from GenBank, strips off all non-coding nucleotides, gives the sequences sensible names, and assembles them into a FASTA file. It is pretty basic, and does not do a lot of fancy error checking, and is probably a little too specific to be useful for most people. I can imagine extending it in a number of ways that would make it much more useful in a number of different contexts. For example, parsing the GenBank data into multiple FASTA files on a gene-by-gene basis. Nonetheless, despite its dubious general utility, I’m still posting it here. This is partly so that other people can use it as a starting point to develop scripts more applicable for their own work, but also so that I myself do not have to wade through several (very useful!) BioPython tutorials if I ever want to do this again.

class Gene(object):

    def __init__(self, seq_record, feature):
        self.gb_num = seq_record.id
        self.type = feature.type
        self.start = feature.location.start.position
        self.end = feature.location.end.position
        self.seq = seq_record.seq[self.start:self.end]
        try:
            self.name = feature.qualifiers['gene'][0]
        except KeyError:
            self.name = None
        try:
            self.codon_start = feature.qualifiers['codon_start'][0]
        except KeyError:
            self.codon_start = None
        try:
            self.translation = feature.qualifiers['translation'][0]
        except KeyError:
            self.translation = None

    def __len__(self):
        return len(self.seq)

    def chars(self):
        return str(self.seq)

def main(): """ Main CLI handler. """

    parser = OptionParser(usage=_prog_usage,
        add_help_option=True,
        version=_prog_version,
        description=_prog_description)

    parser.add_option('-o', '--output',
        action='store_const',
        dest='output',
        metavar='PREFIX',
        help='prefix for output file names')

    parser.add_option('-w', '--wrap',
        action='store_true',
        dest='wrap',
        default=False,
        help='wrap sequences (slows things down ...)')

    (opts, args) = parser.parse_args()

    if len(args) == 0:
        args = sys.stdin.read()
        args = [a for a in args.split("\n") if a]

    if opts.output is None:
        out = sys.stdout
    else:
        out = open(opts.output, "w")

    gb_recs = Entrez.efetch(db="nucleotide", rettype="gb", id=",".join(args))

    for seq_record in SeqIO.parse(gb_recs, "genbank"):
        dp = seq_record.description.split(' ')
        label = seq_record.id + "_" + dp[0] + "_" + dp[1]
        sys.stderr.write("[%s: %s]\n" % (seq_record.id, seq_record.description))
        sys.stderr.write("%10s    % 7s    % 7s    % 8s\n" \
                    % ("Gene",
                       "Start",
                       "End",
                       "Length"))
        codons = StringIO.StringIO()
        for f in seq_record.features:
            if f.type == "CDS":
                g = Gene(seq_record, f)
                sys.stderr.write("%10s    % 7d    % 7d    % 8d\n" \
                    % (g.name,
                       g.start,
                       g.end,
                       len(g)))
                codons.write(g.chars())

        out.write(">%s\n" % label)
        codons = codons.getvalue()
        if opts.wrap:
            out.write("%s\n\n" % textwrap.fill(codons, width=76))
        else:
            out.write("%s\n\n" % codons)

if __name__ == '__main__':
    main()
Share