Download Sequences from GenBank, Keeping Only Codons
The following script takes a space separated list of GenBank numbers as input, and then uses BioPython to download the corresponding sequences from GenBank, strips off all non-coding nucleotides, gives the sequences sensible names, and assembles them into a FASTA file.
It is pretty basic, and does not do a lot of fancy error checking, and is probably a little too specific to be useful for most people. I can imagine extending it in a number of ways that would make it much more useful in a number of different contexts. For example, parsing the GenBank data into multiple FASTA files on a gene-by-gene basis. Nonetheless, despite its dubious general utility, I'm still posting it here. This is partly so that other people can use it as a starting point to develop scripts more applicable for their own work, but also so that I myself do not have to wade through several (very useful!) BioPython tutorials if I ever want to do this again.
#! /usr/bin/env python ############################################################################### ## get-codons.py ## ## Copyright 2009 Jeet Sukumaran. ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation; either version 3 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License along ## with this program. If not, see <http://www.gnu.org/licenses/>. ## ############################################################################### """ Downloads nucleotide sequences from GenBank, trimming out non-coding DNA. """ from optparse import OptionGroup from optparse import OptionParser import os import sys import textwrap import StringIO from Bio import Entrez from Bio import SeqIO from Bio.Seq import Seq from Bio.Alphabet import IUPAC _prog_usage = '%prog [options] <GENBANK ACCESSION NUMBERS>' _prog_version = 'GET-CODONS Version 1.0' _prog_description = 'Downloads nucleotide sequences from GenBank, trimming out non-coding DNA.' _prog_author = 'Jeet Sukumaran' _prog_copyright = 'Copyright (C) 2009 Jeet Sukumaran.' class Gene(object): def __init__(self, seq_record, feature): self.gb_num = seq_record.id self.type = feature.type self.start = feature.location.start.position self.end = feature.location.end.position self.seq = seq_record.seq[self.start:self.end] try: self.name = feature.qualifiers['gene'][0] except KeyError: self.name = None try: self.codon_start = feature.qualifiers['codon_start'][0] except KeyError: self.codon_start = None try: self.translation = feature.qualifiers['translation'][0] except KeyError: self.translation = None def __len__(self): return len(self.seq) def chars(self): return str(self.seq) def main(): """ Main CLI handler. """ parser = OptionParser(usage=_prog_usage, add_help_option=True, version=_prog_version, description=_prog_description) parser.add_option('-o', '--output', action='store_const', dest='output', metavar='PREFIX', help='prefix for output file names') parser.add_option('-w', '--wrap', action='store_true', dest='wrap', default=False, help='wrap sequences (slows things down ...)') (opts, args) = parser.parse_args() if len(args) == 0: args = sys.stdin.read() args = [a for a in args.split("\n") if a] if opts.output is None: out = sys.stdout else: out = open(opts.output, "w") gb_recs = Entrez.efetch(db="nucleotide", rettype="gb", id=",".join(args)) for seq_record in SeqIO.parse(gb_recs, "genbank"): dp = seq_record.description.split(' ') label = seq_record.id + "_" + dp[0] + "_" + dp[1] sys.stderr.write("[%s: %s]\n" % (seq_record.id, seq_record.description)) sys.stderr.write("%10s % 7s % 7s % 8s\n" \ % ("Gene", "Start", "End", "Length")) codons = StringIO.StringIO() for f in seq_record.features: if f.type == "CDS": g = Gene(seq_record, f) sys.stderr.write("%10s % 7d % 7d % 8d\n" \ % (g.name, g.start, g.end, len(g))) codons.write(g.chars()) out.write(">%s\n" % label) codons = codons.getvalue() if opts.wrap: out.write("%s\n\n" % textwrap.fill(codons, width=76)) else: out.write("%s\n\n" % codons) if __name__ == '__main__': main()
feed
Post new comment