Easy Rejection Sampling in msBayes

01 Oct 2009
Posted by Jeet Sukumaran

msBayes is a suite of tools that support a Approximate Bayesian Computation approach for phylogeographic analysis, estimation and hypothesis testing. The following Python script facilitates the execution of the rejection sampling step of the analysis, by wrapping the invocation to the "msReject" program, automatically identifying the summary statistic columns in the data files and composing the correct command arguments. The script depends crucially on a header row of column names as the first line of the files, and assumes that all prior/parameter column names begin with the string "PRI.".

#! /usr/bin/env python
 
###############################################################################
##  Copyright 2009 Jeet Sukumaran.
##
##  This program is free software; you can redistribute it and/or modify
##  it under the terms of the GNU General Public License as published by
##  the Free Software Foundation; either version 3 of the License, or
##  (at your option) any later version.
##
##  This program is distributed in the hope that it will be useful,
##  but WITHOUT ANY WARRANTY; without even the implied warranty of
##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##  GNU General Public License for more details.
##
##  You should have received a copy of the GNU General Public License along
##  with this program. If not, see <http://www.gnu.org/licenses/>.
##
###############################################################################
 
"""
Performs accept/reject sampling on a file of simulations from priors to produce
set of samples the posterior distribution.
(1) Identifies summary statistic columns in a given data file (based on column
names that do not begin with "PRI.".
(2) Calls msReject with the appriopriate parameters.
"""
 
from optparse import OptionGroup
from optparse import OptionParser
import subprocess
import sys
import os
 
_prog_usage = '%prog [options] <OBS-DATA-FILE> <SIM-DATA-FILE>'
_prog_version = 'SAMPLE-POSTERIORS Version 1.0'
_prog_description = """Performs accept/reject sampling on a file of simulations from priors to produce
set of samples the posterior distribution."""
_prog_author = 'Jeet Sukumaran'
_prog_copyright = 'Copyright (C) 2009 Jeet Sukumaran.'
 
def parse_fields(row, separator="\t", include_empty=False):
    fields = row.strip().replace("\n", "").split(separator)
    if not include_empty:
        fields = [f for f in fields if f != ""]
    return fields        
 
def main():
    """
    Main CLI handler.
    """
 
    parser = OptionParser(usage=_prog_usage, 
        add_help_option=True, 
        version=_prog_version, 
        description=_prog_description)    
 
    parser.add_option('-a', '--application',
        action='store',
        dest='app_path',
        type='string', # also 'float', 'string' etc.
        default="msReject",
        metavar='MSREJECT-PATH',
        help='path to msReject (default="%default")')     
 
    parser.add_option('-t', '--tolerance',
        action='store',
        dest='tolerance',
        type='float', # also 'float', 'string' etc.
        default=0.02,
        metavar='TOLERANCE',
        help='sampling tolerance (default=%default)')
 
    parser.add_option('-s', '--sep',
        action='store',
        dest='separator',
        type='string', # also 'float', 'string' etc.
        default='\t',
        metavar='COLUMN-SEPARATOR',
        help='character to use as column separator (default=<TAB>)')
 
    parser.add_option('-n', '--no-execute', 
        action='store_true',
        dest="dry_run",
        default=False,
        help='print command to standard output, but do not actually run')
 
    parser.add_option('-q', '--quiet', 
        action='store_true',
        dest="quiet",
        default=False,
        help='run without any supplemental messages')
 
    (opts, args) = parser.parse_args()
 
    if len(args) < 2:
        sys.exit("Paths to observed and simulated data files not specified.")
 
    obs_fpath = os.path.expandvars(os.path.expanduser(args[0]))
    sim_fpath = os.path.expandvars(os.path.expanduser(args[1]))
    obs_file = open(obs_fpath, "rU")
    sim_file = open(sim_fpath, "rU")
 
    obs_fields = parse_fields(obs_file.readline(), opts.separator)
    sim_fields = parse_fields(sim_file.readline(), opts.separator)
 
    if len(obs_fields) != len(sim_fields):
        sys.exit("ERROR: Number of columns in observed data file (%d) does not equal number of columns in simulated data file (%d)." 
            % (len(obs_fields), len(sim_fields)))
 
    param_cols = []            
    sum_stat_cols = []            
 
    for i, f in enumerate(obs_fields):
        if obs_fields[i] != sim_fields[i]:
            sys.exit('ERROR: Column %d does not correspond between observed and simulated files ("%s" vs. "%s").' 
                % (i+1, obs_fields[i], sim_fields[i]))
        if not f.startswith("PRI."):
            sum_stat_cols.append(i)
        else:
            param_cols.append(i)
 
    if not opts.quiet:
        sys.stderr.write("Following columns identified as PARAMETERS:\n")
        for c in param_cols:
            sys.stderr.write('  [%d]:"%s"\n' % (c+1, obs_fields[c]))
        sys.stderr.write("\nFollowing columns identified as SUMMARY STATISTICS:\n")
        for c in sum_stat_cols:
            sys.stderr.write('  [%d]:"%s"\n' % (c+1, obs_fields[c]))        
        sys.stderr.write('\n')
 
    cmd = (opts.app_path 
        + " " 
        + obs_fpath 
        + " " 
        + sim_fpath 
        + " "        
        + str(opts.tolerance)
        + " "
        + " ".join([str(i+1) for i in sum_stat_cols]))
 
    if opts.dry_run:        
        sys.stdout.write(cmd + "\n")
        sys.exit(0)
 
    if not opts.quiet:        
        sys.stderr.write("\nCommand to be executed:\n")
        sys.stderr.write('   %s\n' % cmd)
 
    msreject = subprocess.Popen(cmd, shell=True)
 
    if not opts.quiet:    
        sys.stderr.write('\nRunning with PID: %d.\n' % msreject.pid)
 
    msreject.wait()
 
    if not opts.quiet:
        sys.stderr.write("Completed with exit status: %s.\n" % msreject.returncode)
 
if __name__ == '__main__':
    main()

Post new comment

The content of this field is kept private and will not be shown publicly.
CAPTCHA
This question is for testing whether you are a biological visitor and to prevent automated spam submissions.
Image CAPTCHA
Enter the characters shown in the image.