Source code for dendropy.model.parsimony

#! /usr/bin/env python
# -*- coding: utf-8 -*-

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
Models, modeling and model-fitting of parsimony.
"""

from dendropy.utility.error import TaxonNamespaceIdentityError
from dendropy.utility import deprecate

class _NodeStateSetMap(dict):
    def __init__(self, taxon_state_sets_map=None):
        self.taxon_state_sets_map = taxon_state_sets_map
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            v = self.taxon_state_sets_map[key.taxon]
            self[key] = v
            return v

def _store_sets_as_attr(n, state_sets_attr_name, v):
    setattr(n, state_sets_attr_name, v)

def _retrieve_state_sets_from_attr(n, state_sets_attr_name, taxon_state_sets_map):
    try:
        return getattr(n, state_sets_attr_name)
    except AttributeError:
        v = taxon_state_sets_map[n.taxon]
        setattr(n, state_sets_attr_name, v)
        return v

[docs] def fitch_down_pass( postorder_node_iter, state_sets_attr_name="state_sets", taxon_state_sets_map=None, weights=None, score_by_character_list=None, **kwargs, ): """ Returns the parsimony score given a list of nodes in postorder and associated states, using Fitch's (1971) unordered parsimony algorithm. Parameters ---------- postorder_node_iter : iterable of/over |Node| objects An iterable of |Node| objects in in order of post-order traversal of the tree. state_sets_attr_name : str Name of attribute on |Node| objects in which state set lists will stored/accessed. If |None|, then state sets will not be stored on the tree. taxon_state_sets_map : dict[taxon] = state sets A dictionary that takes a taxon object as a key and returns a state set list as a value. This will be used to populate the state set of a node that has not yet had its state sets scored and recorded (typically, leaves of a tree that has not yet been processed). weights : iterable A list of weights for each pattern. score_by_character_list : None or list If not |None|, should be a reference to a list object. This list will be populated by the scores on a character-by-character basis. Returns ------- s : int Parismony score of tree. Notes ----- Currently this requires a bifurcating tree (even at the root). Examples -------- Assume that we have a tree, ``tree``, and an associated data set, ``data``:: import dendropy from dendropy.model.parsimony import fitch_down_pass taxa = dendropy.TaxonNamespace() data = dendropy.StandardCharacterMatrix.get_from_path( "apternodus.chars.nexus", "nexus", taxon_namespace=taxa) tree = dendropy.Tree.get_from_path( "apternodus.tre", "nexus", taxon_namespace=taxa) taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True) The following will return the parsimony score of the ``tree`` with respect to the data in ``data``:: score = fitch_down_pass( nodes=tree.postorder_node_iter(), taxon_state_sets_map=taxon_set_map) print(score) In the above, every |Node| object of ``tree`` will have an attribute added, "state_sets", that stores the list of state sets from the analysis:: for nd in tree: print(nd.state_sets) If you want to store the list of state sets in a different attribute, e.g., "analysis1_states":: score = fitch_down_pass( nodes=tree.postorder_node_iter(), state_sets_attr_name="analysis1_states", taxon_state_sets_map=taxon_set_map) print(score) for nd in tree: print(nd.analysis1_states) Or not to store these at all:: score = fitch_down_pass( nodes=tree.postorder_node_iter(), state_sets_attr_name=None, taxon_state_sets_map=taxon_set_map) print(score) Scoring custom data can be done by something like the following:: taxa = dendropy.TaxonNamespace() taxon_state_sets_map = {} t1 = taxa.require_taxon("A") t2 = taxa.require_taxon("B") t3 = taxa.require_taxon("C") t4 = taxa.require_taxon("D") t5 = taxa.require_taxon("E") taxon_state_sets_map[t1] = [ set([0,1]), set([0,1]), set([0]), set([0]) ] taxon_state_sets_map[t2] = [ set([1]), set([1]), set([1]), set([0]) ] taxon_state_sets_map[t3] = [ set([0]), set([1]), set([1]), set([0]) ] taxon_state_sets_map[t4] = [ set([0]), set([1]), set([0,1]), set([1]) ] taxon_state_sets_map[t5] = [ set([1]), set([0]), set([1]), set([1]) ] tree = dendropy.Tree.get_from_string( "(A,(B,(C,(D,E))));", "newick", taxon_namespace=taxa) score = fitch_down_pass(tree.postorder_node_iter(), taxon_state_sets_map=taxon_state_sets_map) print(score) """ if "postorder_nodes" in kwargs: deprecate.dendropy_deprecation_warning( preamble="Deprecated since DendroPy 5", old_construct="postorder_nodes", new_construct="postorder_node_iter" ) postorder_node_iter = kwargs.pop("postorder_nodes") if kwargs: raise ValueError(f"Unrecognized argument(s): {kwargs}") if score_by_character_list is not None: assert len(score_by_character_list) == 0 for idx in range(len(list(taxon_state_sets_map.values())[0])): # this is unacceptable! score_by_character_list.append(0) score = 0 if state_sets_attr_name is None: node_state_set_map = _NodeStateSetMap(taxon_state_sets_map) get_node_state_sets = lambda node : node_state_set_map[node] set_node_state_sets = lambda node, v : node_state_set_map.__setitem__(node, v) else: get_node_state_sets = lambda node : _retrieve_state_sets_from_attr(node, state_sets_attr_name, taxon_state_sets_map) set_node_state_sets = lambda node, v : _store_sets_as_attr(node, state_sets_attr_name, v) for nd in postorder_node_iter: c = nd.child_nodes() if not c: ss = get_node_state_sets(nd) continue left_c, right_c = c[:2] remaining = c[2:] left_ssl = get_node_state_sets(left_c) while True: right_ssl = get_node_state_sets(right_c) result = [] for n, ssp in enumerate(zip(left_ssl, right_ssl)): left_ss, right_ss = ssp inter = left_ss.intersection(right_ss) if inter: result.append(inter) else: if weights is None: wt = 1 else: wt = weights[n] score += wt result.append(left_ss.union(left_ss, right_ss)) if score_by_character_list is not None: # try: # score_by_character_list[n] += wt # except IndexError: # score_by_character_list.append(wt) score_by_character_list[n] += wt if remaining: right_c = remaining.pop(0) left_ssl = result else: break # setattr(nd, state_sets_attr_name, result) set_node_state_sets(nd, result) return score
[docs] def fitch_up_pass( preorder_node_iter, state_sets_attr_name="state_sets", taxon_state_sets_map=None, **kwargs): """ Finalizes the state set lists associated with each node using the "final phase" of Fitch's (1971) unordered parsimony algorithm. Parameters ---------- preorder_node_iter : iterable of/over |Node| objects An iterable of |Node| objects in in order of post-order traversal of the tree. state_sets_attr_name : str Name of attribute on |Node| objects in which state set lists will stored/accessed. If |None|, then state sets will not be stored on the tree. taxon_state_sets_map : dict[taxon] = state sets A dictionary that takes a taxon object as a key and returns a state set list as a value. This will be used to populate the state set of a node that has not yet had its state sets scored and recorded (typically, leaves of a tree that has not yet been processed). Notes ----- Currently this requires a bifurcating tree (even at the root). Examples -------- :: taxa = dendropy.TaxonNamespace() data = dendropy.StandardCharacterMatrix.get_from_path( "apternodus.chars.nexus", "nexus", taxon_namespace=taxa) tree = dendropy.Tree.get_from_path( "apternodus.tre", "nexus", taxon_namespace=taxa) taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True) score = fitch_down_pass(tree.postorder_node_iter(), taxon_state_sets_map=taxon_state_sets_map) print(score) fitch_up_pass(tree.preorder_node_iter()) for nd in tree: print(nd.state_sets) """ if "preorder_nodes" in kwargs: deprecate.dendropy_deprecation_warning( preamble="Deprecated since DendroPy 5", old_construct="preorder_nodes", new_construct="preorder_node_iter" ) preorder_node_iter = kwargs.pop("preorder_nodes") if len(kwargs) != 0: raise ValueError(f"Unrecognized argument(s): {kwargs}") node_state_sets_map = {} for nd in preorder_node_iter: c = nd.child_nodes() p = nd.parent_node if (not c) or (not p): continue assert(len(c) == 2) left_c, right_c = c try: left_ssl = getattr(left_c, state_sets_attr_name) except AttributeError: if not taxon_state_sets_map: raise left_ssl = taxon_state_sets_map[left_c.taxon] try: right_ssl = getattr(right_c, state_sets_attr_name) except AttributeError: if not taxon_state_sets_map: raise right_ssl = taxon_state_sets_map[right_c.taxon] par_ssl = getattr(p, state_sets_attr_name) curr_ssl = getattr(nd, state_sets_attr_name) result = [] for n, ssp in enumerate(zip(par_ssl, curr_ssl, left_ssl, right_ssl)): par_ss, curr_ss, left_ss, right_ss = ssp down_parup_inter = par_ss.intersection(curr_ss) if down_parup_inter == par_ss: final_ss = down_parup_inter else: rl_inter = left_ss.intersection(right_ss) if not rl_inter: final_ss = par_ss.union(curr_ss) else: in_par_and_left = par_ss.intersection(left_ss) in_par_and_right = par_ss.intersection(right_ss) final_ss = in_par_and_left.union(in_par_and_right, curr_ss) #_LOG.debug("downpass = %s, par = %s, left = %s, right = %s, final_ss= %s" % # (str(curr_ss), str(par_ss), str(left_ss), str(right_ss), str(final_ss))) result.append(final_ss) setattr(nd, state_sets_attr_name, result)
[docs] def parsimony_score( tree, chars, gaps_as_missing=True, weights=None, score_by_character_list=None, ): """ Calculates the score of a tree, ``tree``, given some character data, ``chars``, under the parsimony model using the Fitch algorithm. Parameters ---------- tree : a |Tree| instance A |Tree| to be scored. Must reference the same |TaxonNamespace| as ``chars``. chars : a |CharacterMatrix| instance A |CharacterMatrix|-derived object with data to be scored. Must have the same |TaxonNamespace| as ``tree``. gap_as_missing : bool If |True| [default], then gaps will be treated as missing data. If |False|, then gaps will be treated as a new/additional state. weights : iterable A list of weights for each pattern/column in the matrix. score_by_character_list : None or list If not |None|, should be a reference to a list object. This list will be populated by the scores on a character-by-character basis. Returns ------- pscore : int The parsimony score of the tree given the data. Examples -------- :: import dendropy from dendropy.calculate import treescore # establish common taxon namespace taxon_namespace = dendropy.TaxonNamespace() # Read data; if data is, e.g., "standard", use StandardCharacterMatrix. # If unsure of data type, can do: # dataset = dendropy.DataSet.get( # path="path/to/file.nex", # schema="nexus", # taxon_namespace=tns,) # chars = dataset.char_matrices[0] chars = dendropy.DnaCharacterMatrix.get( path="pythonidae.chars.nexus", schema="nexus", taxon_namespace=taxon_namespace) tree = dendropy.Tree.get( path="pythonidae.mle.newick", schema="newick", taxon_namespace=taxon_namespace) # We store the site-specific scores here # This is optional; if we do not want to # use the per-site scores, just pass in |None| # for the ``score_by_character_list`` argument # or do not specify this argument at all. score_by_character_list = [] score = treescore.parsimony_score( tree, chars, gaps_as_missing=False, score_by_character_list=score_by_character_list) # Print the results: the score print("Score: {}".format(score)) # Print the results: the per-site scores for idx, x in enumerate(score_by_character_list): print("{}: {}".format(idx+1, x)) Notes ----- If the same data is going to be used to score multiple trees or multiple times, it is probably better to generate the 'taxon_state_sets_map' once and call "fitch_down_pass" directly yourself, as this function generates a new map each time. """ if tree.taxon_namespace is not chars.taxon_namespace: raise TaxonNamespaceIdentityError(tree, chars) taxon_state_sets_map = chars.taxon_state_sets_map(gaps_as_missing=gaps_as_missing) nodes = tree.postorder_node_iter() pscore = fitch_down_pass(nodes, taxon_state_sets_map=taxon_state_sets_map, weights=weights, score_by_character_list=score_by_character_list) return pscore