#! /usr/bin/env python
# -*- coding: utf-8 -*-
##############################################################################
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.rst" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
"""
Models, modeling and model-fitting of parsimony.
"""
from dendropy.utility.error import TaxonNamespaceIdentityError
from dendropy.utility import deprecate
class _NodeStateSetMap(dict):
def __init__(self, taxon_state_sets_map=None):
self.taxon_state_sets_map = taxon_state_sets_map
def __getitem__(self, key):
try:
return dict.__getitem__(self, key)
except KeyError:
v = self.taxon_state_sets_map[key.taxon]
self[key] = v
return v
def _store_sets_as_attr(n, state_sets_attr_name, v):
setattr(n, state_sets_attr_name, v)
def _retrieve_state_sets_from_attr(n, state_sets_attr_name, taxon_state_sets_map):
try:
return getattr(n, state_sets_attr_name)
except AttributeError:
v = taxon_state_sets_map[n.taxon]
setattr(n, state_sets_attr_name, v)
return v
[docs]
def fitch_down_pass(
postorder_node_iter,
state_sets_attr_name="state_sets",
taxon_state_sets_map=None,
weights=None,
score_by_character_list=None,
**kwargs,
):
"""
Returns the parsimony score given a list of nodes in postorder and
associated states, using Fitch's (1971) unordered parsimony algorithm.
Parameters
----------
postorder_node_iter : iterable of/over |Node| objects
An iterable of |Node| objects in in order of post-order
traversal of the tree.
state_sets_attr_name : str
Name of attribute on |Node| objects in which state set lists
will stored/accessed. If |None|, then state sets will not be stored on
the tree.
taxon_state_sets_map : dict[taxon] = state sets
A dictionary that takes a taxon object as a key and returns a state set
list as a value. This will be used to populate the state set of a node
that has not yet had its state sets scored and recorded (typically,
leaves of a tree that has not yet been processed).
weights : iterable
A list of weights for each pattern.
score_by_character_list : None or list
If not |None|, should be a reference to a list object.
This list will be populated by the scores on a character-by-character
basis.
Returns
-------
s : int
Parismony score of tree.
Notes
-----
Currently this requires a bifurcating tree (even at the root).
Examples
--------
Assume that we have a tree, ``tree``, and an associated data set, ``data``::
import dendropy
from dendropy.model.parsimony import fitch_down_pass
taxa = dendropy.TaxonNamespace()
data = dendropy.StandardCharacterMatrix.get_from_path(
"apternodus.chars.nexus",
"nexus",
taxon_namespace=taxa)
tree = dendropy.Tree.get_from_path(
"apternodus.tre",
"nexus",
taxon_namespace=taxa)
taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True)
The following will return the parsimony score of the ``tree`` with
respect to the data in ``data``::
score = fitch_down_pass(
nodes=tree.postorder_node_iter(),
taxon_state_sets_map=taxon_set_map)
print(score)
In the above, every |Node| object of ``tree`` will have an attribute
added, "state_sets", that stores the list of state sets from the analysis::
for nd in tree:
print(nd.state_sets)
If you want to store the list of state sets in a different attribute, e.g.,
"analysis1_states"::
score = fitch_down_pass(
nodes=tree.postorder_node_iter(),
state_sets_attr_name="analysis1_states",
taxon_state_sets_map=taxon_set_map)
print(score)
for nd in tree:
print(nd.analysis1_states)
Or not to store these at all::
score = fitch_down_pass(
nodes=tree.postorder_node_iter(),
state_sets_attr_name=None,
taxon_state_sets_map=taxon_set_map)
print(score)
Scoring custom data can be done by something like the following::
taxa = dendropy.TaxonNamespace()
taxon_state_sets_map = {}
t1 = taxa.require_taxon("A")
t2 = taxa.require_taxon("B")
t3 = taxa.require_taxon("C")
t4 = taxa.require_taxon("D")
t5 = taxa.require_taxon("E")
taxon_state_sets_map[t1] = [ set([0,1]), set([0,1]), set([0]), set([0]) ]
taxon_state_sets_map[t2] = [ set([1]), set([1]), set([1]), set([0]) ]
taxon_state_sets_map[t3] = [ set([0]), set([1]), set([1]), set([0]) ]
taxon_state_sets_map[t4] = [ set([0]), set([1]), set([0,1]), set([1]) ]
taxon_state_sets_map[t5] = [ set([1]), set([0]), set([1]), set([1]) ]
tree = dendropy.Tree.get_from_string(
"(A,(B,(C,(D,E))));", "newick",
taxon_namespace=taxa)
score = fitch_down_pass(tree.postorder_node_iter(),
taxon_state_sets_map=taxon_state_sets_map)
print(score)
"""
if "postorder_nodes" in kwargs:
deprecate.dendropy_deprecation_warning(
preamble="Deprecated since DendroPy 5",
old_construct="postorder_nodes",
new_construct="postorder_node_iter"
)
postorder_node_iter = kwargs.pop("postorder_nodes")
if kwargs:
raise ValueError(f"Unrecognized argument(s): {kwargs}")
if score_by_character_list is not None:
assert len(score_by_character_list) == 0
for idx in range(len(list(taxon_state_sets_map.values())[0])): # this is unacceptable!
score_by_character_list.append(0)
score = 0
if state_sets_attr_name is None:
node_state_set_map = _NodeStateSetMap(taxon_state_sets_map)
get_node_state_sets = lambda node : node_state_set_map[node]
set_node_state_sets = lambda node, v : node_state_set_map.__setitem__(node, v)
else:
get_node_state_sets = lambda node : _retrieve_state_sets_from_attr(node, state_sets_attr_name, taxon_state_sets_map)
set_node_state_sets = lambda node, v : _store_sets_as_attr(node, state_sets_attr_name, v)
for nd in postorder_node_iter:
c = nd.child_nodes()
if not c:
ss = get_node_state_sets(nd)
continue
left_c, right_c = c[:2]
remaining = c[2:]
left_ssl = get_node_state_sets(left_c)
while True:
right_ssl = get_node_state_sets(right_c)
result = []
for n, ssp in enumerate(zip(left_ssl, right_ssl)):
left_ss, right_ss = ssp
inter = left_ss.intersection(right_ss)
if inter:
result.append(inter)
else:
if weights is None:
wt = 1
else:
wt = weights[n]
score += wt
result.append(left_ss.union(left_ss, right_ss))
if score_by_character_list is not None:
# try:
# score_by_character_list[n] += wt
# except IndexError:
# score_by_character_list.append(wt)
score_by_character_list[n] += wt
if remaining:
right_c = remaining.pop(0)
left_ssl = result
else:
break
# setattr(nd, state_sets_attr_name, result)
set_node_state_sets(nd, result)
return score
[docs]
def fitch_up_pass(
preorder_node_iter,
state_sets_attr_name="state_sets",
taxon_state_sets_map=None,
**kwargs):
"""
Finalizes the state set lists associated with each node using the "final
phase" of Fitch's (1971) unordered parsimony algorithm.
Parameters
----------
preorder_node_iter : iterable of/over |Node| objects
An iterable of |Node| objects in in order of post-order
traversal of the tree.
state_sets_attr_name : str
Name of attribute on |Node| objects in which state set lists
will stored/accessed. If |None|, then state sets will not be stored on
the tree.
taxon_state_sets_map : dict[taxon] = state sets
A dictionary that takes a taxon object as a key and returns a state set
list as a value. This will be used to populate the state set of a node
that has not yet had its state sets scored and recorded (typically,
leaves of a tree that has not yet been processed).
Notes
-----
Currently this requires a bifurcating tree (even at the root).
Examples
--------
::
taxa = dendropy.TaxonNamespace()
data = dendropy.StandardCharacterMatrix.get_from_path(
"apternodus.chars.nexus",
"nexus",
taxon_namespace=taxa)
tree = dendropy.Tree.get_from_path(
"apternodus.tre",
"nexus",
taxon_namespace=taxa)
taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True)
score = fitch_down_pass(tree.postorder_node_iter(),
taxon_state_sets_map=taxon_state_sets_map)
print(score)
fitch_up_pass(tree.preorder_node_iter())
for nd in tree:
print(nd.state_sets)
"""
if "preorder_nodes" in kwargs:
deprecate.dendropy_deprecation_warning(
preamble="Deprecated since DendroPy 5",
old_construct="preorder_nodes",
new_construct="preorder_node_iter"
)
preorder_node_iter = kwargs.pop("preorder_nodes")
if len(kwargs) != 0:
raise ValueError(f"Unrecognized argument(s): {kwargs}")
node_state_sets_map = {}
for nd in preorder_node_iter:
c = nd.child_nodes()
p = nd.parent_node
if (not c) or (not p):
continue
assert(len(c) == 2)
left_c, right_c = c
try:
left_ssl = getattr(left_c, state_sets_attr_name)
except AttributeError:
if not taxon_state_sets_map:
raise
left_ssl = taxon_state_sets_map[left_c.taxon]
try:
right_ssl = getattr(right_c, state_sets_attr_name)
except AttributeError:
if not taxon_state_sets_map:
raise
right_ssl = taxon_state_sets_map[right_c.taxon]
par_ssl = getattr(p, state_sets_attr_name)
curr_ssl = getattr(nd, state_sets_attr_name)
result = []
for n, ssp in enumerate(zip(par_ssl, curr_ssl, left_ssl, right_ssl)):
par_ss, curr_ss, left_ss, right_ss = ssp
down_parup_inter = par_ss.intersection(curr_ss)
if down_parup_inter == par_ss:
final_ss = down_parup_inter
else:
rl_inter = left_ss.intersection(right_ss)
if not rl_inter:
final_ss = par_ss.union(curr_ss)
else:
in_par_and_left = par_ss.intersection(left_ss)
in_par_and_right = par_ss.intersection(right_ss)
final_ss = in_par_and_left.union(in_par_and_right, curr_ss)
#_LOG.debug("downpass = %s, par = %s, left = %s, right = %s, final_ss= %s" %
# (str(curr_ss), str(par_ss), str(left_ss), str(right_ss), str(final_ss)))
result.append(final_ss)
setattr(nd, state_sets_attr_name, result)
[docs]
def parsimony_score(
tree,
chars,
gaps_as_missing=True,
weights=None,
score_by_character_list=None,
):
"""
Calculates the score of a tree, ``tree``, given some character data,
``chars``, under the parsimony model using the Fitch algorithm.
Parameters
----------
tree : a |Tree| instance
A |Tree| to be scored. Must reference the same |TaxonNamespace| as
``chars``.
chars : a |CharacterMatrix| instance
A |CharacterMatrix|-derived object with data to be scored. Must have
the same |TaxonNamespace| as ``tree``.
gap_as_missing : bool
If |True| [default], then gaps will be treated as missing data.
If |False|, then gaps will be treated as a new/additional state.
weights : iterable
A list of weights for each pattern/column in the matrix.
score_by_character_list : None or list
If not |None|, should be a reference to a list object.
This list will be populated by the scores on a character-by-character
basis.
Returns
-------
pscore : int
The parsimony score of the tree given the data.
Examples
--------
::
import dendropy
from dendropy.calculate import treescore
# establish common taxon namespace
taxon_namespace = dendropy.TaxonNamespace()
# Read data; if data is, e.g., "standard", use StandardCharacterMatrix.
# If unsure of data type, can do:
# dataset = dendropy.DataSet.get(
# path="path/to/file.nex",
# schema="nexus",
# taxon_namespace=tns,)
# chars = dataset.char_matrices[0]
chars = dendropy.DnaCharacterMatrix.get(
path="pythonidae.chars.nexus",
schema="nexus",
taxon_namespace=taxon_namespace)
tree = dendropy.Tree.get(
path="pythonidae.mle.newick",
schema="newick",
taxon_namespace=taxon_namespace)
# We store the site-specific scores here
# This is optional; if we do not want to
# use the per-site scores, just pass in |None|
# for the ``score_by_character_list`` argument
# or do not specify this argument at all.
score_by_character_list = []
score = treescore.parsimony_score(
tree,
chars,
gaps_as_missing=False,
score_by_character_list=score_by_character_list)
# Print the results: the score
print("Score: {}".format(score))
# Print the results: the per-site scores
for idx, x in enumerate(score_by_character_list):
print("{}: {}".format(idx+1, x))
Notes
-----
If the same data is going to be used to score multiple trees or multiple times,
it is probably better to generate the 'taxon_state_sets_map' once and call
"fitch_down_pass" directly yourself, as this function generates a new map
each time.
"""
if tree.taxon_namespace is not chars.taxon_namespace:
raise TaxonNamespaceIdentityError(tree, chars)
taxon_state_sets_map = chars.taxon_state_sets_map(gaps_as_missing=gaps_as_missing)
nodes = tree.postorder_node_iter()
pscore = fitch_down_pass(nodes,
taxon_state_sets_map=taxon_state_sets_map,
weights=weights,
score_by_character_list=score_by_character_list)
return pscore