#! /usr/bin/env python
# -*- coding: utf-8 -*-
##############################################################################
## DendroPy Phylogenetic Computing Library.
##
## Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
## All rights reserved.
##
## See "LICENSE.rst" for terms and conditions of usage.
##
## If you use this work or any portion thereof in published work,
## please cite it as:
##
## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
## for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################
"""
Character state definitions and alphabets. Certain state alphabets, such as
DNA, RNA, protein, etc. are defined here. These are termed "fixed" state
alphabets, and for each distinct state alphabet concept (e.g., DNA), there is
one and only one instance of a representation of that concept (i.e., all
DNA-type data in DendroPy, regardless of the source, refer to the same instance
of the state alphabet and state alphabet elements).
"""
import collections
import itertools
from dendropy.datamodel import basemodel
from dendropy.utility import textprocessing
from dendropy.utility import container
###############################################################################
## StateAlphabet
[docs]
class StateAlphabet(
basemodel.DataObject,
basemodel.Annotable):
"""
A master registry mapping state symbols to their definitions.
There are two classes or "denominations" of states:
- fundamental states
These are the basic, atomic, self-contained states of the alphabet,
distinct and mutually-exclusive from every other fundamental state.
E.g., for DNA: adenine, guanine, cytosine, and thymine.
- multi-state states
The states are second-level or "pseudo-states", in that they are
not properly states in and of themselves, but rather each consist
of a set of other states. That is, a multi-state state is a set of
two or more fundamental states. Multi-state states are of one of
two types: "ambiguous" and "polymorphic" states. "Ambiguous" states
represent states in which the true fundamental state is unknown,
but consists of one of the fundamental states to which the
ambiguous states map. "Polymorphic" states represent states in
which the entity actually has multiple fundamental states
simultaneously. "Ambiguous" states are an expression of uncertainty
or lack of knowledge about the identity of state. With
"polymorphic" states, on the other hand, there is no uncertaintly
or lack of knowledge about the state: the state is known
definitively, and it consists of multiple fundamental states. An
example of an ambiguous state would be 'N', representing any base
in molecular sequence data. An example of a polymorphic state would
be the range of a widespread species found in multiple geographic
units. Note that multi-state states can be specified in terms of
other multi-state states, but that upon instantiation, these member
multi-states will be expanded to their fundamental states.
State definitions or identities are immutable: their symbology and mappings
cannot be changed after creation/initialization. State definitions and
identities, however, can be added/removed from a state alphabet.
Parameters
----------
label : string, optional
The name for this state alphabet.
fundamental_states : iterable of strings
An iterable of symbols defining the fundamental (i.e., non-ambiguous
and non-polymorphic states of this alphabet), with a 1-to-1
correspodence between symbols and states. Each state will also be
automatically indexed base on its position in this list. For DNA, this
would be something like: ``'ACGT'`` or ``('A', 'C', 'G', T')``. For
"standard" characters, this would be something like ``'01'`` or ``('0',
'1')``.
no_data_symbol : string
If specified, automatically creates a "no data" ambiguous state,
represented by the (canonical, or primary) symbol
"no_data_symbol", which maps to all fundamental states.
This will also insert |None| into all symbol look-up maps, which, when
dereferenced will return this state. Furthermore, the attribute
``self.no_data_symbol`` will return this symbol and ``self.no_data_state``
will return this state. The 'no data' state will be an ambiguous
multistate type.
ambiguous_states : iterable of tuples
An iterable consisting of tuples expressing ambiguous state symbols and
the set of symbols representing the fundamental states to which they
map. The first element in the tuple is the symbol used to represent the
ambiguous state; this can be blank (""), but if not blank it needs to
be unique across all symbols (including case-variants if the state
alphabet is case-insensitive). The second element is an
iterable of fundamental state symbols to which this ambiguous state
maps. The fundamental state symbols *must* have already been defined,
i.e. given in the value passed to ``fundamental_states``. Note: a
dictionary may seem like a more tractable structure than iterable of
tuples, but we may need to specify multiple anonymous or blank
ambiguous states.
polymorphic_states : iterable of tuples
An iterable consisting of tuples expressing polymorphic state symbols and
the set of symbols representing the fundamental states to which they
map. The first element in the tuple is the symbol used to represent the
polymorphic state; this can be blank (""), but if not blank it needs to
be unique across all symbols (including case-variants if the state
alphabet is case-insensitive). The second element is an
iterable of fundamental state symbols to which this polymorphic state
maps. The fundamental state symbols *must* have already been defined,
i.e. given in the value passed to ``fundamental_states``. Note: a
dictionary may seem like a more tractable structure than iterable of
tuples, but we may need to specify multiple anonymous or blank
polymorphic states.
symbol_synonyms : dictionary
A mapping of symbols, with keys being the new symbols and values being
(already-defined) symbols of states to which they map. This provides a
mechanism by which states with multiple symbols can be managed. For
example, an ambiguous state, "unknown", representing all fundamental
states might be defined with '?' as its primary symbol, and a synonym
symbol for this state might be 'X'.
"""
###########################################################################
### CLass-level Constants
FUNDAMENTAL_STATE = 0
AMBIGUOUS_STATE = 1
POLYMORPHIC_STATE = 2
###########################################################################
### Life-Cycle and Identity
def __init__(self,
fundamental_states=None,
ambiguous_states=None,
polymorphic_states=None,
symbol_synonyms=None,
no_data_symbol=None,
gap_symbol=None,
label=None,
case_sensitive=True):
basemodel.DataObject.__init__(self, label=label)
self._is_case_sensitive = case_sensitive
# Core collection underlying alphabet
self._fundamental_states = []
self._ambiguous_states = []
self._polymorphic_states = []
# Look-up mappings
self._state_identities = None
self._canonical_state_symbols = None
self._canonical_symbol_state_map = None
self._full_symbol_state_map = None
self._index_state_map = None
self._fundamental_states_to_ambiguous_state_map = None
self._fundamental_states_to_polymorphic_state_map = None
# Suppress for initialization
self.autocompile_lookup_tables = False
# This identifies the gap state when compiling the state alphabet. The
# principle purpose behind this is to be able to tell the gap state
# that it is, indeed, a gap state. And the purpose of this, in turn,
# is so that the when the gap state is asked for its fundamental
# indexes, it will return the fundamental indexes of the missing data
# state in its place if it is *NOT* to be treated as a fifth
# fundamental state.
self.gap_state = None
self._gap_symbol = None
self.no_data_state = None
self._no_data_symbol = None
# Populate core collection
if fundamental_states:
for symbol in fundamental_states:
self.new_fundamental_state(symbol)
if gap_symbol:
self.gap_state = self.new_fundamental_state(gap_symbol)
self._gap_symbol = gap_symbol
if no_data_symbol:
self.no_data_state = self.new_ambiguous_state(
symbol=no_data_symbol,
member_states=self._fundamental_states)
self._no_data_symbol = no_data_symbol
if ambiguous_states:
for ss in ambiguous_states:
self.new_ambiguous_state(symbol=ss[0], member_state_symbols=ss[1])
if polymorphic_states:
for ss in polymorphic_states:
self.new_polymorphic_state(symbol=ss[0], member_state_symbols=ss[1])
if symbol_synonyms:
for k in symbol_synonyms:
self.new_symbol_synonym(k, symbol_synonyms[k])
# Build mappings
self.compile_lookup_mappings()
# Post-initialization
self.autocompile_lookup_tables = True
def __hash__(self):
return id(self)
def __eq__(self, other):
return other is self
def __copy__(self, memo=None):
return self
[docs]
def taxon_namespace_scoped_copy(self, memo=None):
return self
def __deepcopy__(self, memo=None):
return self
###########################################################################
### Symbol Management
def _direct_get_state_for_symbol(self, symbol):
"""
Returns the |StateIdentity| instance corresponding to ``symbol``.
"""
for state_symbol, state in self.symbol_state_pair_iter(include_synonyms=True):
if state_symbol == symbol:
return state
raise KeyError(symbol)
def _direct_get_fundamental_states_for_symbols(self, symbols):
"""
Returns the list of |StateIdentity| instances corresponding to
the iterable of symbols given by ``symbols``, with each element in
``symbols`` corresponding to a single symbol.
"""
ss = []
for symbol in symbols:
state = self._direct_get_state_for_symbol(symbol)
ss.extend(state.fundamental_states)
return tuple(ss)
def _validate_new_symbol(self, symbol):
if symbol is None or symbol == "":
raise ValueError("Cannot validate empty symbol")
symbol = str(symbol)
for state_symbol, state in self.symbol_state_pair_iter(include_synonyms=True):
if state_symbol == symbol:
raise ValueError("State with symbol or symbol synonym of '{}' already defined is this alphabet".format(symbol))
return symbol
[docs]
def new_fundamental_state(self, symbol):
"""
Adds a new fundamental state to the collection
of states in this alphabet.
Parameters
----------
symbol : string
The symbol used to represent this state. Cannot have previously
been used to refer to any other state, fundamental or otherwise, as
a primary or synonymous symbol (including implicit synonyms given
by case-variants if the state alphabet is not case-sensitive).
Cannot be blank ("") or |None|.
Returns
-------
s : |StateIdentity|
The new state created and added.
"""
if symbol is None or symbol == "":
raise ValueError("Fundamental states cannot be defined without a valid symbol")
symbol = self._validate_new_symbol(symbol)
index = len(self._fundamental_states)
new_state = StateIdentity(
symbol=symbol,
index=index,
state_denomination=StateAlphabet.FUNDAMENTAL_STATE,
member_states=None)
self._fundamental_states.append(new_state)
if not self._is_case_sensitive:
for s in (symbol.upper(), symbol.lower()):
if s != symbol:
self.new_symbol_synonym(s, symbol)
if self.autocompile_lookup_tables:
self.compile_symbol_lookup_mappings()
return new_state
[docs]
def new_ambiguous_state(self, symbol, **kwargs):
r"""
Adds a new ambiguous state to the collection
of states in this alphabet.
Parameters
----------
symbol : string or None
The symbol used to represent this state. Cannot have previously
been used to refer to any other state, fundamental or otherwise, as
a primary or synonymous symbol (including implicit synonyms given
by case-variants if the state alphabet is not case-sensitive). Can
be blank ("") or |None| if there.
\*\*kwargs : keyword arguments, mandatory
Exactly one of the following must be specified:
member_state_symbols : iterable of strings
List of symbols representing states to which this state maps. Symbols
representing multistates will taken to refer to the set of
fundamental states to which they, in turn, map.
member_states : iterable of |StateIdentity| objects
List of |StateIdentity| representing states to which this state maps.
Returns
-------
s : |StateIdentity|
The new state created and added.
"""
return self.new_multistate(
symbol=symbol,
state_denomination=StateAlphabet.AMBIGUOUS_STATE,
**kwargs)
[docs]
def new_polymorphic_state(self,
symbol,
**kwargs):
r"""
Adds a new polymorphic state to the collection
of states in this alphabet.
Parameters
----------
symbol : string or None
The symbol used to represent this state. Cannot have previously
been used to refer to any other state, fundamental or otherwise, as
a primary or synonymous symbol (including implicit synonyms given
by case-variants if the state alphabet is not case-sensitive). Can
be blank ("") or |None| if there.
\*\*kwargs : keyword arguments, mandatory
Exactly one of the following must be specified:
member_state_symbols : iterable of strings
List of symbols representing states to which this state maps. Symbols
representing multistates will taken to refer to the set of
fundamental states to which they, in turn, map.
member_states : iterable of |StateIdentity| objects
List of |StateIdentity| representing states to which this state maps.
Returns
-------
s : |StateIdentity|
The new state created and added.
"""
return self.new_multistate(
symbol=symbol,
state_denomination=StateAlphabet.POLYMORPHIC_STATE,
**kwargs)
[docs]
def new_multistate(self,
symbol,
state_denomination,
**kwargs):
r"""
Adds a new polymorphic or ambiguous state to the collection
of states in this alphabet.
Parameters
----------
symbol : string or None
The symbol used to represent this state. Cannot have previously
been used to refer to any other state, fundamental or otherwise, as
a primary or synonymous symbol (including implicit synonyms given
by case-variants if the state alphabet is not case-sensitive). Can
be blank ("") or |None| if there.
state_denomination : enum
StateAlphabet.POLYMORPHIC_STATE or StateAlphabet.AMBIGUOUS_STATE
\*\*kwargs : keyword arguments, mandatory
Exactly one of the following must be specified:
member_state_symbols : iterable of strings
List of symbols representing states to which this state maps. Symbols
representing multistates will taken to refer to the set of
fundamental states to which they, in turn, map.
member_states : iterable of |StateIdentity| objects
List of |StateIdentity| representing states to which this state maps.
Returns
-------
s : |StateIdentity|
The new state created and added.
"""
if symbol is not None and symbol != "":
symbol = self._validate_new_symbol(symbol)
if len(kwargs) != 1:
raise TypeError("Exactly one of 'member_state_symbols' or 'member_states' is required")
if "member_state_symbols" in kwargs:
member_states = self._direct_get_fundamental_states_for_symbols(kwargs["member_state_symbols"])
elif "member_states" in kwargs:
member_states = kwargs["member_states"]
else:
raise ValueError("Exactly one of 'member_state_symbols' or 'member_states' is required")
new_state = StateIdentity(
symbol=symbol,
index=None,
state_denomination=state_denomination,
member_states=member_states)
if state_denomination == StateAlphabet.POLYMORPHIC_STATE:
self._polymorphic_states.append(new_state)
elif state_denomination == StateAlphabet.AMBIGUOUS_STATE:
self._ambiguous_states.append(new_state)
else:
raise ValueError(state_denomination)
if symbol and not self._is_case_sensitive:
for s in (symbol.upper(), symbol.lower()):
if s != symbol:
self.new_symbol_synonym(s, symbol)
if self.autocompile_lookup_tables:
if symbol:
self.compile_symbol_lookup_mappings()
self.compile_member_states_lookup_mappings()
return new_state
[docs]
def new_symbol_synonym(self,
symbol_synonym, referenced_symbol):
"""
Defines an alternative symbol mapping for an existing state.
Parameters
----------
symbol_synonym : string
The (new) alternative symbol.
referenced_symbol : string
The symbol for the state to which the alternative symbol will also
map.
Returns
-------
s : |StateIdentity|
The state to which this synonym maps.
------
"""
if symbol_synonym is None or symbol_synonym == "":
raise ValueError("Symbol synonym cannot be empty")
symbol_synonym = self._validate_new_symbol(symbol_synonym)
state = self._direct_get_state_for_symbol(referenced_symbol)
if symbol_synonym in state.symbol_synonyms:
raise ValueError("Symbol synonym '{}' already defined for state '{}".format(symbol_synonym, state))
state.symbol_synonyms.append(symbol_synonym)
if self.autocompile_lookup_tables:
self.compile_symbol_lookup_mappings()
return state
###########################################################################
### Optimization/Sugar: Lookup Mappings and Attribute Settings
[docs]
def compile_lookup_mappings(self):
"""
Builds lookup tables/mappings for quick referencing and dereferencing
of symbols/states.
"""
self.compile_symbol_lookup_mappings()
self.compile_member_states_lookup_mappings()
[docs]
def compile_member_states_lookup_mappings(self):
"""
Builds lookup tables/mappings for quick referencing and dereferencing
of ambiguous/polymorphic states based on the fundamental states to
which they map.
"""
temp_fundamental_states_to_ambiguous_state_map = {}
temp_fundamental_states_to_polymorphic_state_map = {}
if self.no_data_state is not None:
assert self.no_data_state in self._ambiguous_states
self.no_data_state.member_states = tuple(self.fundamental_state_iter())
for idx, state in enumerate(self.state_iter()):
if state.state_denomination == StateAlphabet.AMBIGUOUS_STATE:
member_states = frozenset(state.member_states)
if member_states in temp_fundamental_states_to_ambiguous_state_map:
pass
# raise ValueError("Multiple definitions of ambiguous state with member states of '{}': {}, {}. Define a symbol synonym instead.".format(
# state.member_states_str, temp_fundamental_states_to_ambiguous_state_map[member_states], state))
else:
temp_fundamental_states_to_ambiguous_state_map[member_states] = state
elif state.state_denomination == StateAlphabet.POLYMORPHIC_STATE:
member_states = frozenset(state.member_states)
if member_states in temp_fundamental_states_to_polymorphic_state_map:
pass
# raise ValueError("Multiple definitions of polymorphic state with member states of '{}': {}, {}. Define a symbol synonym instead.".format(
# state.member_states_str, temp_fundamental_states_to_polymorphic_state_map[member_states], state))
else:
temp_fundamental_states_to_polymorphic_state_map[member_states] = state
self._fundamental_states_to_ambiguous_state_map = container.FrozenOrderedDict(temp_fundamental_states_to_ambiguous_state_map)
self._fundamental_states_to_polymorphic_state_map = container.FrozenOrderedDict(temp_fundamental_states_to_polymorphic_state_map)
def _set_symbol_mapping(self, d, symbol, state):
if symbol is None or symbol == "":
raise ValueError("Symbol synonym cannot be empty")
assert symbol not in d
d[symbol] = state
[docs]
def compile_symbol_lookup_mappings(self):
"""
Builds lookup tables/mappings for quick referencing and dereferencing
of state symbology.
"""
temp_states = []
temp_symbols = []
temp_canonical_symbol_state_map = collections.OrderedDict()
temp_full_symbol_state_map = collections.OrderedDict()
temp_index_state_map = collections.OrderedDict()
# if self._gap_symbol is not None and self.no_data_state is None:
# self.no_data_state = self.new_ambiguous_state(symbol=None,
# member_states=self._fundamental_states)
if self.no_data_state is not None:
assert self.no_data_symbol == self.no_data_state.symbol, "{} != {}".format(self.no_data_symbol, self.no_data_state.symbol)
temp_full_symbol_state_map[None] = self.no_data_state
for idx, state in enumerate(self.state_iter()):
temp_states.append(state)
if state.symbol:
temp_symbols.append(state.symbol)
assert state.symbol not in temp_canonical_symbol_state_map
temp_canonical_symbol_state_map[state.symbol] = state
self._set_symbol_mapping(
temp_full_symbol_state_map,
state.symbol,
state)
if state.symbol_synonyms:
for ss in state.symbol_synonyms:
self._set_symbol_mapping(
temp_full_symbol_state_map,
ss,
state)
else:
assert state.state_denomination != StateAlphabet.FUNDAMENTAL_STATE
state._index = idx
if self.gap_state is not None and state is self.gap_state and self.no_data_state is not None:
state.is_gap_state = True
state.gap_state_as_no_data_state = self.no_data_state
else:
state.is_gap_state = False
state.gap_state_as_no_data_state = None
temp_index_state_map[idx] = state
self._state_identities = tuple(temp_states)
self._canonical_state_symbols = tuple(temp_symbols)
self._canonical_symbol_state_map = container.FrozenOrderedDict(temp_canonical_symbol_state_map)
self._full_symbol_state_map = container.FrozenOrderedDict(temp_full_symbol_state_map)
self._index_state_map = container.FrozenOrderedDict(temp_index_state_map)
[docs]
def set_state_as_attribute(self, state, attr_name=None):
"""
Sets the given state as an attribute of this alphabet.
The name of the attribute will be ``attr_name`` if specified,
or the state symbol otherwise.
Parameters
----------
state : |StateIdentity|
The state to be made an attribute of this alphabet.
attr_name : string
The name of the attribute. If not specified, the state
symbol will be used.
"""
if (state not in self._fundamental_states
and state not in self._ambiguous_states
and state not in self._polymorphic_states):
raise ValueError("State {} not defined in current alphabet".format(state))
if attr_name is None:
attr_name = state.symbol
if attr_name is None:
raise TypeError("Cannot set attribute: non-None symbol needed for state or non-None attribute name needs to be provided")
setattr(self, attr_name, state)
###########################################################################
### Special handling to designate gap
def _get_gap_symbol(self):
return self._gap_symbol
def _set_gap_symbol(self, gap_symbol):
"""
For state alphabets with no explicitly-defined gap and no data (missing) symbols,
this method will allow creation of mapping of gaps to no data states,
so that tree/data scoring methods that require gaps to be treated as
missing data can be used. Note that the gap state needs to be already
defined in the state alphabet and already associated with the
designated symbol.
"""
if gap_symbol is not None:
self.gap_state = self[gap_symbol]
self._gap_symbol = gap_symbol
else:
self.gap_state = None
self._gap_symbol = None
gap_symbol = property(_get_gap_symbol, _set_gap_symbol)
def _get_no_data_symbol(self):
return self._no_data_symbol
def _set_no_data_symbol(self, no_data_symbol):
if no_data_symbol is not None:
self.no_data_state = self[no_data_symbol]
self._no_data_symbol = no_data_symbol
else:
self.no_data_state = None
self._no_data_symbol = None
no_data_symbol = property(_get_no_data_symbol, _set_no_data_symbol)
###########################################################################
### Symbol Access
[docs]
def __len__(self):
"""
Number of states.
"""
return ( len(self._fundamental_states)
+ len(self._ambiguous_states)
+ len(self._polymorphic_states) )
[docs]
def __iter__(self):
"""
Returns :meth:`StateAlphabet.state_iter()`: iterator over all state
identities.
"""
return self.state_iter()
[docs]
def state_iter(self):
"""
Returns an iterator over all state identities.
"""
return itertools.chain(
self._fundamental_states,
self._ambiguous_states,
self._polymorphic_states)
[docs]
def fundamental_state_iter(self):
"""
Returns an iterator over all fundamental state identities.
"""
return itertools.chain(self._fundamental_states)
[docs]
def ambiguous_state_iter(self):
"""
Returns an iterator over all ambiguous state identities.
"""
return itertools.chain(self._ambiguous_states)
[docs]
def polymorphic_state_iter(self):
"""
Returns an iterator over all polymorphic state identities.
"""
return itertools.chain(self._polymorphic_states)
[docs]
def multistate_state_iter(self):
"""
Returns an iterator over all ambiguous and polymorphic state
identities.
"""
return itertools.chain(self._ambiguous_states, self._polymorphic_states)
[docs]
def fundamental_symbol_iter(self, include_synonyms=True):
"""
Returns an iterator over all symbols (including synonyms, unless
``include_synonyms`` is |False|) that map to fundamental states.
"""
for state in self.fundamental_state_iter():
yield state.symbol
if state.symbol_synonyms and include_synonyms:
for symbol in state.symbol_synonyms:
yield symbol
[docs]
def ambiguous_symbol_iter(self, include_synonyms=True):
"""
Returns an iterator over all symbols (including synonyms, unless
``include_synonyms`` is |False|) that map to ambiguous states.
"""
for state in self.ambiguous_state_iter():
yield state.symbol
if state.symbol_synonyms and include_synonyms:
for symbol in state.symbol_synonyms:
yield symbol
[docs]
def polymorphic_symbol_iter(self, include_synonyms=True):
"""
Returns an iterator over all symbols (including synonyms, unless
``include_synonyms`` is |False|) that map to polymorphic states.
"""
for state in self.polymorphic_state_iter():
yield state.symbol
if state.symbol_synonyms and include_synonyms:
for symbol in state.symbol_synonyms:
yield symbol
[docs]
def multistate_symbol_iter(self, include_synonyms=True):
"""
Returns an iterator over all symbols (including synonyms, unless
``include_synonyms`` is |False|) that map to multistate states.
"""
for state in self.multistate_state_iter():
yield state.symbol
if state.symbol_synonyms and include_synonyms:
for symbol in state.symbol_synonyms:
yield symbol
[docs]
def symbol_state_pair_iter(self, include_synonyms=True):
"""
Returns an iterator over all symbols paired with the state to which the
they symbols map.
"""
for state in self.state_iter():
yield (state.symbol, state)
if include_synonyms:
for synonym in state.symbol_synonyms:
yield (synonym, state)
def _get_state_identities(self):
"""
Tuple of all state identities in this alphabet.
"""
return self._state_identities
states = property(_get_state_identities)
def _get_canonical_state_symbols(self):
"""
Tuple of all state symbols in this alphabet.
"""
return self._canonical_state_symbols
symbols = property(_get_canonical_state_symbols)
def _get_canonical_symbol_state_map(self):
"""
Dictionary with state symbols as keys and states as values. Does not
include symbol synonyms or case variations.
"""
return self._canonical_symbol_state_map
canonical_symbol_state_map = property(_get_canonical_symbol_state_map)
def _get_full_symbol_state_map(self):
"""
Dictionary with state symbols as keys and states as values.
Includes symbol synonyms or case variations.
"""
return self._full_symbol_state_map
full_symbol_state_map = property(_get_full_symbol_state_map)
[docs]
def __getitem__(self, key):
"""
Returns state identity corresponding to ``key``.
Parameters
----------
key : integer or string
If and integer value, looks up and returns state identity by index.
If a string value, looks up and returns state identity by symbol.
Returns
-------
s : |StateIdentity| instance
Returns a |StateIdentity| corresponding to ``key``.
Raises
------
KeyError if ``key`` is not valid.
"""
if isinstance(key, int):
return self._index_state_map[key]
else:
return self._full_symbol_state_map[key]
[docs]
def get_states_for_symbols(self, symbols):
"""
Returns list of states corresponding to symbols.
Parameters
----------
symbols : iterable of symbols
Returns
-------
s : list of |StateIdentity|
A list of |StateIdentity| instances corresponding to symbols
given in ``symbols``.
"""
states = [self.full_symbol_state_map[s] for s in symbols]
return states
[docs]
def get_fundamental_states_for_symbols(self, symbols):
"""
Returns list of *fundamental* states corresponding to symbols.
Parameters
----------
symbols : iterable of symbols
Returns
-------
s : list of |StateIdentity|
A list of fundamental |StateIdentity| instances corresponding
to symbols given in ``symbols``, with multi-state states expanded
into their fundamental symbols.
"""
states = []
for symbol in symbols:
state = self._full_symbol_state_map[symbol]
states.extend(state.fundamental_states)
return states
[docs]
def get_canonical_symbol_for_symbol(self, symbol):
"""
Returns the canonical state symbol for the state to which ``symbol``
maps. E.g., in a DNA alphabet, return 'A' for 'a'.
Parameters
----------
symbol : string
Returns
-------
s : string
Canonical symbol for state with symbol or synonym symbol of
``symbol``.
"""
return self[symbol].symbol
[docs]
def match_ambiguous_state(self, symbols):
"""
Returns ambiguous state with fundamental member states
represented by symbols given in ``symbols``.
Parameters
----------
symbols : iterable of symbols
Returns
-------
s : |StateIdentity| instance
"""
states = frozenset(self.get_fundamental_states_for_symbols(symbols))
return self._fundamental_states_to_ambiguous_state_map[states]
[docs]
def match_polymorphic_state(self, symbols):
"""
Returns polymorphic state with fundamental member states
represented by symbols given in ``symbols``.
Parameters
----------
symbols : iterable of symbols
Returns
-------
s : |StateIdentity| instance
"""
states = frozenset(self.get_fundamental_states_for_symbols(symbols))
return self._fundamental_states_to_polymorphic_state_map[states]
[docs]
def match_state(self, symbols, state_denomination):
"""
Returns ambiguous or polymorphic state with fundamental member states
represented by symbols given in ``symbols``.
Parameters
----------
symbols : iterable of string symbols
Symbols representing states to be dereferenced.
state_denomination : {StateAlphabet.AMBIGUOUS or StateAlphabet.POLYPMORPHIC_STATE}
Returns
-------
s : |StateIdentity| instance
"""
if state_denomination == StateAlphabet.AMBIGUOUS_STATE:
return self.match_ambiguous_state(symbols=symbols)
else:
return self.match_polymorphic_state(symbols=symbols)
###############################################################################
## StateIdentity
[docs]
class StateIdentity(
basemodel.DataObject,
basemodel.Annotable):
"""
A character state definition, which can either be a fundamental state or
a mapping to a set of other character states (for polymorphic or ambiguous
characters).
"""
def __init__(self,
symbol=None,
index=None,
state_denomination=StateAlphabet.FUNDAMENTAL_STATE,
member_states=None):
"""
A state is immutable with respect to its definition and identity.
Specifically, it 'symbol', 'index', 'multistate', and 'member_states'
properties are set upon definition/creation, and after that are
read-only.
Parameters
----------
symbol : string
A text symbol or token representation of this character state.
E.g., 'G' for the base guanine in a DNA state alphabet, or '1' for
presence of a wing in a morphological data set.
index : integer
The (0-based) numeric index for this state in the state alphabet.
E.g., for a DNA alphabet: 0 = 'A'/adenine, 1 = 'C'/cytosine, 2 =
'G'/guanine, 3 = 'T'/thymine. Or for a "standard" alphabet: 0 =
'0', 1 = '1'. Note that ambiguous and polymorphic state definitions
typically are not indexed.
state_denomination : 'enum'
One of: ``StateAlphabet.FUNDAMENTAL_STATE``,
``StateAlphabet.AMBIGUOUS_STATE``, or
``StateAlphabet.POLYMORPHIC_STATE``.
member_states : iterable of |StateIdentity| instances.
If a multi-state, then a collection of |StateIdentity|
instances to which this state maps.
"""
basemodel.DataObject.__init__(self, label=symbol)
self._symbol = symbol
self._index = index
self._state_denomination = state_denomination
self._member_states = None
self._fundamental_states = None
self._fundamental_symbols = None
self._fundamental_indexes = None
self._fundamental_indexes_with_gaps_as_missing = None
self._partials_vector = None
if member_states is not None:
self._member_states = tuple(member_states)
else:
self._member_states = None
self._str = None
self._repr = None
self._member_states_str = None
self._symbol_synonyms = []
# special handling for treating gap states as missing/no-data states
self.is_gap_state = None
self.gap_state_as_no_data_state = None
def __hash__(self):
return id(self)
def __eq__(self, other):
return other is self
def __copy__(self, memo=None):
return self
[docs]
def taxon_namespace_scoped_copy(self, memo=None):
return self
def __deepcopy__(self, memo=None):
return self
def _get_index(self):
return self._index
index = property(_get_index)
def __str__(self):
if self._str is None:
if self._symbol:
self._str = str(self._symbol)
elif self._state_denomination == StateAlphabet.FUNDAMENTAL_STATE:
self._str = ""
else:
self._str = self.member_states_str
return self._str
def __repr__(self):
if self._repr is None:
s = str(self)
self._repr = "<{} at {}: '{}'>".format(self.__class__.__name__,
hex(id(self)), str(s))
return self._repr
def _get_member_states_str(self):
"""
Representation of member states of self.
"""
if self._member_states_str is None:
if self._state_denomination == StateAlphabet.FUNDAMENTAL_STATE:
self._member_states_str = str(self)
else:
s = ",".join([m._symbol for m in self._member_states])
if self._state_denomination == StateAlphabet.AMBIGUOUS_STATE:
self._member_states_str = "{" + s + "}"
elif self._state_denomination == StateAlphabet.POLYMORPHIC_STATE:
self._member_states_str = "(" + s + ")"
else:
raise ValueError("Unrecognized state denomination: '{}'".format(self._state_denomination))
return self._member_states_str
member_states_str = property(_get_member_states_str)
def _get_symbol(self):
"""
Canonical (primary) symbol of this state.
"""
return self._symbol
symbol = property(_get_symbol)
def _get_state_denomination(self):
"""
Type of multi-statedness: FUNDAMENTAL (not a multistate), AMBIGUOUS, or
POLYMORPHIC.
"""
return self._state_denomination
state_denomination = property(_get_state_denomination)
def _is_single_state(self):
"""
|True| if a FUNDAMENTAL state.
"""
return self._state_denomination == StateAlphabet.FUNDAMENTAL_STATE
is_single_state = property(_is_single_state)
def _is_fundamental_state(self):
"""
|True| if a FUNDAMENTAL state.
"""
return self._state_denomination == StateAlphabet.FUNDAMENTAL_STATE
is_fundamental_state = property(_is_fundamental_state)
def _get_member_states(self):
"""
Returns the (fundamental) member states that this state maps to if not
itself a fundamental state.
"""
return self._member_states
def _set_member_states(self, member_states):
"""
Rebuilds member state set.
"""
if member_states is not None:
self._member_states = tuple(member_states)
else:
self._member_states = None
self._fundamental_states = None
self._fundamental_symbols = None
self._fundamental_indexes = None
self._fundamental_indexes_with_gaps_as_missing = None
self._partials_vector = None
self._str = None
self._repr = None
self._member_states_str = None
member_states = property(_get_member_states, _set_member_states)
def _get_fundamental_states(self):
"""
Returns a tuple of fundamental states (i.e., tupe of single states)
to which this state maps.
"""
if self._fundamental_states is None:
if self._member_states is None:
states = {self:True}
else:
states = collections.OrderedDict()
for state in self._member_states:
assert state is not self
for s in state.fundamental_states:
states[s] = True
self._fundamental_states = tuple(states.keys())
return self._fundamental_states
fundamental_states = property(_get_fundamental_states)
def _get_fundamental_symbols(self):
"""
Returns a tuple of fundamental state symbols (i.e., tuple of symbols
representing single states) to which this state maps.
"""
if self._fundamental_symbols is None:
self._fundamental_symbols = tuple(state.symbol for state in self.fundamental_states)
return self._fundamental_symbols
fundamental_symbols = property(_get_fundamental_symbols)
def _get_fundamental_indexes(self):
"""
Returns a tuple of fundamental state indexes (i.e., tuple of index
values of single states) to which this state maps.
"""
if self._fundamental_indexes is None:
self._fundamental_indexes = tuple([state._index for state in self.fundamental_states])
return self._fundamental_indexes
fundamental_indexes = property(_get_fundamental_indexes)
def _get_fundamental_indexes_with_gaps_as_missing(self):
"""
Returns a tuple of fundamental state indexes (i.e., tuple of index
values of single states) to which this state maps, with gaps being
substituted with missing (no-data) states.
"""
if self._fundamental_indexes_with_gaps_as_missing is None:
if self.is_gap_state:
if self.gap_state_as_no_data_state is not None:
self._fundamental_indexes_with_gaps_as_missing = tuple(self.gap_state_as_no_data_state.fundamental_indexes_with_gaps_as_missing)
else:
raise ValueError("No data state not specified")
else:
fstates = [s for s in self.fundamental_states if not s.is_gap_state]
self._fundamental_indexes_with_gaps_as_missing = tuple([s._index for s in fstates])
return self._fundamental_indexes_with_gaps_as_missing
fundamental_indexes_with_gaps_as_missing = property(_get_fundamental_indexes_with_gaps_as_missing)
def _get_symbol_synonyms(self):
"""
The collection of symbol synonyms (alternatives/equivalents to the
canonical symbol) which also map to this state.
"""
return self._symbol_synonyms
# def _set_symbol_synonyms(self, value):
# self._symbol_synonyms = value
# symbol_synonyms = property(_get_symbol_synonyms, _set_symbol_synonyms)
symbol_synonyms = property(_get_symbol_synonyms)
[docs]
def is_exact_correspondence(self, other):
"""
Tries to determine if two StateIdentity definitions
are equivalent by matching symbols.
"""
match = True
if self._state_denomination != other._state_denomination:
return False
if self._state_denomination != StateAlphabet.FUNDAMENTAL_STATE and other._state_denomination != StateAlphabet.FUNDAMENTAL_STATE:
xf1 = self.fundamental_states
xf2 = other.fundamental_states
if len(xf1) != len(xf2):
match = False
else:
f1 = set(xf1)
f2 = set(xf2)
for m1 in f1:
member_match = False
for m2 in f2:
if m1.is_exact_correspondence(m2):
member_match = True
f2.remove(m2)
break
if not member_match:
match = False
break
if match:
f1 = set(xf1)
f2 = set(xf2)
for m2 in f2:
member_match = False
for m1 in f1:
if m1.is_exact_correspondence(m2):
f1.remove(m1)
member_match = True
break
if not member_match:
match = False
break
return match
else:
return self._symbol == other._symbol
###############################################################################
## DnaStateAlphabet
class DnaStateAlphabet(StateAlphabet):
def __init__(self):
fundamental_states = "ACGT"
polymorphic_states = None
ambiguous_states = (
("N", "ACGT"),
("R", "AG" ),
("Y", "CT" ),
("M", "AC" ),
("W", "AT" ),
("S", "CG" ),
("K", "GT" ),
("V", "ACG" ),
("H", "ACT" ),
("D", "AGT" ),
("B", "CGT" ),
)
symbol_synonyms = {"X": "N"}
StateAlphabet.__init__(self,
fundamental_states=fundamental_states,
no_data_symbol="?",
gap_symbol="-",
polymorphic_states=polymorphic_states,
ambiguous_states=ambiguous_states,
symbol_synonyms=symbol_synonyms,
label="DNA",
case_sensitive=False)
for state in self.state_iter():
if state.symbol == "-":
attr_name = "gap"
else:
attr_name = state.symbol
self.set_state_as_attribute(state, attr_name)
self.any_residue = self.N
self.unknown_state_symbol = 'N'
###############################################################################
## RnaStateAlphabet
class RnaStateAlphabet(StateAlphabet):
def __init__(self):
fundamental_states = "ACGU"
polymorphic_states = None
ambiguous_states = (
("N", "ACGU"),
("R", "AG" ),
("Y", "CU" ),
("M", "AC" ),
("W", "AU" ),
("S", "CG" ),
("K", "GU" ),
("V", "ACG" ),
("H", "ACU" ),
("D", "AGU" ),
("B", "CGU" ),
)
symbol_synonyms = {"X": "N"}
StateAlphabet.__init__(self,
fundamental_states=fundamental_states,
no_data_symbol="?",
gap_symbol="-",
polymorphic_states=polymorphic_states,
ambiguous_states=ambiguous_states,
symbol_synonyms=symbol_synonyms,
label="RNA",
case_sensitive=False)
for state in self.state_iter():
if state.symbol == "-":
attr_name = "gap"
else:
attr_name = state.symbol
self.set_state_as_attribute(state, attr_name)
self.any_residue = self.N
self.unknown_state_symbol = 'N'
###############################################################################
## NucleotideStateAlphabet
class NucleotideStateAlphabet(StateAlphabet):
def __init__(self):
fundamental_states = "ACGTU"
polymorphic_states = None
ambiguous_states = (
("N", "ACGTU"),
("R", "AG" ),
("Y", "CTU" ),
("M", "AC" ),
("W", "ATU" ),
("S", "CG" ),
("K", "GTU" ),
("V", "ACG" ),
("H", "ACTU" ),
("D", "AGTU" ),
("B", "CGTU" ),
)
symbol_synonyms = {"X": "N"}
StateAlphabet.__init__(self,
fundamental_states=fundamental_states,
no_data_symbol="?",
gap_symbol="-",
polymorphic_states=polymorphic_states,
ambiguous_states=ambiguous_states,
symbol_synonyms=symbol_synonyms,
label="Nucleotide",
case_sensitive=False)
for state in self.state_iter():
if state.symbol == "-":
attr_name = "gap"
else:
attr_name = state.symbol
self.set_state_as_attribute(state, attr_name)
self.any_residue = self.N
self.unknown_state_symbol = 'N'
###############################################################################
## ProteinStateAlphabet
class ProteinStateAlphabet(StateAlphabet):
def __init__(self):
fundamental_states = "ACDEFGHIKLMNPQRSTVWY*"
polymorphic_states = None
ambiguous_states = (
("B", "DN"),
("Z", "EQ"),
("X", "ACDEFGHIKLMNPQRSTVWY*"),
)
symbol_synonyms = {}
StateAlphabet.__init__(self,
fundamental_states=fundamental_states,
no_data_symbol="?",
gap_symbol="-",
polymorphic_states=polymorphic_states,
ambiguous_states=ambiguous_states,
symbol_synonyms=symbol_synonyms,
label="Protein",
case_sensitive=False)
for state in self.state_iter():
if state.symbol == "-":
attr_name = "gap"
elif state.symbol == "*":
attr_name = "stop"
else:
attr_name = state.symbol
self.set_state_as_attribute(state, attr_name)
self.any_residue = self.X
self.unknown_state_symbol = 'X'
###############################################################################
## BinaryStateAlphabet
class BinaryStateAlphabet(StateAlphabet):
def __init__(self, allow_gaps=False, allow_missing=False):
fundamental_states = "10"
if allow_gaps:
gap_symbol = "-"
else:
gap_symbol = None
polymorphic_states = None
ambiguous_states = []
if allow_missing:
no_data_symbol = "?"
else:
no_data_symbol = None
symbol_synonyms = {}
StateAlphabet.__init__(self,
fundamental_states=fundamental_states,
no_data_symbol=no_data_symbol,
gap_symbol=gap_symbol,
polymorphic_states=polymorphic_states,
ambiguous_states=ambiguous_states,
symbol_synonyms=symbol_synonyms,
label="Binary",
case_sensitive=False)
for state in self.state_iter():
if state.symbol == "-":
attr_name = "gap"
elif state.symbol == "?":
attr_name = "missing"
elif state.symbol == "*":
attr_name = "stop"
else:
attr_name = state.symbol
self.set_state_as_attribute(state, attr_name)
###############################################################################
## RestrictionSitesStateAlphabet
class RestrictionSitesStateAlphabet(BinaryStateAlphabet):
def __init__(self, allow_gaps=False, allow_missing=False):
BinaryStateAlphabet.__init__(self, allow_gaps=allow_gaps, allow_missing=allow_missing)
###############################################################################
## InfiniteSitesStateAlphabet
class InfiniteSitesStateAlphabet(BinaryStateAlphabet):
def __init__(self, allow_gaps=False, allow_missing=False):
BinaryStateAlphabet.__init__(self, allow_gaps=allow_gaps, allow_missing=allow_missing)
###############################################################################
## GLOBAL STATE ALPHABETS
DNA_STATE_ALPHABET = DnaStateAlphabet()
RNA_STATE_ALPHABET = RnaStateAlphabet()
NUCLEOTIDE_STATE_ALPHABET = NucleotideStateAlphabet()
BINARY_STATE_ALPHABET = BinaryStateAlphabet()
PROTEIN_STATE_ALPHABET = ProteinStateAlphabet()
RESTRICTION_SITES_STATE_ALPHABET = RestrictionSitesStateAlphabet()
INFINITE_SITES_STATE_ALPHABET = InfiniteSitesStateAlphabet()
def new_standard_state_alphabet(
fundamental_state_symbols=None,
case_sensitive=False):
if fundamental_state_symbols is None:
fundamental_state_symbols = "0123456789"
s = StateAlphabet(
fundamental_states=fundamental_state_symbols,
no_data_symbol="?",
gap_symbol="-",
# polymorphic_states=polymorphic_states,
# ambiguous_states=ambiguous_states,
# symbol_synonyms=symbol_synonyms,
label="Standard",
case_sensitive=case_sensitive)
for state in s.state_iter():
if state.symbol == "-":
attr_name = "gap"
else:
attr_name = state.symbol
s.set_state_as_attribute(state, attr_name)
return s
###############################################################################
## Convenience Functions
def coerce_to_state_identities(state_alphabet, values):
coerced_values = []
for v in values:
if isinstance(v, StateIdentity):
coerced_values.append(v)
elif textprocessing.is_str_type(v) or isinstance(v, int):
s = state_alphabet[v]
coerced_values.append(s)
else:
raise ValueError(v)
return coerced_values