Commit be48450a authored by Benjamin Paassen's avatar Benjamin Paassen
Browse files

revised documentation and copyright header in all files to be consistent with...

revised documentation and copyright header in all files to be consistent with the numpy documentation convention
parent 96d96cd8
This diff is collapsed.
"""
Implements a sequence edit distance with affine gap costs using ADP.
Copyright (C) 2019
Benjamin Paaßen
AG Machine Learning
Bielefeld University
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
# Copyright (C) 2019-2020
# Benjamin Paaßen
# AG Machine Learning
# Bielefeld University
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import numpy as np
import edist.adp as adp
__author__ = 'Benjamin Paaßen'
__copyright__ = 'Copyright 2019, Benjamin Paaßen'
__copyright__ = 'Copyright 2019-2020, Benjamin Paaßen'
__license__ = 'GPLv3'
__version__ = '1.0.0'
__version__ = '1.1.0'
__maintainer__ = 'Benjamin Paaßen'
__email__ = 'bpaassen@techfak.uni-bielefeld.de'
......@@ -45,12 +45,22 @@ class AffineAlgebra:
""" This is a class to efficiently store an algebra for the affine edit
distance grammar in a pickleable format.
Attributes:
_rep: A function for replacement costs.
_gap: A function for deletion/insertion costs.
_gap_cost: (optional) a constant cost for deletions/insertions.
_skip: A function for deletion/insertion extension costs.
_skip_cost: (optional) a constant cost for deletion/insertion extensions.
Attributes
----------
_rep: function (default = Kronecker distance)
A function for replacement costs, i.e. _rep(x, y) is the cost of
replacing x with y.
_gap: function (default = constant function with 1.0)
A function for deletion/insertion costs, i.e. _gap(x) is the cost of
deleting/inserting x.
_gap_cost: float (default = 1.0)
a constant cost for deletions/insertions.
_skip: function (default = constant function with 0.5)
A function for deletion/insertion extension costs, i.e. _skip(x) is the
cost of skip-deleting/-inserting x.
_skip_cost: float (default = 0.5)
a constant cost for deletion/insertion extensions.
"""
def __init__(self, rep = None, gap = 1., skip = 0.5):
if(rep is None):
......@@ -95,20 +105,30 @@ class AffineAlgebra:
def aed(x, y, rep = None, gap = 1., skip = 0.5):
""" Computes the affine edit distance using algebraic dynamic programming.
Args:
x: A list-like object.
y: Another list-like object.
rep: A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns: The affine edit distance between x and y.
Parameters
----------
x: list
A list-like object.
y: list
Another list-like object.
rep: function or AffineAlgebra (default = Kronecker distance)
A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: function or float (default = 1.0)
A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: function or float (default = 0.5)
A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns
-------
d: float
The affine edit distance between x and y.
"""
if(isinstance(rep, AffineAlgebra)):
algebra = rep
......@@ -120,20 +140,31 @@ def aed_backtrace(x, y, rep = None, gap = 1., skip = 0.5):
""" Computes the backtrace of the affine edit distance using algebraic
dynamic programming.
Args:
x: A list-like object.
y: Another list-like object.
rep: A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns: An alignment between x and y according to the affine edit distance.
Parameters
----------
x: list
A list-like object.
y: list
Another list-like object.
rep: function or AffineAlgebra (default = Kronecker distance)
A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: function or float (default = 1.0)
A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: function or float (default = 0.5)
A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns
-------
alignment: class alignment.Alignment
A co-optimal alignment between x and y according to the affine edit
distance.
"""
if(isinstance(rep, AffineAlgebra)):
algebra = rep
......@@ -143,22 +174,38 @@ def aed_backtrace(x, y, rep = None, gap = 1., skip = 0.5):
def aed_backtrace_stochastic(x, y, rep = None, gap = 1., skip = 0.5):
""" Computes the backtrace of the affine edit distance using algebraic
dynamic programming.
dynamic programming stochastically.
Note that the randomness does _not_ produce a uniform distribution over
all co-optimal alignments because random choices at the start of the
alignment process dominate. If you wish to characterize the overall
distribution accurately, use aed_backtrace_matrix instead.
Parameters
----------
x: list
A list-like object.
y: list
Another list-like object.
rep: function or AffineAlgebra (default = Kronecker distance)
A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: function or float (default = 1.0)
A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: function or float (default = 0.5)
A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns
-------
alignment: class alignment.Alignment
A co-optimal alignment between x and y according to the affine edit
distance.
Args:
x: A list-like object.
y: Another list-like object.
rep: A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns: An alignment between x and y according to the affine edit distance.
"""
if(isinstance(rep, AffineAlgebra)):
algebra = rep
......@@ -167,28 +214,43 @@ def aed_backtrace_stochastic(x, y, rep = None, gap = 1., skip = 0.5):
return adp.backtrace_stochastic(x, y, _grammar, algebra)
def aed_backtrace_matrix(x, y, rep = None, gap = 1., skip = 0.5):
""" Computes the backtrace matrix of the affine edit distance using
""" Computes the backtrace matrix P of the affine edit distance using
algebraic dynamic programming.
Args:
x: A list-like object.
y: Another list-like object.
rep: A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns:
P: A len(x) + 2 x len(y) + 2 matrix where P[i, j] contains the probability
of node i being replaced with node j in a co-optimal alignment. The last
two columns contain deletion and deletion-extension probabilities, the
last two rows contain insertion and insertion-extension probabilities.
k: The number of co-optimal alignments.
In particular, P[i, j] contains the probability of node i being replaced
with node j in a co-optimal alignment. The last two columns contain
deletion and deletion-extension probabilities, the last two rows contain
insertion and insertion-extension probabilities.
Parameters
----------
x: list
A list-like object.
y: list
Another list-like object.
rep: function or AffineAlgebra (default = Kronecker distance)
A function with two arguments, computing the cost for replacing the
first with the second OR an AffineAlgebra object, in which case the
remaining arguments will be ignored. Defaults to the Kronecker distance.
gap: function or float (default = 1.0)
A function with two arguments, computing the cost for deleting the
first or inserting the second OR a number defining a constant cost.
Defaults to 1.
skip: function or float (default = 0.5)
A function with two arguments, computing the cost for deleting the
first or inserting the second for gap extensions OR a number defining
a constant cost. Defaults to 0.5.
Returns
-------
P: array_like
A (len(x) + 2) x (len(y) + 2) matrix where P[i, j] contains the probability
of node i being replaced with node j in a co-optimal alignment. The last
two columns contain deletion and deletion-extension probabilities, the
last two rows contain insertion and insertion-extension probabilities.
k: int
The number of co-optimal alignments.
"""
if(isinstance(rep, AffineAlgebra)):
algebra = rep
......
"""
Implements an alignment between two sequences or trees.
Copyright (C) 2019
Benjamin Paaßen
AG Machine Learning
Bielefeld University
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
# Copyright (C) 2019-2020
# Benjamin Paaßen
# AG Machine Learning
# Bielefeld University
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
__author__ = 'Benjamin Paaßen'
__copyright__ = 'Copyright 2019, Benjamin Paaßen'
__copyright__ = 'Copyright 2019-2020, Benjamin Paaßen'
__license__ = 'GPLv3'
__version__ = '1.0.0'
__version__ = '1.1.0'
__maintainer__ = 'Benjamin Paaßen'
__email__ = 'bpaassen@techfak.uni-bielefeld.de'
......@@ -31,12 +31,17 @@ class Tuple:
""" Models a single alignment entry with an edit operation name,
a left index, and a right index.
Attributes:
_name: The name of the corresponding edit operation.
_left: The index of the aligned object on the left or -1 if no
such object exists.
_right: The index of the aligned object on the right or -1 if no
such object exists.
Attributes
----------
_name: str
The name of the corresponding edit operation.
_left: int
The index of the aligned object on the left or -1 if no such object
exists.
_right: int
The index of the aligned object on the right or -1 if no such object
exists.
"""
def __init__(self, name, left, right):
self._name = name
......@@ -46,14 +51,22 @@ class Tuple:
def cost(self, x, y, deltas):
""" Computes the cost of the current edit tuple.
Args:
x: A symbol list for the left indices.
y: A symbol list for the right indices.
deltas: The cost function delta mapping pairs of elements
to replacement/deletion/insertion costs OR
A map which contains for any operation name such a function.
Parameters
----------
x: list
A symbol list for the left indices.
y: list
A symbol list for the right indices.
deltas: function or dictionary
The cost function delta mapping pairs of elements to
replacement/deletion/insertion costs OR
A map which contains such a function for every operation name.
Returns
-------
cost: float
The cost assigned by deltas to this tuple.
Returns: The cost assigned by deltas to this tuple.
"""
if(self._left >= 0):
left = x[self._left]
......@@ -75,16 +88,23 @@ class Tuple:
and right indices in addition to the respective labels in x and y,
and in addition to the tuple cost.
Args:
x: A symbol list for the left indices.
y: A symbol list for the right indices.
deltas: (optional) The cost function delta mapping pairs of elements
to replacement/deletion/insertion costs OR
A map which contains for any operation name such a function.
If provided, the cost for any operation is rendered as
well.
Parameters
----------
x: list
A symbol list for the left indices.
y: list
A symbol list for the right indices.
deltas: function or dictionary (default = None)
The cost function delta mapping pairs of elements to
replacement/deletion/insertion costs OR
A map which contains such a function for every operation name.
If provided, the cost for any operation is rendered as well.
Returns
-------
repr: str
A string representing this tuple.
Returns: A string representing this tuple.
"""
op_str = ''
if(self._name):
......@@ -142,17 +162,23 @@ class Alignment(list):
tree alignments, with the additional requirement that aligned indices must
respect the structure of the tree, i.e. if i is aligned to j and i2 to j2,
then i can only be a parent of i2 if j is a parent of j2 (and vice versa).
"""
def __init__(self):
list.__init__(self, [])
def append_tuple(self, left, right, op = None):
""" Appends a new tuple to the current trace.
""" Appends a new tuple to the current Alignment.
Parameters
----------
left: int
the left index.
right: int
the right index.
op: str (default = None)
a name for the underlying edit operation.
Args:
left: the left index.
right: the right index.
op: (optional) a name for the underlying edit operation.
"""
self.append(Tuple(op, left, right))
......@@ -160,14 +186,22 @@ class Alignment(list):
""" Computes the cost of this trace. This is equivalent to
the sum of the cost of all tuples in this trace.
Args:
x: A symbol list for the left indices.
y: A symbol list for the right indices.
deltas: The cost function delta mapping pairs of elements
to replacement/deletion/insertion costs OR
A map which contains for any operation name such a function.
Parameters
----------
x: list
A symbol list for the left indices.
y: list
A symbol list for the right indices.
deltas: function or dictionary
The cost function delta mapping pairs of elements to
replacement/deletion/insertion costs OR
A map which contains such a function for every operation name.
Returns
-------
cost: float
The cost assigned by deltas to this Alignment.
Returns: The cost assigned by deltas to this trace.
"""
d = 0.
for op in self:
......@@ -181,16 +215,23 @@ class Alignment(list):
calling 'render' on all tuples in this trace and joining the
resulting strings with newlines.
Args:
x: A symbol list for the left indices.
y: A symbol list for the right indices.
deltas: (optional) The cost function delta mapping pairs of elements
to replacement/deletion/insertion costs OR
A map which contains for any operation name such a function.
If provided, the cost for any operation is rendered as
well.
Parameters
----------
x: list
A symbol list for the left indices.
y: list
A symbol list for the right indices.
deltas: function or dictionary (default = None)
The cost function delta mapping pairs of elements to
replacement/deletion/insertion costs OR
A map which contains such a function for every operation name.
If provided, the cost for any operation is rendered as well.
Returns
-------
repr: str
A string representing this Alignment.
Returns: A string representing this trace.
"""
render = []
for op in self:
......
......@@ -6,24 +6,25 @@ Tree Edit Distance Learning via Adaptive Symbol Embeddings. Proceedings of
the 35th International Conference on Machine Learning (ICML 2018).
URL: http://proceedings.mlr.press/v80/paassen18a.html
Copyright (C) 2019
Benjamin Paaßen
AG Machine Learning
Bielefeld University
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
# Copyright (C) 2019-2020
# Benjamin Paaßen
# AG Machine Learning
# Bielefeld University
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import numpy as np
from scipy.optimize import minimize
from scipy.spatial.distance import pdist, squareform
......@@ -33,9 +34,9 @@ import edist.sed as sed
import edist.multiprocess as mp
__author__ = 'Benjamin Paaßen'
__copyright__ = 'Copyright 2019, Benjamin Paaßen'
__copyright__ = 'Copyright 2019-2020, Benjamin Paaßen'
__license__ = 'GPLv3'
__version__ = '1.0.0'
__version__ = '1.1.0'
__maintainer__ = 'Benjamin Paaßen'
__email__ = 'bpaassen@techfak.uni-bielefeld.de'
......@@ -51,25 +52,37 @@ class BEDL(BaseEstimator, ClassifierMixin):
input symbols which yields an edit distance that makes classification
with this classifier easier.
Attributes:
K: The number of prototypes for the MGLVQ classifier.
T: The number of learning epochs we use at most. Defaults to 5.
phi: A squashing function to post-process each error term. Defaults
to the identity.
phi_grad: The gradient function corresponding to phi.
distance: The edit distance function that shall be learned. Defaults to
the sequence edit distance sed.sed.
distance_backtrace: The matrix backtracing function for the distance.
Defaults to sed.sed_backtrace_matrix. Note that this currently
does NOT support ADP because ADP returns a different backtracing
format.
_classifier: The learned MGLVQ classifier model.
_idx: A mapping from alphabet to indices.
_embedding: A len(alphabet) x len(alphabet) - 1 embedding matrix
for all symbols in the alphabet.
_delta_obj: An internal object to make storing of the delta function
more efficient.
_delta: The learned delta function.
Attributes
----------
K: int
The number of prototypes for the MGLVQ classifier.
T: int (default = 5)
The number of learning epochs we use at most. Defaults to 5.
phi: function (default = identity)
A squashing function to post-process each error term. Defaults to the
identity.
phi_grad: function (default = one)
The gradient function corresponding to phi.
distance: function (default = sed.sed)
The edit distance function that shall be learned. Defaults to the
sequence edit distance sed.sed.
distance_backtrace: function (default = sed.sed_backtrace_matrix)
The matrix backtracing function for the distance.
Defaults to sed.sed_backtrace_matrix. Note that this currently does NOT
support ADP because ADP returns a different backtracing format.
_classifier: class proto_dist_ml.MGLVQ
The learned MGLVQ classifier model.
_idx: dictionary
A mapping from alphabet to indices.
_embedding: array_like
A len(alphabet) x len(alphabet) - 1 embedding matrix for all symbols in
the alphabet.
_delta_obj: class bedl.EmbeddingDelta
An internal object to make storing of the delta function more
efficient.
_delta: function
The learned delta function.
"""
def __init__(self, K, T = 5, phi = None, phi_grad = None, distance = None, distance_backtrace = None):
self.K = K
......@@ -101,11 +114,20 @@ class BEDL(BaseEstimator, ClassifierMixin):
For more details, please refer to the ICML 2018 paper.
Args:
X: a list of data points, each being either a list or a tree,
depending on the edit distance that shall be learned.
y: an array-like or list-like structure with labels for each
data point.
Parameters
----------
X: list
a list of data points, each being either a list or a tree,
depending on the edit distance that shall be learned.
y: array_like or list
an array-like or list-like structure with labels for each
data point.
Returns
-------
class bedl.BEDL
self