# Copyright (C) 2013 Oskar Maier
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# author Oskar Maier
# version r0.1.0
# since 2011-12-01
# status Release
# build-in modules
import math
# third-party modules
import scipy
# own modules
# code
# ////////////////////////////// #
# Bin-by-bin comparison measures #
# ////////////////////////////// #
[docs]def minowski(h1, h2, p = 2): # 46..45..14,11..43..44 / 45 us for p=int(-inf..-24..-1,1..24..inf) / float @array, +20 us @list \w 100 bins
r"""
Minowski distance.
With :math:`p=2` equal to the Euclidean distance, with :math:`p=1` equal to the Manhattan distance,
and the Chebyshev distance implementation represents the case of :math:`p=\pm inf`.
The Minowksi distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is
defined as:
.. math::
d_p(H, H') = \left(\sum_{m=1}^M|H_m - H'_m|^p
\right)^{\frac{1}{p}}
*Attributes:*
- a real metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, \sqrt[p]{2}]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram.
p : float
The :math:`p` value in the Minowksi distance formula.
Returns
-------
minowski : float
Minowski distance.
Raises
------
ValueError
If ``p`` is zero.
"""
h1, h2 = __prepare_histogram(h1, h2)
if 0 == p: raise ValueError('p can not be zero')
elif int == type(p):
if p > 0 and p < 25: return __minowski_low_positive_integer_p(h1, h2, p)
elif p < 0 and p > -25: return __minowski_low_negative_integer_p(h1, h2, p)
return math.pow(scipy.sum(scipy.power(scipy.absolute(h1 - h2), p)), 1./p)
def __minowski_low_positive_integer_p(h1, h2, p = 2): # 11..43 us for p = 1..24 \w 100 bins
"""
A faster implementation of the Minowski distance for positive integer < 25.
@note do not use this function directly, but the general @link minowski() method.
@note the passed histograms must be scipy arrays.
"""
mult = scipy.absolute(h1 - h2)
dif = mult
for _ in range(p - 1): dif = scipy.multiply(dif, mult)
return math.pow(scipy.sum(dif), 1./p)
def __minowski_low_negative_integer_p(h1, h2, p = 2): # 14..46 us for p = -1..-24 \w 100 bins
"""
A faster implementation of the Minowski distance for negative integer > -25.
@note do not use this function directly, but the general @link minowski() method.
@note the passed histograms must be scipy arrays.
"""
mult = scipy.absolute(h1 - h2)
dif = mult
for _ in range(-p + 1): dif = scipy.multiply(dif, mult)
return math.pow(scipy.sum(1./dif), 1./p)
[docs]def manhattan(h1, h2): # # 7 us @array, 31 us @list \w 100 bins
r"""
Equal to Minowski distance with :math:`p=1`.
See also
--------
minowski
"""
h1, h2 = __prepare_histogram(h1, h2)
return scipy.sum(scipy.absolute(h1 - h2))
[docs]def euclidean(h1, h2): # 9 us @array, 33 us @list \w 100 bins
r"""
Equal to Minowski distance with :math:`p=2`.
See also
--------
minowski
"""
h1, h2 = __prepare_histogram(h1, h2)
return math.sqrt(scipy.sum(scipy.square(scipy.absolute(h1 - h2))))
[docs]def chebyshev(h1, h2): # 12 us @array, 36 us @list \w 100 bins
r"""
Chebyshev distance.
Also Tchebychev distance, Maximum or :math:`L_{\infty}` metric; equal to Minowski
distance with :math:`p=+\infty`. For the case of :math:`p=-\infty`, use `chebyshev_neg`.
The Chebyshev distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is
defined as:
.. math::
d_{\infty}(H, H') = \max_{m=1}^M|H_m-H'_m|
*Attributes:*
- semimetric (triangle equation satisfied?)
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram.
Returns
-------
chebyshev : float
Chebyshev distance.
See also
--------
minowski, chebyshev_neg
"""
h1, h2 = __prepare_histogram(h1, h2)
return max(scipy.absolute(h1 - h2))
[docs]def chebyshev_neg(h1, h2): # 12 us @array, 36 us @list \w 100 bins
r"""
Chebyshev negative distance.
Also Tchebychev distance, Minimum or :math:`L_{-\infty}` metric; equal to Minowski
distance with :math:`p=-\infty`. For the case of :math:`p=+\infty`, use `chebyshev`.
The Chebyshev distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is
defined as:
.. math::
d_{-\infty}(H, H') = \min_{m=1}^M|H_m-H'_m|
*Attributes:*
- semimetric (triangle equation satisfied?)
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram.
Returns
-------
chebyshev_neg : float
Chebyshev negative distance.
See also
--------
minowski, chebyshev
"""
h1, h2 = __prepare_histogram(h1, h2)
return min(scipy.absolute(h1 - h2))
[docs]def histogram_intersection(h1, h2): # 6 us @array, 30 us @list \w 100 bins
r"""
Calculate the common part of two histograms.
The histogram intersection between two histograms :math:`H` and :math:`H'` of size :math:`m` is
defined as:
.. math::
d_{\cap}(H, H') = \sum_{m=1}^M\min(H_m, H'_m)
*Attributes:*
- a real metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 1`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
histogram_intersection : float
Intersection between the two histograms.
"""
h1, h2 = __prepare_histogram(h1, h2)
return scipy.sum(scipy.minimum(h1, h2))
[docs]def histogram_intersection_1(h1, h2): # 7 us @array, 31 us @list \w 100 bins
r"""
Turns the histogram intersection similarity into a distance measure for normalized,
positive histograms.
.. math::
d_{\bar{\cos}}(H, H') = 1 - d_{\cap}(H, H')
See `histogram_intersection` for the definition of :math:`d_{\cap}(H, H')`.
*Attributes:*
- semimetric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
histogram_intersection : float
Intersection between the two histograms.
"""
return 1. - histogram_intersection(h1, h2)
[docs]def relative_deviation(h1, h2): # 18 us @array, 42 us @list \w 100 bins
r"""
Calculate the deviation between two histograms.
The relative deviation between two histograms :math:`H` and :math:`H'` of size :math:`m` is
defined as:
.. math::
d_{rd}(H, H') =
\frac{
\sqrt{\sum_{m=1}^M(H_m - H'_m)^2}
}{
\frac{1}{2}
\left(
\sqrt{\sum_{m=1}^M H_m^2} +
\sqrt{\sum_{m=1}^M {H'}_m^2}
\right)
}
*Attributes:*
- semimetric (triangle equation satisfied?)
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, \sqrt{2}]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, 2]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram, same bins as ``h1``.
Returns
-------
relative_deviation : float
Relative deviation between the two histograms.
"""
h1, h2 = __prepare_histogram(h1, h2)
numerator = math.sqrt(scipy.sum(scipy.square(h1 - h2)))
denominator = (math.sqrt(scipy.sum(scipy.square(h1))) + math.sqrt(scipy.sum(scipy.square(h2)))) / 2.
return numerator / denominator
[docs]def relative_bin_deviation(h1, h2): # 79 us @array, 104 us @list \w 100 bins
r"""
Calculate the bin-wise deviation between two histograms.
The relative bin deviation between two histograms :math:`H` and :math:`H'` of size
:math:`m` is defined as:
.. math::
d_{rbd}(H, H') = \sum_{m=1}^M
\frac{
\sqrt{(H_m - H'_m)^2}
}{
\frac{1}{2}
\left(
\sqrt{H_m^2} +
\sqrt{{H'}_m^2}
\right)
}
*Attributes:*
- a real metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram, same bins as ``h1``.
Returns
-------
relative_bin_deviation : float
Relative bin deviation between the two histograms.
"""
h1, h2 = __prepare_histogram(h1, h2)
numerator = scipy.sqrt(scipy.square(h1 - h2))
denominator = (scipy.sqrt(scipy.square(h1)) + scipy.sqrt(scipy.square(h2))) / 2.
old_err_state = scipy.seterr(invalid='ignore') # divide through zero only occurs when the bin is zero in both histograms, in which case the division is 0/0 and leads to (and should lead to) 0
result = numerator / denominator
scipy.seterr(**old_err_state)
result[scipy.isnan(result)] = 0 # faster than scipy.nan_to_num, which checks for +inf and -inf also
return scipy.sum(result)
[docs]def chi_square(h1, h2): # 23 us @array, 49 us @list \w 100
r"""
Chi-square distance.
Measure how unlikely it is that one distribution (histogram) was drawn from the
other. The Chi-square distance between two histograms :math:`H` and :math:`H'` of size
:math:`m` is defined as:
.. math::
d_{\chi^2}(H, H') = \sum_{m=1}^M
\frac{
(H_m - H'_m)^2
}{
H_m + H'_m
}
*Attributes:*
- semimetric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 2]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram.
Returns
-------
chi_square : float
Chi-square distance.
"""
h1, h2 = __prepare_histogram(h1, h2)
old_err_state = scipy.seterr(invalid='ignore') # divide through zero only occurs when the bin is zero in both histograms, in which case the division is 0/0 and leads to (and should lead to) 0
result = scipy.square(h1 - h2) / (h1 + h2)
scipy.seterr(**old_err_state)
result[scipy.isnan(result)] = 0 # faster than scipy.nan_to_num, which checks for +inf and -inf also
return scipy.sum(result)
[docs]def kullback_leibler(h1, h2): # 83 us @array, 109 us @list \w 100 bins
r"""
Kullback-Leibler divergence.
Compute how inefficient it would to be code one histogram into another.
Actually computes :math:`\frac{d_{KL}(h1, h2) + d_{KL}(h2, h1)}{2}` to achieve symmetry.
The Kullback-Leibler divergence between two histograms :math:`H` and :math:`H'` of size
:math:`m` is defined as:
.. math::
d_{KL}(H, H') = \sum_{m=1}^M H_m\log\frac{H_m}{H'_m}
*Attributes:*
- quasimetric (but made symetric)
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, where h1[i] > 0 for any i such that h2[i] > 0, normalized.
h2 : sequence
The second histogram, where h2[i] > 0 for any i such that h1[i] > 0, normalized, same bins as ``h1``.
Returns
-------
kullback_leibler : float
Kullback-Leibler divergence.
"""
old_err_state = scipy.seterr(divide='raise')
try:
h1, h2 = __prepare_histogram(h1, h2)
result = (__kullback_leibler(h1, h2) + __kullback_leibler(h2, h1)) / 2.
scipy.seterr(**old_err_state)
return result
except FloatingPointError:
scipy.seterr(**old_err_state)
raise ValueError('h1 can only contain zero values where h2 also contains zero values and vice-versa')
def __kullback_leibler(h1, h2): # 36.3 us
"""
The actual KL implementation. @see kullback_leibler() for details.
Expects the histograms to be of type scipy.ndarray.
"""
result = h1.astype(scipy.float_)
mask = h1 != 0
result[mask] = scipy.multiply(h1[mask], scipy.log(h1[mask] / h2[mask]))
return scipy.sum(result)
[docs]def jensen_shannon(h1, h2): # 85 us @array, 110 us @list \w 100 bins
r"""
Jensen-Shannon divergence.
A symmetric and numerically more stable empirical extension of the Kullback-Leibler
divergence.
The Jensen Shannon divergence between two histograms :math:`H` and :math:`H'` of size
:math:`m` is defined as:
.. math::
d_{JSD}(H, H') =
\frac{1}{2} d_{KL}(H, H^*) +
\frac{1}{2} d_{KL}(H', H^*)
with :math:`H^*=\frac{1}{2}(H + H')`.
*Attributes:*
- semimetric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, \infty)`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram, same bins as ``h1``.
Returns
-------
jensen_shannon : float
Jensen-Shannon divergence.
"""
h1, h2 = __prepare_histogram(h1, h2)
s = (h1 + h2) / 2.
return __kullback_leibler(h1, s) / 2. + __kullback_leibler(h2, s) / 2.
[docs]def fidelity_based(h1, h2): # 25 us @array, 51 us @list \w 100 bins
r"""
Fidelity based distance.
Also Bhattacharyya distance; see also the extensions `noelle_1` to `noelle_5`.
The metric between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as:
.. math::
d_{F}(H, H') = \sum_{m=1}^M\sqrt{H_m * H'_m}
*Attributes:*
- not a metric, a similarity
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 1`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
fidelity_based : float
Fidelity based distance.
Notes
-----
The fidelity between two histograms :math:`H` and :math:`H'` is the same as the
cosine between their square roots :math:`\sqrt{H}` and :math:`\sqrt{H'}`.
"""
h1, h2 = __prepare_histogram(h1, h2)
result = scipy.sum(scipy.sqrt(h1 * h2))
result = 0 if 0 > result else result # for rounding errors
result = 1 if 1 < result else result # for rounding errors
return result
[docs]def noelle_1(h1, h2): # 26 us @array, 52 us @list \w 100 bins
r"""
Extension of `fidelity_based` proposed by [1]_.
.. math::
d_{\bar{F}}(H, H') = 1 - d_{F}(H, H')
See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.
*Attributes:*
- semimetric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
fidelity_based : float
Fidelity based distance.
References
----------
.. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
"""
return 1. - fidelity_based(h1, h2)
[docs]def noelle_2(h1, h2): # 26 us @array, 52 us @list \w 100 bins
r"""
Extension of `fidelity_based` proposed by [1]_.
.. math::
d_{\sqrt{1-F}}(H, H') = \sqrt{1 - d_{F}(H, H')}
See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.
*Attributes:*
- metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
fidelity_based : float
Fidelity based distance.
References
----------
.. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
"""
return math.sqrt(1. - fidelity_based(h1, h2))
[docs]def noelle_3(h1, h2): # 26 us @array, 52 us @list \w 100 bins
r"""
Extension of `fidelity_based` proposed by [1]_.
.. math::
d_{\log(2-F)}(H, H') = \log(2 - d_{F}(H, H'))
See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.
*Attributes:*
- semimetric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, log(2)]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
fidelity_based : float
Fidelity based distance.
References
----------
.. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
"""
return math.log(2 - fidelity_based(h1, h2))
[docs]def noelle_4(h1, h2): # 26 us @array, 52 us @list \w 100 bins
r"""
Extension of `fidelity_based` proposed by [1]_.
.. math::
d_{\arccos F}(H, H') = \frac{2}{\pi} \arccos d_{F}(H, H')
See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.
*Attributes:*
- metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
fidelity_based : float
Fidelity based distance.
References
----------
.. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
"""
return 2. / math.pi * math.acos(fidelity_based(h1, h2))
[docs]def noelle_5(h1, h2): # 26 us @array, 52 us @list \w 100 bins
r"""
Extension of `fidelity_based` proposed by [1]_.
.. math::
d_{\sin F}(H, H') = \sqrt{1 -d_{F}^2(H, H')}
See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.
*Attributes:*
- metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
fidelity_based : float
Fidelity based distance.
References
----------
.. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
"""
return math.sqrt(1 - math.pow(fidelity_based(h1, h2), 2))
[docs]def cosine_alt(h1, h2): # 17 us @array, 42 us @list \w 100 bins
r"""
Alternative implementation of the `cosine` distance measure.
Notes
-----
Under development.
"""
h1, h2 = __prepare_histogram(h1, h2)
return -1 * float(scipy.sum(h1 * h2)) / (scipy.sum(scipy.power(h1, 2)) * scipy.sum(scipy.power(h2, 2)))
[docs]def cosine(h1, h2): # 17 us @array, 42 us @list \w 100 bins
r"""
Cosine simmilarity.
Compute the angle between the two histograms in vector space irrespective of their
length. The cosine similarity between two histograms :math:`H` and :math:`H'` of size
:math:`m` is defined as:
.. math::
d_{\cos}(H, H') = \cos\alpha = \frac{H * H'}{\|H\| \|H'\|} = \frac{\sum_{m=1}^M H_m*H'_m}{\sqrt{\sum_{m=1}^M H_m^2} * \sqrt{\sum_{m=1}^M {H'}_m^2}}
*Attributes:*
- not a metric, a similarity
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 1`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[-1, 1]`
- :math:`d(H, H) = 1`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram, same bins as ``h1``.
Returns
-------
cosine : float
Cosine simmilarity.
Notes
-----
The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning
exactly the same, with 0 usually indicating independence, and in-between values
indicating intermediate similarity or dissimilarity.
"""
h1, h2 = __prepare_histogram(h1, h2)
return scipy.sum(h1 * h2) / math.sqrt(scipy.sum(scipy.square(h1)) * scipy.sum(scipy.square(h2)))
[docs]def cosine_1(h1, h2): # 18 us @array, 43 us @list \w 100 bins
r"""
Cosine simmilarity.
Turns the cosine similarity into a distance measure for normalized, positive
histograms.
.. math::
d_{\bar{\cos}}(H, H') = 1 - d_{\cos}(H, H')
See `cosine` for the definition of :math:`d_{\cos}(H, H')`.
*Attributes:*
- metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
cosine : float
Cosine distance.
"""
return 1. - cosine(h1, h2)
[docs]def cosine_2(h1, h2): # 19 us @array, 44 us @list \w 100 bins
r"""
Cosine simmilarity.
Turns the cosine similarity into a distance measure for normalized, positive
histograms.
.. math::
d_{\bar{\cos}}(H, H') = 1 - \frac{2*\arccos d_{\cos}(H, H')}{pi}
See `cosine` for the definition of :math:`d_{\cos}(H, H')`.
*Attributes:*
- metric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- not applicable
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram, normalized.
h2 : sequence
The second histogram, normalized, same bins as ``h1``.
Returns
-------
cosine : float
Cosine distance.
"""
return 1. - (2 * cosine(h1, h2)) / math.pi
[docs]def correlate(h1, h2): # 31 us @array, 55 us @list \w 100 bins
r"""
Correlation between two histograms.
The histogram correlation between two histograms :math:`H` and :math:`H'` of size :math:`m`
is defined as:
.. math::
d_{corr}(H, H') =
\frac{
\sum_{m=1}^M (H_m-\bar{H}) \cdot (H'_m-\bar{H'})
}{
\sqrt{\sum_{m=1}^M (H_m-\bar{H})^2 \cdot \sum_{m=1}^M (H'_m-\bar{H'})^2}
}
with :math:`\bar{H}` and :math:`\bar{H'}` being the mean values of :math:`H` resp. :math:`H'`
*Attributes:*
- not a metric, a similarity
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[-1, 1]`
- :math:`d(H, H) = 1`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[-1, 1]`
- :math:`d(H, H) = 1`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram, same bins as ``h1``.
Returns
-------
correlate : float
Correlation between the histograms.
Notes
-----
Returns 0 if one of h1 or h2 contain only zeros.
"""
h1, h2 = __prepare_histogram(h1, h2)
h1m = h1 - scipy.sum(h1) / float(h1.size)
h2m = h2 - scipy.sum(h2) / float(h2.size)
a = scipy.sum(scipy.multiply(h1m, h2m))
b = math.sqrt(scipy.sum(scipy.square(h1m)) * scipy.sum(scipy.square(h2m)))
return 0 if 0 == b else a / b
[docs]def correlate_1(h1, h2): # 32 us @array, 56 us @list \w 100 bins
r"""
Correlation distance.
Turns the histogram correlation into a distance measure for normalized, positive
histograms.
.. math::
d_{\bar{corr}}(H, H') = 1-\frac{d_{corr}(H, H')}{2}.
See `correlate` for the definition of :math:`d_{corr}(H, H')`.
*Attributes:*
- semimetric
*Attributes for normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-normalized histograms:*
- :math:`d(H, H')\in[0, 1]`
- :math:`d(H, H) = 0`
- :math:`d(H, H') = d(H', H)`
*Attributes for not-equal histograms:*
- not applicable
Parameters
----------
h1 : sequence
The first histogram.
h2 : sequence
The second histogram, same bins as ``h1``.
Returns
-------
correlate : float
Correlation distnace between the histograms.
Notes
-----
Returns 0.5 if one of h1 or h2 contains only zeros.
"""
return (1. - correlate(h1, h2))/2.
# ///////////////////////////// #
# Cross-bin comparison measures #
# ///////////////////////////// #
def __quadratic_forms_matrix_euclidean(h1, h2):
r"""
Compute the bin-similarity matrix for the quadratic form distance measure.
The matric :math:`A` for two histograms :math:`H` and :math:`H'` of size :math:`m` and
:math:`n` respectively is defined as
.. math::
A_{m,n} = 1 - \frac{d_2(H_m, {H'}_n)}{d_{max}}
with
.. math::
d_{max} = \max_{m,n}d_2(H_m, {H'}_n)
See also
--------
quadratic_forms
"""
A = scipy.repeat(h2[:,scipy.newaxis], h1.size, 1) # repeat second array to form a matrix
A = scipy.absolute(A - h1) # euclidean distances
return 1 - (A / float(A.max()))
# //////////////// #
# Helper functions #
# //////////////// #
def __prepare_histogram(h1, h2):
"""Convert the histograms to scipy.ndarrays if required."""
h1 = h1 if scipy.ndarray == type(h1) else scipy.asarray(h1)
h2 = h2 if scipy.ndarray == type(h2) else scipy.asarray(h2)
if h1.shape != h2.shape or h1.size != h2.size:
raise ValueError('h1 and h2 must be of same shape and size')
return h1, h2