# Source code for medpy.metric.histogram

# Copyright (C) 2013 Oskar Maier
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# author Oskar Maier
# version r0.1.0
# since 2011-12-01
# status Release

# built-in modules
import math

# third-party modules
import numpy
import scipy

# own modules

# code
# ////////////////////////////// #
# Bin-by-bin comparison measures #
# ////////////////////////////// #

def minowski(h1, h2, p = 2): # 46..45..14,11..43..44 / 45 us for p=int(-inf..-24..-1,1..24..inf) / float @array, +20 us @list \w 100 bins
    r"""
    Minowski distance.

    With :math:`p=2` equal to the Euclidean distance, with :math:`p=1` equal to the
    Manhattan distance, and with :math:`p=\pm\infty` equal to the Chebyshev distance
    (maximum resp. minimum absolute bin difference).

    The Minowski distance between two histograms :math:`H` and :math:`H'` of size
    :math:`m` is defined as:

    .. math::

        d_p(H, H') = \left(\sum_{m=1}^M|H_m - H'_m|^p\right)^{\frac{1}{p}}

    *Attributes:*

    - a real metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, \sqrt[p]{2}]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram.
    p : float
        The :math:`p` value in the Minowski distance formula. Must not be zero;
        positive or negative infinity selects the maximum resp. minimum
        absolute bin difference.

    Returns
    -------
    minowski : float
        Minowski distance.

    Raises
    ------
    ValueError
        If ``p`` is zero, or if the histograms differ in shape.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    if 0 == p:
        raise ValueError('p can not be zero')

    # small integer exponents are served by the faster repeated-multiplication helpers
    if isinstance(p, int) and -25 < p < 25:
        if p > 0:
            return __minowski_low_positive_integer_p(h1, h2, p)
        return __minowski_low_negative_integer_p(h1, h2, p)

    # Work on floats: integer arrays can not be raised to negative integer
    # powers and silently overflow for large positive ones.
    diff = numpy.absolute(h1 - h2).astype(float)

    # The limit cases can not be evaluated through the power formula.
    if p == float('inf'):
        return float(numpy.max(diff))
    if p == -float('inf'):
        return float(numpy.min(diff))

    return math.pow(numpy.sum(numpy.power(diff, p)), 1./p)

def __minowski_low_positive_integer_p(h1, h2, p = 2): # 11..43 us for p = 1..24 \w 100 bins
    """
    A faster implementation of the Minowski distance for positive integer < 25.

    Computes ``|h1 - h2| ** p`` through ``p - 1`` element-wise multiplications
    instead of a generic power call, then takes the p-th root of the sum.

    @note do not use this function directly, but the general @link minowski() method.
    @note the passed histograms must be numpy arrays.
    """
    # floats avoid the silent wrap-around of integer arrays for larger exponents
    base = numpy.absolute(h1 - h2).astype(float)
    powered = base
    for _ in range(p - 1):
        powered = numpy.multiply(powered, base)
    return math.pow(numpy.sum(powered), 1./p)

def __minowski_low_negative_integer_p(h1, h2, p = 2): # 14..46 us for p = -1..-24 \w 100 bins
    """
    A faster implementation of the Minowski distance for negative integer > -25.

    Computes ``|h1 - h2| ** |p|`` through ``|p| - 1`` element-wise
    multiplications, sums the reciprocals (i.e. ``|h1 - h2| ** p``) and takes
    the p-th root of that sum.

    @note do not use this function directly, but the general @link minowski() method.
    @note the passed histograms must be numpy arrays.
    """
    # floats avoid the silent wrap-around of integer arrays for larger exponents
    base = numpy.absolute(h1 - h2).astype(float)
    powered = base
    # |p| - 1 multiplications raise the base to the |p|-th power
    for _ in range(-p - 1):
        powered = numpy.multiply(powered, base)
    return math.pow(numpy.sum(1./powered), 1./p)

def manhattan(h1, h2): # # 7 us @array, 31 us @list \w 100 bins
    r"""
    Manhattan distance: the sum of the absolute bin-wise differences.

    Equal to Minowski distance with :math:`p=1`.

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same shape as ``h1``.

    Returns
    -------
    manhattan : float
        Manhattan distance.

    See also
    --------
    minowski
    """
    h1, h2 = __prepare_histogram(h1, h2)
    return numpy.sum(numpy.absolute(h1 - h2))

def euclidean(h1, h2): # 9 us @array, 33 us @list \w 100 bins
    r"""
    Euclidean distance: the square root of the summed squared bin-wise differences.

    Equal to Minowski distance with :math:`p=2`.

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same shape as ``h1``.

    Returns
    -------
    euclidean : float
        Euclidean distance.

    See also
    --------
    minowski
    """
    h1, h2 = __prepare_histogram(h1, h2)
    return math.sqrt(numpy.sum(numpy.square(numpy.absolute(h1 - h2))))

def chebyshev(h1, h2): # 12 us @array, 36 us @list \w 100 bins
    r"""
    Chebyshev distance.

    Also Tchebychev distance, Maximum or :math:`L_{\infty}` metric; equal to Minowski
    distance with :math:`p=+\infty`. For the case of :math:`p=-\infty`, use
    `chebyshev_neg`.

    The Chebyshev distance between two histograms :math:`H` and :math:`H'` of size
    :math:`m` is defined as:

    .. math::

        d_{\infty}(H, H') = \max_{m=1}^M|H_m-H'_m|

    *Attributes:*

    - semimetric (triangle equation satisfied?)

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram.

    Returns
    -------
    chebyshev : float
        Chebyshev distance.

    See also
    --------
    minowski, chebyshev_neg
    """
    h1, h2 = __prepare_histogram(h1, h2)
    # numpy.max reduces over all elements, whatever the array's dimensionality
    return numpy.max(numpy.absolute(h1 - h2))

def chebyshev_neg(h1, h2): # 12 us @array, 36 us @list \w 100 bins
    r"""
    Chebyshev negative distance.

    Also Tchebychev distance, Minimum or :math:`L_{-\infty}` metric; equal to Minowski
    distance with :math:`p=-\infty`. For the case of :math:`p=+\infty`, use
    `chebyshev`.

    The Chebyshev negative distance between two histograms :math:`H` and :math:`H'`
    of size :math:`m` is defined as:

    .. math::

        d_{-\infty}(H, H') = \min_{m=1}^M|H_m-H'_m|

    *Attributes:*

    - semimetric (triangle equation satisfied?)

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram.

    Returns
    -------
    chebyshev_neg : float
        Chebyshev negative distance.

    See also
    --------
    minowski, chebyshev
    """
    h1, h2 = __prepare_histogram(h1, h2)
    # numpy.min reduces over all elements, whatever the array's dimensionality
    return numpy.min(numpy.absolute(h1 - h2))

def histogram_intersection(h1, h2): # 6 us @array, 30 us @list \w 100 bins
    r"""
    Calculate the common part of two histograms.

    The histogram intersection between two histograms :math:`H` and :math:`H'` of
    size :math:`m` is defined as:

    .. math::

        d_{\cap}(H, H') = \sum_{m=1}^M\min(H_m, H'_m)

    *Attributes:*

    - a real metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 1`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    histogram_intersection : float
        Intersection between the two histograms.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    return numpy.sum(numpy.minimum(h1, h2))

def histogram_intersection_1(h1, h2): # 7 us @array, 31 us @list \w 100 bins
    r"""
    Turns the histogram intersection similarity into a distance measure for
    normalized, positive histograms.

    .. math::

        d_{\bar{\cap}}(H, H') = 1 - d_{\cap}(H, H')

    See `histogram_intersection` for the definition of :math:`d_{\cap}(H, H')`.

    *Attributes:*

    - semimetric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    histogram_intersection_1 : float
        One minus the intersection between the two histograms.
    """
    overlap = histogram_intersection(h1, h2)
    return 1. - overlap

def relative_deviation(h1, h2): # 18 us @array, 42 us @list \w 100 bins
    r"""
    Calculate the deviation between two histograms.

    The relative deviation between two histograms :math:`H` and :math:`H'` of size
    :math:`m` is defined as:

    .. math::

        d_{rd}(H, H') =
            \frac{
                \sqrt{\sum_{m=1}^M(H_m - H'_m)^2}
            }{
                \frac{1}{2}
                \left(
                    \sqrt{\sum_{m=1}^M H_m^2} +
                    \sqrt{\sum_{m=1}^M {H'}_m^2}
                \right)
            }

    *Attributes:*

    - semimetric (triangle equation satisfied?)

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, \sqrt{2}]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, 2]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    relative_deviation : float
        Relative deviation between the two histograms.

    Notes
    -----
    Returns 0 if both histograms contain only zeros.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    numerator = math.sqrt(numpy.sum(numpy.square(h1 - h2)))
    denominator = (math.sqrt(numpy.sum(numpy.square(h1))) + math.sqrt(numpy.sum(numpy.square(h2)))) / 2.
    # The mean norm vanishes only when both histograms are all-zero, i.e. equal;
    # report d(H, H) = 0 instead of dividing by zero.
    if 0 == denominator:
        return 0.
    return numerator / denominator

def relative_bin_deviation(h1, h2): # 79 us @array, 104 us @list \w 100 bins
    r"""
    Calculate the bin-wise deviation between two histograms.

    The relative bin deviation between two histograms :math:`H` and :math:`H'` of
    size :math:`m` is defined as:

    .. math::

        d_{rbd}(H, H') = \sum_{m=1}^M
            \frac{
                \sqrt{(H_m - H'_m)^2}
            }{
                \frac{1}{2}
                \left(
                    \sqrt{H_m^2} +
                    \sqrt{{H'}_m^2}
                \right)
            }

    *Attributes:*

    - a real metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    relative_bin_deviation : float
        Relative bin deviation between the two histograms.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    # sqrt(x^2) is simply |x|
    numerator = numpy.absolute(h1 - h2)
    denominator = (numpy.absolute(h1) + numpy.absolute(h2)) / 2.
    # divide through zero only occurs when the bin is zero in both histograms, in
    # which case the division is 0/0 and leads to (and should lead to) 0; the
    # context manager restores the error state even if the division raises
    with numpy.errstate(invalid='ignore'):
        result = numerator / denominator
    result[numpy.isnan(result)] = 0 # faster than numpy.nan_to_num, which checks for +inf and -inf also
    return numpy.sum(result)

def chi_square(h1, h2): # 23 us @array, 49 us @list \w 100
    r"""
    Chi-square distance.

    Measure how unlikely it is that one distribution (histogram) was drawn from the
    other. The Chi-square distance between two histograms :math:`H` and :math:`H'` of
    size :math:`m` is defined as:

    .. math::

        d_{\chi^2}(H, H') = \sum_{m=1}^M
            \frac{
                (H_m - H'_m)^2
            }{
                H_m + H'_m
            }

    *Attributes:*

    - semimetric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 2]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram.

    Returns
    -------
    chi_square : float
        Chi-square distance.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    # divide through zero only occurs when the bin is zero in both histograms, in
    # which case the division is 0/0 and leads to (and should lead to) 0; the
    # context manager restores the error state even if the division raises
    with numpy.errstate(invalid='ignore'):
        # true_divide yields a float result for integer histograms as well
        result = numpy.true_divide(numpy.square(h1 - h2), h1 + h2)
    result[numpy.isnan(result)] = 0 # faster than numpy.nan_to_num, which checks for +inf and -inf also
    return numpy.sum(result)

def kullback_leibler(h1, h2): # 83 us @array, 109 us @list \w 100 bins
    r"""
    Kullback-Leibler divergence.

    Compute how inefficient it would be to code one histogram into another.
    Actually computes :math:`\frac{d_{KL}(h1, h2) + d_{KL}(h2, h1)}{2}` to achieve
    symmetry.

    The Kullback-Leibler divergence between two histograms :math:`H` and :math:`H'`
    of size :math:`m` is defined as:

    .. math::

        d_{KL}(H, H') = \sum_{m=1}^M H_m\log\frac{H_m}{H'_m}

    *Attributes:*

    - quasimetric (but made symmetric)

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, where h1[i] > 0 for any i such that h2[i] > 0, normalized.
    h2 : sequence
        The second histogram, where h2[i] > 0 for any i such that h1[i] > 0,
        normalized, same bins as ``h1``.

    Returns
    -------
    kullback_leibler : float
        Kullback-Leibler divergence.

    Raises
    ------
    ValueError
        If one histogram holds a zero where the other does not, or if the
        histograms differ in shape.
    """
    # Let divisions by zero raise so that mismatching zero bins are detected;
    # the context manager restores the previous error state on every exit path.
    with numpy.errstate(divide='raise'):
        try:
            h1, h2 = __prepare_histogram(h1, h2)
            return (__kullback_leibler(h1, h2) + __kullback_leibler(h2, h1)) / 2.
        except FloatingPointError:
            raise ValueError('h1 can only contain zero values where h2 also contains zero values and vice-versa')

def __kullback_leibler(h1, h2): # 36.3 us
    """
    The actual (one-directional) KL implementation. @see kullback_leibler() for details.

    Sums ``h1 * log(h1 / h2)`` over all bins in which ``h1`` is non-zero; bins in
    which ``h1`` is zero contribute zero.
    Expects the histograms to be of type numpy.ndarray.
    """
    # float copy of h1: its zero bins already hold their final contribution of 0
    result = h1.astype(float)
    mask = h1 != 0
    # true_divide keeps the ratio fractional for integer histograms as well
    ratio = numpy.true_divide(h1[mask], h2[mask])
    result[mask] = numpy.multiply(h1[mask], numpy.log(ratio))
    return numpy.sum(result)

def jensen_shannon(h1, h2): # 85 us @array, 110 us @list \w 100 bins
    r"""
    Jensen-Shannon divergence.

    A symmetric and numerically more stable empirical extension of the
    Kullback-Leibler divergence.

    The Jensen-Shannon divergence between two histograms :math:`H` and :math:`H'` of
    size :math:`m` is defined as:

    .. math::

        d_{JSD}(H, H') =
            \frac{1}{2} d_{KL}(H, H^*) +
            \frac{1}{2} d_{KL}(H', H^*)

    with :math:`H^*=\frac{1}{2}(H + H')`.

    *Attributes:*

    - semimetric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, \infty)`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    jensen_shannon : float
        Jensen-Shannon divergence.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    # bin-wise mean of both histograms, the reference both are compared against
    mixture = (h1 + h2) / 2.
    first_half = __kullback_leibler(h1, mixture) / 2.
    second_half = __kullback_leibler(h2, mixture) / 2.
    return first_half + second_half

def fidelity_based(h1, h2): # 25 us @array, 51 us @list \w 100 bins
    r"""
    Fidelity based distance.

    Also Bhattacharyya distance; see also the extensions `noelle_1` to `noelle_5`.

    The metric between two histograms :math:`H` and :math:`H'` of size :math:`m` is
    defined as:

    .. math::

        d_{F}(H, H') = \sum_{m=1}^M\sqrt{H_m * H'_m}

    *Attributes:*

    - not a metric, a similarity

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 1`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    fidelity_based : float
        Fidelity based distance, clipped to :math:`[0, 1]`.

    Notes
    -----
    The fidelity between two histograms :math:`H` and :math:`H'` is the same as the
    cosine between their square roots :math:`\sqrt{H}` and :math:`\sqrt{H'}`.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    result = numpy.sum(numpy.sqrt(h1 * h2))
    result = 0 if 0 > result else result # for rounding errors
    result = 1 if 1 < result else result # for rounding errors
    return result

def noelle_1(h1, h2): # 26 us @array, 52 us @list \w 100 bins
    r"""
    Extension of `fidelity_based` proposed by [1]_.

    .. math::

        d_{\bar{F}}(H, H') = 1 - d_{F}(H, H')

    See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.

    *Attributes:*

    - semimetric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    noelle_1 : float
        One minus the fidelity of the two histograms.

    References
    ----------
    .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
    """
    fidelity = fidelity_based(h1, h2)
    return 1. - fidelity

def noelle_2(h1, h2): # 26 us @array, 52 us @list \w 100 bins
    r"""
    Extension of `fidelity_based` proposed by [1]_.

    .. math::

        d_{\sqrt{1-F}}(H, H') = \sqrt{1 - d_{F}(H, H')}

    See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.

    *Attributes:*

    - metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    noelle_2 : float
        Square root of one minus the fidelity of the two histograms.

    References
    ----------
    .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
    """
    fidelity = fidelity_based(h1, h2)
    return math.sqrt(1. - fidelity)

def noelle_3(h1, h2): # 26 us @array, 52 us @list \w 100 bins
    r"""
    Extension of `fidelity_based` proposed by [1]_.

    .. math::

        d_{\log(2-F)}(H, H') = \log(2 - d_{F}(H, H'))

    See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.

    *Attributes:*

    - semimetric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, \log(2)]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    noelle_3 : float
        Natural logarithm of two minus the fidelity of the two histograms.

    References
    ----------
    .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
    """
    fidelity = fidelity_based(h1, h2)
    return math.log(2 - fidelity)

def noelle_4(h1, h2): # 26 us @array, 52 us @list \w 100 bins
    r"""
    Extension of `fidelity_based` proposed by [1]_.

    .. math::

        d_{\arccos F}(H, H') = \frac{2}{\pi} \arccos d_{F}(H, H')

    See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.

    *Attributes:*

    - metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    noelle_4 : float
        Arc cosine of the fidelity of the two histograms, scaled by :math:`2/\pi`.

    References
    ----------
    .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
    """
    scale = 2. / math.pi
    fidelity = fidelity_based(h1, h2)
    return scale * math.acos(fidelity)

def noelle_5(h1, h2): # 26 us @array, 52 us @list \w 100 bins
    r"""
    Extension of `fidelity_based` proposed by [1]_.

    .. math::

        d_{\sin F}(H, H') = \sqrt{1 -d_{F}^2(H, H')}

    See `fidelity_based` for the definition of :math:`d_{F}(H, H')`.

    *Attributes:*

    - metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    noelle_5 : float
        Square root of one minus the squared fidelity of the two histograms.

    References
    ----------
    .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003
    """
    fidelity = fidelity_based(h1, h2)
    squared_fidelity = math.pow(fidelity, 2)
    return math.sqrt(1 - squared_fidelity)

def cosine_alt(h1, h2): # 17 us @array, 42 us @list \w 100 bins
    r"""
    Alternative implementation of the cosine distance measure.

    Returns the negated sum of the bin-wise products of ``h1`` and ``h2``, divided
    by the product of the two histograms' sums of squared bins.

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    cosine_alt : float
        The alternative cosine measure.

    Notes
    -----
    Under development.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    return -1 * float(numpy.sum(h1 * h2)) / (numpy.sum(numpy.power(h1, 2)) * numpy.sum(numpy.power(h2, 2)))

def cosine(h1, h2): # 17 us @array, 42 us @list \w 100 bins
    r"""
    Cosine similarity.

    Compute the angle between the two histograms in vector space irrespective of
    their length. The cosine similarity between two histograms :math:`H` and
    :math:`H'` of size :math:`m` is defined as:

    .. math::

        d_{\cos}(H, H') = \cos\alpha = \frac{H * H'}{\|H\| \|H'\|} = \frac{\sum_{m=1}^M H_m*H'_m}{\sqrt{\sum_{m=1}^M H_m^2} * \sqrt{\sum_{m=1}^M {H'}_m^2}}

    *Attributes:*

    - not a metric, a similarity

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 1`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[-1, 1]`
    - :math:`d(H, H) = 1`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    cosine : float
        Cosine similarity.

    Notes
    -----
    The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning
    exactly the same, with 0 usually indicating independence, and in-between values
    indicating intermediate similarity or dissimilarity.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    return numpy.sum(h1 * h2) / math.sqrt(numpy.sum(numpy.square(h1)) * numpy.sum(numpy.square(h2)))

def cosine_1(h1, h2): # 18 us @array, 43 us @list \w 100 bins
    r"""
    Cosine distance.

    Turns the cosine similarity into a distance measure for normalized, positive
    histograms.

    .. math::

        d_{\bar{\cos}}(H, H') = 1 - d_{\cos}(H, H')

    See `cosine` for the definition of :math:`d_{\cos}(H, H')`.

    *Attributes:*

    - metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    cosine_1 : float
        Cosine distance.
    """
    similarity = cosine(h1, h2)
    return 1. - similarity

def cosine_2(h1, h2): # 19 us @array, 44 us @list \w 100 bins
    r"""
    Angular cosine distance.

    Turns the cosine similarity into a distance measure for normalized, positive
    histograms by taking the angle between them, scaled to :math:`[0, 1]`.

    .. math::

        d_{\arccos\cos}(H, H') = \frac{2*\arccos d_{\cos}(H, H')}{\pi}

    See `cosine` for the definition of :math:`d_{\cos}(H, H')`.

    *Attributes:*

    - metric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - not applicable

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram, normalized.
    h2 : sequence
        The second histogram, normalized, same bins as ``h1``.

    Returns
    -------
    cosine_2 : float
        Angular cosine distance.

    Notes
    -----
    NOTE(review): earlier revisions returned :math:`1 - 2 d_{\cos}/\pi` (no arc
    cosine), which is :math:`1 - 2/\pi` rather than 0 for identical histograms.
    The arc cosine form is the one matching the attributes listed above; confirm
    that no caller relies on the previous values.
    """
    similarity = cosine(h1, h2)
    # rounding errors may push the cosine marginally outside of acos' domain
    if similarity > 1:
        similarity = 1.
    elif similarity < -1:
        similarity = -1.
    return 2. / math.pi * math.acos(similarity)

def correlate(h1, h2): # 31 us @array, 55 us @list \w 100 bins
    r"""
    Correlation between two histograms.

    The histogram correlation between two histograms :math:`H` and :math:`H'` of size
    :math:`m` is defined as:

    .. math::

        d_{corr}(H, H') =
            \frac{
                \sum_{m=1}^M (H_m-\bar{H}) \cdot (H'_m-\bar{H'})
            }{
                \sqrt{\sum_{m=1}^M (H_m-\bar{H})^2 \cdot \sum_{m=1}^M (H'_m-\bar{H'})^2}
            }

    with :math:`\bar{H}` and :math:`\bar{H'}` being the mean values of :math:`H` resp.
    :math:`H'`.

    *Attributes:*

    - not a metric, a similarity

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[-1, 1]`
    - :math:`d(H, H) = 1`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[-1, 1]`
    - :math:`d(H, H) = 1`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    correlate : float
        Correlation between the histograms.

    Notes
    -----
    Returns 0 if one of ``h1`` or ``h2`` contains only zeros (more generally,
    whenever the denominator of the formula is zero).
    """
    h1, h2 = __prepare_histogram(h1, h2)
    # centre both histograms around their respective mean bin value
    h1m = h1 - numpy.sum(h1) / float(h1.size)
    h2m = h2 - numpy.sum(h2) / float(h2.size)
    a = numpy.sum(numpy.multiply(h1m, h2m))
    b = math.sqrt(numpy.sum(numpy.square(h1m)) * numpy.sum(numpy.square(h2m)))
    return 0 if 0 == b else a / b

def correlate_1(h1, h2): # 32 us @array, 56 us @list \w 100 bins
    r"""
    Correlation distance.

    Turns the histogram correlation into a distance measure for normalized, positive
    histograms.

    .. math::

        d_{\bar{corr}}(H, H') = \frac{1 - d_{corr}(H, H')}{2}.

    See `correlate` for the definition of :math:`d_{corr}(H, H')`.

    *Attributes:*

    - semimetric

    *Attributes for normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-normalized histograms:*

    - :math:`d(H, H')\in[0, 1]`
    - :math:`d(H, H) = 0`
    - :math:`d(H, H') = d(H', H)`

    *Attributes for not-equal histograms:*

    - not applicable

    Parameters
    ----------
    h1 : sequence
        The first histogram.
    h2 : sequence
        The second histogram, same bins as ``h1``.

    Returns
    -------
    correlate_1 : float
        Correlation distance between the histograms.

    Notes
    -----
    Returns 0.5 if one of ``h1`` or ``h2`` contains only zeros.
    """
    correlation = correlate(h1, h2)
    return (1. - correlation)/2.

# ///////////////////////////// #
# Cross-bin comparison measures #
# ///////////////////////////// #

def quadratic_forms(h1, h2):
    r"""
    Quadratic forms metric.

    Notes
    -----
    UNDER DEVELOPMENT

    This distance measure shows very strange behaviour. The expression
    transpose(h1-h2) * A * (h1-h2) yields negative values that can not be processed
    by the square root. Some examples::

        h1        h2                                          transpose(h1-h2) * A * (h1-h2)
        [1, 0] to [0.0, 1.0] :                                -2.0
        [1, 0] to [0.5, 0.5] :                                 0.0
        [1, 0] to [0.6666666666666667, 0.3333333333333333] :   0.111111111111
        [1, 0] to [0.75, 0.25] :                               0.0833333333333
        [1, 0] to [0.8, 0.2] :                                 0.06
        [1, 0] to [0.8333333333333334, 0.16666666666666666] :  0.0444444444444
        [1, 0] to [0.8571428571428572, 0.14285714285714285] :  0.0340136054422
        [1, 0] to [0.875, 0.125] :                             0.0267857142857
        [1, 0] to [0.8888888888888888, 0.1111111111111111] :   0.0216049382716
        [1, 0] to [0.9, 0.1] :                                 0.0177777777778
        [1, 0] to [1, 0]:                                      0.0

    It is clearly undesirable to receive negative values and even worse to get a
    value of zero for other cases than the same histograms.
    """
    h1, h2 = __prepare_histogram(h1, h2)
    similarity = __quadratic_forms_matrix_euclidean(h1, h2)
    delta = h1 - h2
    # transpose(h1-h2) * A * (h1-h2)
    quadratic_form = delta.dot(similarity.dot(delta))
    return math.sqrt(quadratic_form)

def __quadratic_forms_matrix_euclidean(h1, h2):
    r"""
    Compute the bin-similarity matrix for the quadratic form distance measure.
    The matrix :math:`A` for two histograms :math:`H` and :math:`H'` of size :math:`m`
    and :math:`n` respectively is defined as

    .. math::

        A_{m,n} = 1 - \frac{d_2(H_m, {H'}_n)}{d_{max}}

    with

    .. math::

        d_{max} = \max_{m,n}d_2(H_m, {H'}_n)

    The implementation places the bins of ``h2`` along the rows and the bins of
    ``h1`` along the columns of the returned matrix. Expects one-dimensional
    numpy arrays.

    See also
    --------
    quadratic_forms
    """
    A = numpy.repeat(h2[:,numpy.newaxis], h1.size, 1) # repeat second array to form a matrix
    A = numpy.absolute(A - h1) # euclidean distances
    return 1 - (A / float(A.max()))

# //////////////// #
# Helper functions #
# //////////////// #

def __prepare_histogram(h1, h2):
    """
    Convert the histograms to numpy.ndarrays if required.

    Plain ndarrays are passed through without being copied; any other input
    (lists, tuples, ndarray subclasses, ...) is converted to a base ndarray.

    Returns the two arrays as a tuple ``(h1, h2)``.
    Raises a ValueError if their shapes differ.
    """
    h1 = numpy.asarray(h1)
    h2 = numpy.asarray(h2)
    # equal shapes imply equal sizes, a separate size comparison is not needed
    if h1.shape != h2.shape:
        raise ValueError('h1 and h2 must be of same shape and size')
    return h1, h2