Source code for medpy.metric.histogram

# Copyright (C) 2013 Oskar Maier
# 
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
# author Oskar Maier
# version r0.1.0
# since 2011-12-01
# status Release

# build-in modules
import math

# third-party modules
import scipy

# own modules

# code
# ////////////////////////////// #
# Bin-by-bin comparison measures #
# ////////////////////////////// #

[docs]def minowski(h1, h2, p = 2): # 46..45..14,11..43..44 / 45 us for p=int(-inf..-24..-1,1..24..inf) / float @array, +20 us @list \w 100 bins r""" Minowski distance. With :math:`p=2` equal to the Euclidean distance, with :math:`p=1` equal to the Manhattan distance, and the Chebyshev distance implementation represents the case of :math:`p=\pm inf`. The Minowksi distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_p(H, H') = \left(\sum_{m=1}^M|H_m - H'_m|^p \right)^{\frac{1}{p}} *Attributes:* - a real metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, \sqrt[p]{2}]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram. p : float The :math:`p` value in the Minowksi distance formula. Returns ------- minowski : float Minowski distance. Raises ------ ValueError If ``p`` is zero. """ h1, h2 = __prepare_histogram(h1, h2) if 0 == p: raise ValueError('p can not be zero') elif int == type(p): if p > 0 and p < 25: return __minowski_low_positive_integer_p(h1, h2, p) elif p < 0 and p > -25: return __minowski_low_negative_integer_p(h1, h2, p) return math.pow(scipy.sum(scipy.power(scipy.absolute(h1 - h2), p)), 1./p)
def __minowski_low_positive_integer_p(h1, h2, p = 2): # 11..43 us for p = 1..24 \w 100 bins """ A faster implementation of the Minowski distance for positive integer < 25. @note do not use this function directly, but the general @link minowski() method. @note the passed histograms must be scipy arrays. """ mult = scipy.absolute(h1 - h2) dif = mult for _ in range(p - 1): dif = scipy.multiply(dif, mult) return math.pow(scipy.sum(dif), 1./p) def __minowski_low_negative_integer_p(h1, h2, p = 2): # 14..46 us for p = -1..-24 \w 100 bins """ A faster implementation of the Minowski distance for negative integer > -25. @note do not use this function directly, but the general @link minowski() method. @note the passed histograms must be scipy arrays. """ mult = scipy.absolute(h1 - h2) dif = mult for _ in range(-p + 1): dif = scipy.multiply(dif, mult) return math.pow(scipy.sum(1./dif), 1./p)
[docs]def manhattan(h1, h2): # # 7 us @array, 31 us @list \w 100 bins r""" Equal to Minowski distance with :math:`p=1`. See also -------- minowski """ h1, h2 = __prepare_histogram(h1, h2) return scipy.sum(scipy.absolute(h1 - h2))
[docs]def euclidean(h1, h2): # 9 us @array, 33 us @list \w 100 bins r""" Equal to Minowski distance with :math:`p=2`. See also -------- minowski """ h1, h2 = __prepare_histogram(h1, h2) return math.sqrt(scipy.sum(scipy.square(scipy.absolute(h1 - h2))))
[docs]def chebyshev(h1, h2): # 12 us @array, 36 us @list \w 100 bins r""" Chebyshev distance. Also Tchebychev distance, Maximum or :math:`L_{\infty}` metric; equal to Minowski distance with :math:`p=+\infty`. For the case of :math:`p=-\infty`, use `chebyshev_neg`. The Chebyshev distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{\infty}(H, H') = \max_{m=1}^M|H_m-H'_m| *Attributes:* - semimetric (triangle equation satisfied?) *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram. Returns ------- chebyshev : float Chebyshev distance. See also -------- minowski, chebyshev_neg """ h1, h2 = __prepare_histogram(h1, h2) return max(scipy.absolute(h1 - h2))
[docs]def chebyshev_neg(h1, h2): # 12 us @array, 36 us @list \w 100 bins r""" Chebyshev negative distance. Also Tchebychev distance, Minimum or :math:`L_{-\infty}` metric; equal to Minowski distance with :math:`p=-\infty`. For the case of :math:`p=+\infty`, use `chebyshev`. The Chebyshev distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{-\infty}(H, H') = \min_{m=1}^M|H_m-H'_m| *Attributes:* - semimetric (triangle equation satisfied?) *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram. Returns ------- chebyshev_neg : float Chebyshev negative distance. See also -------- minowski, chebyshev """ h1, h2 = __prepare_histogram(h1, h2) return min(scipy.absolute(h1 - h2))
[docs]def histogram_intersection(h1, h2): # 6 us @array, 30 us @list \w 100 bins r""" Calculate the common part of two histograms. The histogram intersection between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{\cap}(H, H') = \sum_{m=1}^M\min(H_m, H'_m) *Attributes:* - a real metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 1` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- histogram_intersection : float Intersection between the two histograms. """ h1, h2 = __prepare_histogram(h1, h2) return scipy.sum(scipy.minimum(h1, h2))
[docs]def histogram_intersection_1(h1, h2): # 7 us @array, 31 us @list \w 100 bins r""" Turns the histogram intersection similarity into a distance measure for normalized, positive histograms. .. math:: d_{\bar{\cos}}(H, H') = 1 - d_{\cap}(H, H') See `histogram_intersection` for the definition of :math:`d_{\cap}(H, H')`. *Attributes:* - semimetric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- histogram_intersection : float Intersection between the two histograms. """ return 1. - histogram_intersection(h1, h2)
[docs]def relative_deviation(h1, h2): # 18 us @array, 42 us @list \w 100 bins r""" Calculate the deviation between two histograms. The relative deviation between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{rd}(H, H') = \frac{ \sqrt{\sum_{m=1}^M(H_m - H'_m)^2} }{ \frac{1}{2} \left( \sqrt{\sum_{m=1}^M H_m^2} + \sqrt{\sum_{m=1}^M {H'}_m^2} \right) } *Attributes:* - semimetric (triangle equation satisfied?) *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, \sqrt{2}]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, 2]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram, same bins as ``h1``. Returns ------- relative_deviation : float Relative deviation between the two histograms. """ h1, h2 = __prepare_histogram(h1, h2) numerator = math.sqrt(scipy.sum(scipy.square(h1 - h2))) denominator = (math.sqrt(scipy.sum(scipy.square(h1))) + math.sqrt(scipy.sum(scipy.square(h2)))) / 2. return numerator / denominator
[docs]def relative_bin_deviation(h1, h2): # 79 us @array, 104 us @list \w 100 bins r""" Calculate the bin-wise deviation between two histograms. The relative bin deviation between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{rbd}(H, H') = \sum_{m=1}^M \frac{ \sqrt{(H_m - H'_m)^2} }{ \frac{1}{2} \left( \sqrt{H_m^2} + \sqrt{{H'}_m^2} \right) } *Attributes:* - a real metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram, same bins as ``h1``. Returns ------- relative_bin_deviation : float Relative bin deviation between the two histograms. """ h1, h2 = __prepare_histogram(h1, h2) numerator = scipy.sqrt(scipy.square(h1 - h2)) denominator = (scipy.sqrt(scipy.square(h1)) + scipy.sqrt(scipy.square(h2))) / 2. old_err_state = scipy.seterr(invalid='ignore') # divide through zero only occurs when the bin is zero in both histograms, in which case the division is 0/0 and leads to (and should lead to) 0 result = numerator / denominator scipy.seterr(**old_err_state) result[scipy.isnan(result)] = 0 # faster than scipy.nan_to_num, which checks for +inf and -inf also return scipy.sum(result)
[docs]def chi_square(h1, h2): # 23 us @array, 49 us @list \w 100 r""" Chi-square distance. Measure how unlikely it is that one distribution (histogram) was drawn from the other. The Chi-square distance between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{\chi^2}(H, H') = \sum_{m=1}^M \frac{ (H_m - H'_m)^2 }{ H_m + H'_m } *Attributes:* - semimetric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 2]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram. Returns ------- chi_square : float Chi-square distance. """ h1, h2 = __prepare_histogram(h1, h2) old_err_state = scipy.seterr(invalid='ignore') # divide through zero only occurs when the bin is zero in both histograms, in which case the division is 0/0 and leads to (and should lead to) 0 result = scipy.square(h1 - h2) / (h1 + h2) scipy.seterr(**old_err_state) result[scipy.isnan(result)] = 0 # faster than scipy.nan_to_num, which checks for +inf and -inf also return scipy.sum(result)
[docs]def kullback_leibler(h1, h2): # 83 us @array, 109 us @list \w 100 bins r""" Kullback-Leibler divergence. Compute how inefficient it would to be code one histogram into another. Actually computes :math:`\frac{d_{KL}(h1, h2) + d_{KL}(h2, h1)}{2}` to achieve symmetry. The Kullback-Leibler divergence between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{KL}(H, H') = \sum_{m=1}^M H_m\log\frac{H_m}{H'_m} *Attributes:* - quasimetric (but made symetric) *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, where h1[i] > 0 for any i such that h2[i] > 0, normalized. h2 : sequence The second histogram, where h2[i] > 0 for any i such that h1[i] > 0, normalized, same bins as ``h1``. Returns ------- kullback_leibler : float Kullback-Leibler divergence. """ old_err_state = scipy.seterr(divide='raise') try: h1, h2 = __prepare_histogram(h1, h2) result = (__kullback_leibler(h1, h2) + __kullback_leibler(h2, h1)) / 2. scipy.seterr(**old_err_state) return result except FloatingPointError: scipy.seterr(**old_err_state) raise ValueError('h1 can only contain zero values where h2 also contains zero values and vice-versa')
def __kullback_leibler(h1, h2): # 36.3 us """ The actual KL implementation. @see kullback_leibler() for details. Expects the histograms to be of type scipy.ndarray. """ result = h1.astype(scipy.float_) mask = h1 != 0 result[mask] = scipy.multiply(h1[mask], scipy.log(h1[mask] / h2[mask])) return scipy.sum(result)
[docs]def jensen_shannon(h1, h2): # 85 us @array, 110 us @list \w 100 bins r""" Jensen-Shannon divergence. A symmetric and numerically more stable empirical extension of the Kullback-Leibler divergence. The Jensen Shannon divergence between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{JSD}(H, H') = \frac{1}{2} d_{KL}(H, H^*) + \frac{1}{2} d_{KL}(H', H^*) with :math:`H^*=\frac{1}{2}(H + H')`. *Attributes:* - semimetric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, \infty)` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram, same bins as ``h1``. Returns ------- jensen_shannon : float Jensen-Shannon divergence. """ h1, h2 = __prepare_histogram(h1, h2) s = (h1 + h2) / 2. return __kullback_leibler(h1, s) / 2. + __kullback_leibler(h2, s) / 2.
[docs]def fidelity_based(h1, h2): # 25 us @array, 51 us @list \w 100 bins r""" Fidelity based distance. Also Bhattacharyya distance; see also the extensions `noelle_1` to `noelle_5`. The metric between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{F}(H, H') = \sum_{m=1}^M\sqrt{H_m * H'_m} *Attributes:* - not a metric, a similarity *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 1` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- fidelity_based : float Fidelity based distance. Notes ----- The fidelity between two histograms :math:`H` and :math:`H'` is the same as the cosine between their square roots :math:`\sqrt{H}` and :math:`\sqrt{H'}`. """ h1, h2 = __prepare_histogram(h1, h2) result = scipy.sum(scipy.sqrt(h1 * h2)) result = 0 if 0 > result else result # for rounding errors result = 1 if 1 < result else result # for rounding errors return result
[docs]def noelle_1(h1, h2): # 26 us @array, 52 us @list \w 100 bins r""" Extension of `fidelity_based` proposed by [1]_. .. math:: d_{\bar{F}}(H, H') = 1 - d_{F}(H, H') See `fidelity_based` for the definition of :math:`d_{F}(H, H')`. *Attributes:* - semimetric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- fidelity_based : float Fidelity based distance. References ---------- .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003 """ return 1. - fidelity_based(h1, h2)
[docs]def noelle_2(h1, h2): # 26 us @array, 52 us @list \w 100 bins r""" Extension of `fidelity_based` proposed by [1]_. .. math:: d_{\sqrt{1-F}}(H, H') = \sqrt{1 - d_{F}(H, H')} See `fidelity_based` for the definition of :math:`d_{F}(H, H')`. *Attributes:* - metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- fidelity_based : float Fidelity based distance. References ---------- .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003 """ return math.sqrt(1. - fidelity_based(h1, h2))
[docs]def noelle_3(h1, h2): # 26 us @array, 52 us @list \w 100 bins r""" Extension of `fidelity_based` proposed by [1]_. .. math:: d_{\log(2-F)}(H, H') = \log(2 - d_{F}(H, H')) See `fidelity_based` for the definition of :math:`d_{F}(H, H')`. *Attributes:* - semimetric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, log(2)]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- fidelity_based : float Fidelity based distance. References ---------- .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003 """ return math.log(2 - fidelity_based(h1, h2))
[docs]def noelle_4(h1, h2): # 26 us @array, 52 us @list \w 100 bins r""" Extension of `fidelity_based` proposed by [1]_. .. math:: d_{\arccos F}(H, H') = \frac{2}{\pi} \arccos d_{F}(H, H') See `fidelity_based` for the definition of :math:`d_{F}(H, H')`. *Attributes:* - metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- fidelity_based : float Fidelity based distance. References ---------- .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003 """ return 2. / math.pi * math.acos(fidelity_based(h1, h2))
[docs]def noelle_5(h1, h2): # 26 us @array, 52 us @list \w 100 bins r""" Extension of `fidelity_based` proposed by [1]_. .. math:: d_{\sin F}(H, H') = \sqrt{1 -d_{F}^2(H, H')} See `fidelity_based` for the definition of :math:`d_{F}(H, H')`. *Attributes:* - metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- fidelity_based : float Fidelity based distance. References ---------- .. [1] M. Noelle "Distribution Distance Measures Applied to 3-D Object Recognition", 2003 """ return math.sqrt(1 - math.pow(fidelity_based(h1, h2), 2))
[docs]def cosine_alt(h1, h2): # 17 us @array, 42 us @list \w 100 bins r""" Alternative implementation of the `cosine` distance measure. Notes ----- Under development. """ h1, h2 = __prepare_histogram(h1, h2) return -1 * float(scipy.sum(h1 * h2)) / (scipy.sum(scipy.power(h1, 2)) * scipy.sum(scipy.power(h2, 2)))
[docs]def cosine(h1, h2): # 17 us @array, 42 us @list \w 100 bins r""" Cosine simmilarity. Compute the angle between the two histograms in vector space irrespective of their length. The cosine similarity between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{\cos}(H, H') = \cos\alpha = \frac{H * H'}{\|H\| \|H'\|} = \frac{\sum_{m=1}^M H_m*H'_m}{\sqrt{\sum_{m=1}^M H_m^2} * \sqrt{\sum_{m=1}^M {H'}_m^2}} *Attributes:* - not a metric, a similarity *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 1` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[-1, 1]` - :math:`d(H, H) = 1` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram, same bins as ``h1``. Returns ------- cosine : float Cosine simmilarity. Notes ----- The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity. """ h1, h2 = __prepare_histogram(h1, h2) return scipy.sum(h1 * h2) / math.sqrt(scipy.sum(scipy.square(h1)) * scipy.sum(scipy.square(h2)))
[docs]def cosine_1(h1, h2): # 18 us @array, 43 us @list \w 100 bins r""" Cosine simmilarity. Turns the cosine similarity into a distance measure for normalized, positive histograms. .. math:: d_{\bar{\cos}}(H, H') = 1 - d_{\cos}(H, H') See `cosine` for the definition of :math:`d_{\cos}(H, H')`. *Attributes:* - metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- cosine : float Cosine distance. """ return 1. - cosine(h1, h2)
[docs]def cosine_2(h1, h2): # 19 us @array, 44 us @list \w 100 bins r""" Cosine simmilarity. Turns the cosine similarity into a distance measure for normalized, positive histograms. .. math:: d_{\bar{\cos}}(H, H') = 1 - \frac{2*\arccos d_{\cos}(H, H')}{pi} See `cosine` for the definition of :math:`d_{\cos}(H, H')`. *Attributes:* - metric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - not applicable *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram, normalized. h2 : sequence The second histogram, normalized, same bins as ``h1``. Returns ------- cosine : float Cosine distance. """ return 1. - (2 * cosine(h1, h2)) / math.pi
[docs]def correlate(h1, h2): # 31 us @array, 55 us @list \w 100 bins r""" Correlation between two histograms. The histogram correlation between two histograms :math:`H` and :math:`H'` of size :math:`m` is defined as: .. math:: d_{corr}(H, H') = \frac{ \sum_{m=1}^M (H_m-\bar{H}) \cdot (H'_m-\bar{H'}) }{ \sqrt{\sum_{m=1}^M (H_m-\bar{H})^2 \cdot \sum_{m=1}^M (H'_m-\bar{H'})^2} } with :math:`\bar{H}` and :math:`\bar{H'}` being the mean values of :math:`H` resp. :math:`H'` *Attributes:* - not a metric, a similarity *Attributes for normalized histograms:* - :math:`d(H, H')\in[-1, 1]` - :math:`d(H, H) = 1` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[-1, 1]` - :math:`d(H, H) = 1` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram, same bins as ``h1``. Returns ------- correlate : float Correlation between the histograms. Notes ----- Returns 0 if one of h1 or h2 contain only zeros. """ h1, h2 = __prepare_histogram(h1, h2) h1m = h1 - scipy.sum(h1) / float(h1.size) h2m = h2 - scipy.sum(h2) / float(h2.size) a = scipy.sum(scipy.multiply(h1m, h2m)) b = math.sqrt(scipy.sum(scipy.square(h1m)) * scipy.sum(scipy.square(h2m))) return 0 if 0 == b else a / b
[docs]def correlate_1(h1, h2): # 32 us @array, 56 us @list \w 100 bins r""" Correlation distance. Turns the histogram correlation into a distance measure for normalized, positive histograms. .. math:: d_{\bar{corr}}(H, H') = 1-\frac{d_{corr}(H, H')}{2}. See `correlate` for the definition of :math:`d_{corr}(H, H')`. *Attributes:* - semimetric *Attributes for normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-normalized histograms:* - :math:`d(H, H')\in[0, 1]` - :math:`d(H, H) = 0` - :math:`d(H, H') = d(H', H)` *Attributes for not-equal histograms:* - not applicable Parameters ---------- h1 : sequence The first histogram. h2 : sequence The second histogram, same bins as ``h1``. Returns ------- correlate : float Correlation distnace between the histograms. Notes ----- Returns 0.5 if one of h1 or h2 contains only zeros. """ return (1. - correlate(h1, h2))/2.
# ///////////////////////////// # # Cross-bin comparison measures # # ///////////////////////////// #
[docs]def quadratic_forms(h1, h2): r""" Quadrativ forms metric. Notes ----- UNDER DEVELOPMENT This distance measure shows very strange behaviour. The expression transpose(h1-h2) * A * (h1-h2) yields egative values that can not be processed by the square root. Some examples:: h1 h2 transpose(h1-h2) * A * (h1-h2) [1, 0] to [0.0, 1.0] : -2.0 [1, 0] to [0.5, 0.5] : 0.0 [1, 0] to [0.6666666666666667, 0.3333333333333333] : 0.111111111111 [1, 0] to [0.75, 0.25] : 0.0833333333333 [1, 0] to [0.8, 0.2] : 0.06 [1, 0] to [0.8333333333333334, 0.16666666666666666] : 0.0444444444444 [1, 0] to [0.8571428571428572, 0.14285714285714285] : 0.0340136054422 [1, 0] to [0.875, 0.125] : 0.0267857142857 [1, 0] to [0.8888888888888888, 0.1111111111111111] : 0.0216049382716 [1, 0] to [0.9, 0.1] : 0.0177777777778 [1, 0] to [1, 0]: 0.0 It is clearly undesireable to recieve negative values and even worse to get a value of zero for other cases than the same histograms. """ h1, h2 = __prepare_histogram(h1, h2) A = __quadratic_forms_matrix_euclidean(h1, h2) return math.sqrt((h1-h2).dot(A.dot(h1-h2))) # transpose(h1-h2) * A * (h1-h2)
def __quadratic_forms_matrix_euclidean(h1, h2): r""" Compute the bin-similarity matrix for the quadratic form distance measure. The matric :math:`A` for two histograms :math:`H` and :math:`H'` of size :math:`m` and :math:`n` respectively is defined as .. math:: A_{m,n} = 1 - \frac{d_2(H_m, {H'}_n)}{d_{max}} with .. math:: d_{max} = \max_{m,n}d_2(H_m, {H'}_n) See also -------- quadratic_forms """ A = scipy.repeat(h2[:,scipy.newaxis], h1.size, 1) # repeat second array to form a matrix A = scipy.absolute(A - h1) # euclidean distances return 1 - (A / float(A.max())) # //////////////// # # Helper functions # # //////////////// # def __prepare_histogram(h1, h2): """Convert the histograms to scipy.ndarrays if required.""" h1 = h1 if scipy.ndarray == type(h1) else scipy.asarray(h1) h2 = h2 if scipy.ndarray == type(h2) else scipy.asarray(h2) if h1.shape != h2.shape or h1.size != h2.size: raise ValueError('h1 and h2 must be of same shape and size') return h1, h2