Source code for megaman.geometry.rmetric

"""Riemannian Metric learning utilities and algorithms.

   To use the "geometric" Laplacian from geometry.py for statistically
   consistent results.

"""

# Author: Marina Meila <mmp@stat.washington.edu>
#         after the Matlab function rmetric.m by Dominique Perrault-Joncas
# LICENSE: Simplified BSD https://github.com/mmp2/megaman/blob/master/LICENSE

import warnings
import numpy as np

from scipy import sparse
from numpy.linalg import eigh, svd, inv


[docs]def riemann_metric( Y, laplacian=None, n_dim=None, invert_h=False, mode_inv = 'svd'):
    """
    Parameters
    ----------
    Y: array-like, shape = (n_samples, mdimY )
        The embedding coordinates of the points
    laplacian: array-like, shape = (n_samples, n_samples)
        The Laplacian of the data. It is recommended to use the "geometric"
        Laplacian (default) option from geometry.graph_laplacian()
    n_dim : integer, optional
        Use only the first n_dim <= mdimY dimensions.All dimensions
        n_dim:mdimY are ignored.
    invert_h: boolean, optional
        if False, only the "dual Riemannian metric" is computed
        if True, the dual metric matrices are inverted to obtain the
        Riemannian metric G.
    mode_inv: string, optional
       How to compute the inverses of h_dual_metric, if invert_h
        "inv", use numpy.inv()
        "svd" (default), use numpy.linalg.svd(), then invert the eigenvalues
        (possibly a more numerically stable method with H is symmetric and
        ill conditioned)

    Returns
    -------
    h_dual_metric : array, shape=(n_samples, n_dim, n_dim)
    Optionally :
    g_riemann_metric : array, shape=(n_samples, n_dim, n_dim )
    Hvv : singular vectors of H, transposed, shape = ( n_samples, n_dim, n_dim )
    Hsvals : singular values of H, shape = ( n_samples, n_dim )
    Gsvals : singular values of G, shape = ( n_samples, n_dim )

    Notes
    -----
    References
    ----------
    "Non-linear dimensionality reduction: Riemannian metric estimation and
    the problem of geometric discovery",
    Dominique Perraul-Joncas, Marina Meila, arXiv:1305.7255
    """
    n_samples = laplacian.shape[0]
    h_dual_metric = np.zeros((n_samples, n_dim, n_dim ))
    for i in np.arange(n_dim ):
        for j in np.arange(i,n_dim ):
            yij = Y[:,i]*Y[:,j]
            h_dual_metric[ :, i, j] = 0.5*(laplacian.dot(yij)-Y[:,j]*laplacian.dot(Y[:,i])-Y[:,i]*laplacian.dot(Y[:,j]))
    for j in np.arange(n_dim-1):
        for i in np.arange(j+1,n_dim):
            h_dual_metric[ :,i,j] = h_dual_metric[:,j,i]

    # compute rmetric if requested
    if( invert_h ):
        riemann_metric, Hvv, Hsvals, Gsvals = compute_G_from_H( h_dual_metric )
    else:
        riemann_metric = Hvv = Hsvals = Gsvals = None

    return h_dual_metric, riemann_metric, Hvv, Hsvals, Gsvals

[docs]def riemann_metric_lazy( Y, sample,laplacian, n_dim, invert_h=False, mode_inv = 'svd'):
    """
    Parameters
    ----------
    Y: array-like, shape = (n_samples, mdimY )
        The embedding coordinates of the points
    sample : array-like, shape (n_samples)
        array of indices over which to calculate the riemannian metric.
    laplacian: array-like, shape = (n_samples, n_samples)
        The Laplacian of the data. It is recommended to use the "geometric"
        Laplacian (default) option from geometry.graph_laplacian()
    n_dim : integer, optional
        Use only the first n_dim <= mdimY dimensions.All dimensions
        n_dim:mdimY are ignored.
    invert_h: boolean, optional
        if False, only the "dual Riemannian metric" is computed
        if True, the dual metric matrices are inverted to obtain the
        Riemannian metric G.
    mode_inv: string, optional
       How to compute the inverses of h_dual_metric, if invert_h
        "inv", use numpy.inv()
        "svd" (default), use numpy.linalg.svd(), then invert the eigenvalues
        (possibly a more numerically stable method with H is symmetric and
        ill conditioned)

    Returns
    -------
    h_dual_metric : array, shape=(n_samples, n_dim, n_dim)
    Optionally :
    g_riemann_metric : array, shape=(n_samples, n_dim, n_dim )
    Hvv : singular vectors of H, transposed, shape = ( n_samples, n_dim, n_dim )
    Hsvals : singular values of H, shape = ( n_samples, n_dim )
    Gsvals : singular values of G, shape = ( n_samples, n_dim )

    Notes
    -----
    References
    ----------
    "Non-linear dimensionality reduction: Riemannian metric estimation and
    the problem of geometric discovery",
    Dominique Perraul-Joncas, Marina Meila, arXiv:1305.7255
    """
    n_samples = laplacian.shape[0]
    laplacian = laplacian[sample,:]
    h_dual_metric = np.zeros((len(sample), n_dim, n_dim))
    for i in np.arange(n_dim ):
        for j in np.arange(i,n_dim ):
            yij = Y[:,i]*Y[:,j]
            h_dual_metric[ :, i, j] = 0.5*(laplacian.dot(yij)-Y[sample,j]*laplacian.dot(Y[:,i])-Y[sample,i]*laplacian.dot(Y[:,j]))
    for j in np.arange(n_dim-1):
        for i in np.arange(j+1,n_dim):
            h_dual_metric[ :,i,j] = h_dual_metric[:,j,i]
 
    if( invert_h ):
        riemann_metric, Hvv, Hsvals, Gsvals = compute_G_from_H( h_dual_metric )
    else:
        riemann_metric = Hvv = Hsvals = Gsvals = None
   
    return h_dual_metric,riemann_metric, Hvv, Hsvals, Gsvals

[docs]def compute_G_from_H( H, mdimG = None, mode_inv = "svd" ):
    """
    Parameters
    ----------
    H : the inverse R. Metric
    if mode_inv == 'svd':
       also returns Hvv, Hsvals, Gsvals the (transposed) eigenvectors of
       H and the singular values of H and G
    if mdimG < H.shape[2]:
       G.shape = [ n_samples, mdimG, mdimG ] with n_samples = H.shape[0]

    Notes
    ------
    currently Hvv, Hsvals are n_dim = H.shape[2], and Gsvals, G are mdimG
    (This contradicts the documentation of riemann_metric which states
    that riemann_metric and h_dual_metric have the same dimensions)

    See the notes in RiemannMetric
    """
    n_samples = H.shape[0]
    n_dim = H.shape[2]
    if mode_inv is 'svd':
        Huu, Hsvals, Hvv = np.linalg.svd( H )
        if mdimG is None:
            Gsvals = 1./Hsvals
            G = np.zeros((n_samples, n_dim, n_dim))
            for i in np.arange(n_samples):
                G[i,:,:] = np.dot(Huu[i,:,:], np.dot( np.diag(Gsvals[i,:]), Hvv[i,:,:]))
        elif mdimG < n_dim:
            Gsvals[:,:mdimG] = 1./Hsvals[:,:mdimG]
            Gsvals[:,mdimG:] = 0.
            # this can be redone with np.einsum() but it's barbaric
            G = np.zeros((n_samples, mdimG, mdimG))
            for i in np.arange(n_samples):
                G[i,:,:mdimG] = np.dot(Huu[i,:,mdimG], np.dot( np.diag(Gsvals[i,:mdimG]), Hvv[i,:,:mdimG]))
        else:
            raise ValueError('mdimG must be <= H.shape[1]')
        return G, Hvv, Hsvals, Gsvals
    else:
        riemann_metric = np.linalg.inv(h_dual_metric)
        return riemann_metric, None, None, None


[docs]class RiemannMetric(object):
    """
    RiemannMetric computes and stores the Riemannian metric and its dual
    associated with an embedding Y. The Riemannian metric is currently denoted
    by G, its dual by H, and the Laplacian by L. G at each point is the
    matrix inverse of H.

    For performance, the following choices have been made:
    * the class makes no defensive copies of L, Y
    * no defensive copies of the array attributes H, G, Hvv, ....
    * G is computed on request only
    In the future, this class will be extended to compute H only once,
    for mdimY dimensions, but to store multiple G's, with different dimensions.

    In the near future plans is also a "lazy" implementation, which will
    compute G (and maybe even H) only at the requested points.

    Parameters
    -----------
    Y : embedding coordinates, shape = (n, mdimY)
    laplacian : estimated laplacian from data  shape = (n, n)
    n_dim : the manifold domension
    mod_inv : if mode_inv = svd, also returns Hvv, Hsvals,
        Gsvals the (transposed) eigenvectors of
        H and the singular values of H and G

    Returns
    ----------
    mdimG : dimension of G, H
    mdimY : dimension of Y
    H : dual Riemann metric, shape = (n, mdimY, mdimY)
    G : Riemann metric, shape = (n, mdimG, mdimG)
    Hvv : (transposed) singular vectors of H, shape = (n, mdimY, mdimY)
    Hsvals : singular values of H, shape = (n, mdimY)
    Gsvals : singular values of G, shape = (n, mdimG)
    detG : determinants of G, shape = (n,1)

    Notes
    -----
    H is always computed at full dimension self.mdimY
    G is computed at mdimG (some contradictions persist here)

    References
    ----------
    "Non-linear dimensionality reduction: Riemannian metric estimation and
    the problem of geometric discovery",
    Dominique Perraul-Joncas, Marina Meila, arXiv:1305.7255

    """
    def __init__(self, Y, laplacian, n_dim=None, mode_inv='svd'):
        # input data
        self.Y = Y
        self.n, self.mdimY = Y.shape
        self.L = laplacian

        # input params
        if n_dim is None:
            self.mdimG = self.mdimY
        else:
            if n_dim > self.mdimY:
                raise ValueError('n_dim must be <= Y.shape[1]')
            self.mdimG = n_dim    # dimension of the riemann_metric computed
        self.mode_inv = mode_inv
        if self.mode_inv not in set(('svd', 'inv')):
            raise ValueError(("%s is not a valid value. Expected "
                              "'svd', 'inv'") % self.mode_inv)

        # results and outputs
        self.H = None
        self.G = None
        self.Hvv = None
        self.Hsvals = None
        self.Gsvals = None
        self.detG = None

[docs]    def get_dual_rmetric( self, invert_h = False, mode_inv = 'svd' ):
        """
        Compute the dual Riemannian Metric
        This is not satisfactory, because if mdimG<mdimY the shape of H
        will not be the same as the shape of G. TODO(maybe): return a (copied)
        smaller H with only the rows and columns in G.
        """
        if self.H is None:
            self.H, self.G, self.Hvv, self.Hsvals, self.Gsvals = riemann_metric(self.Y, self.L, self.mdimG, invert_h = invert_h, mode_inv = mode_inv)
        if invert_h:
            return self.H, self.G
        else:
            return self.H

[docs]    def get_rmetric( self, mode_inv = 'svd', return_svd = False ):
        """
        Compute the Reimannian Metric
        """
        if self.H is None:
            self.H, self.G, self.Hvv, self.Hsval = riemann_metric(self.Y, self.L, self.mdimG, invert_h = True, mode_inv = mode_inv)
        if self.G is None:
            self.G, self.Hvv, self.Hsvals,  self.Gsvals = compute_G_from_H( self.H, mode_inv = self.mode_inv )
        if mode_inv is 'svd' and return_svd:
            return self.G, self.Hvv, self.Hsvals, self.Gsvals
        else:
            return self.G

    def get_mdimG(self):
        return self.mdimG

    def get_detG(self):
        if self.G is None:
            self.H, self.G, self.Hvv, self.Hsval = riemann_metric(self.Y, self.L, self.mdimG, invert_h = True, mode_inv = self.mode_inv)
        if self.detG is None:
            if self.mdimG == self.mdimY:
                self.detG = 1./np.linalg.det(H)
            else:
                self.detG = 1./np.linalg.det(H[:,:mdimG,:mdimG])
            # could also be prod eigenvalues
            # inefficient? shall I do it on G?