You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
295 lines
12 KiB
C++
295 lines
12 KiB
C++
/*=========================================================================
|
|
|
|
Program: Visualization Toolkit
|
|
Module: vtkPCAStatistics.h
|
|
|
|
Copyright (c) Ken Martin, Will Schroeder, Bill Lorensen
|
|
All rights reserved.
|
|
See Copyright.txt or http://www.kitware.com/Copyright.htm for details.
|
|
|
|
This software is distributed WITHOUT ANY WARRANTY; without even
|
|
the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
|
PURPOSE. See the above copyright notice for more information.
|
|
|
|
=========================================================================*/
|
|
/*-------------------------------------------------------------------------
|
|
Copyright 2010 Sandia Corporation.
|
|
Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
|
the U.S. Government retains certain rights in this software.
|
|
-------------------------------------------------------------------------*/
|
|
/**
 * @class   vtkPCAStatistics
 * @brief   A class for multivariate principal component analysis
 *
 *
 * This class derives from the multi-correlative statistics algorithm and
 * uses the covariance matrix and Cholesky decomposition computed by it.
 * However, when it finalizes the statistics in the learn operation, the PCA class
 * computes the SVD of the covariance matrix in order to obtain its eigenvectors.
 *
 * In the assess operation, the input data are
 * - projected into the basis defined by the eigenvectors,
 * - the energy associated with each datum is computed,
 * - or some combination thereof.
 * Additionally, the user may specify some threshold energy or
 * eigenvector entry below which the basis is truncated. This allows
 * projection into a lower-dimensional state while minimizing (in a
 * least squares sense) the projection error.
 *
 * In the test operation, a Jarque-Bera-Srivastava test of n-d normality is performed.
 *
 * The Robust PCA can be computed by using the median instead of the mean,
 * and the MAD matrix (Median Absolute Deviation) instead of the covariance matrix.
 * This can be done by activating the MedianAbsoluteDeviation boolean (declared in
 * the superclass).
 *
 * @par Thanks:
 * Thanks to David Thompson, Philippe Pebay and Jackson Mayo from
 * Sandia National Laboratories for implementing this class.
 * Updated by Philippe Pebay, Kitware SAS 2012
 * Updated by Tristan Coulange and Joachim Pouderoux, Kitware SAS 2013
 */
|
|
|
|
#ifndef vtkPCAStatistics_h
|
|
#define vtkPCAStatistics_h
|
|
|
|
#include "vtkFiltersStatisticsModule.h" // For export macro
|
|
#include "vtkMultiCorrelativeStatistics.h"
|
|
|
|
class vtkDoubleArray;
|
|
class vtkIdTypeArray;
|
|
|
|
class VTKFILTERSSTATISTICS_EXPORT vtkPCAStatistics : public vtkMultiCorrelativeStatistics
{
public:
  // Standard VTK type-information macro, print method and factory method declarations.
  vtkTypeMacro(vtkPCAStatistics, vtkMultiCorrelativeStatistics);
  void PrintSelf(ostream& os, vtkIndent indent) override;
  static vtkPCAStatistics* New();

  /**
   * Methods by which the covariance matrix may be normalized.
   */
  enum NormalizationType
  {
    NONE,               //!< The covariance matrix should be used as computed.
    TRIANGLE_SPECIFIED, //!< Normalize cov(i,j) by V(i,j) where V is supplied by the user.
    DIAGONAL_SPECIFIED, //!< Normalize cov(i,j) by sqrt(V(i)*V(j)) where V is supplied by the user.
    DIAGONAL_VARIANCE,  //!< Normalize cov(i,j) by sqrt(cov(i,i)*cov(j,j)).
    NUM_NORMALIZATION_SCHEMES //!< The number of normalization schemes.
  };

  /**
   * These are the enumeration values that SetBasisScheme() accepts and GetBasisScheme() returns.
   */
  enum ProjectionType
  {
    FULL_BASIS,         //!< Use all entries in the basis matrix
    FIXED_BASIS_SIZE,   //!< Use the first N entries in the basis matrix
    FIXED_BASIS_ENERGY, //!< Use consecutive basis matrix entries whose energies sum to at least T
    NUM_BASIS_SCHEMES   //!< The number of schemes (not a valid scheme).
  };

  //@{
  /**
   * This determines how (or if) the covariance matrix \a cov is normalized before PCA.
   *
   * When set to NONE, no normalization is performed. This is the default.
   *
   * When set to TRIANGLE_SPECIFIED, each entry cov(i,j) is divided by V(i,j).
   * The list V of normalization factors must be set using the SetNormalization method
   * before the filter is executed.
   *
   * When set to DIAGONAL_SPECIFIED, each entry cov(i,j) is divided by sqrt(V(i)*V(j)).
   * The list V of normalization factors must be set using the SetNormalization method
   * before the filter is executed.
   *
   * When set to DIAGONAL_VARIANCE, each entry cov(i,j) is divided by sqrt(cov(i,i)*cov(j,j)).
   * <b>Warning</b>: Although this is accepted practice in some fields,
   * some people think you should not turn this option on unless there
   * is a good physically-based reason for doing so. Much better instead
   * to determine how component magnitudes should be compared using
   * physical reasoning and use DIAGONAL_SPECIFIED, TRIANGLE_SPECIFIED, or
   * perform some pre-processing to shift and scale input data columns
   * appropriately than to expect magical results from a shady normalization hack.
   */
  vtkSetMacro(NormalizationScheme, int);
  vtkGetMacro(NormalizationScheme, int);
  virtual void SetNormalizationSchemeByName(const char* sname);
  virtual const char* GetNormalizationSchemeName(int scheme);
  //@}

  //@{
  /**
   * These methods allow you to set/get values used to normalize the covariance matrix before PCA.
   * The normalization values apply to all requests, so you do not specify a single
   * vector but a 3-column table.
   *
   * The first two columns contain the names of columns from input 0 and the third column contains
   * the value to normalize the corresponding entry in the covariance matrix.
   * The table must always have 3 columns even when the NormalizationScheme is DIAGONAL_SPECIFIED.
   * When only diagonal entries are to be used, only table rows where the first two columns are
   * identical to one another will be employed.
   * If there are multiple rows specifying different values for the same pair of columns,
   * the entry nearest the bottom of the table takes precedence.
   *
   * These functions are actually convenience methods that set/get input port 3
   * (the fourth input) of the filter.
   * Because the table is a filter input, you may use other filters to produce a table of
   * normalizations and have the pipeline take care of updates.
   *
   * Any missing entries will be set to 1.0 and a warning issued.
   * An error will occur if input port 3 of the filter is not set and the
   * NormalizationScheme is DIAGONAL_SPECIFIED or TRIANGLE_SPECIFIED.
   *
   * NOTE: SetSpecifiedNormalization( table ) is equivalent to SetInputData(3, table)
   * and therefore does not make a pipeline connection.
   */
  virtual vtkTable* GetSpecifiedNormalization();
  virtual void SetSpecifiedNormalization(vtkTable*);
  //@}

  //@{
  /**
   * Get the eigenvalues. The eigenvalues are ordered from largest to smallest.
   * This function:
   * void GetEigenvalues(int request, vtkDoubleArray*);
   * does all of the work. The other functions simply call this function with the appropriate
   * parameters. These functions are not valid unless Update() has been called and the Derive
   * option is turned on.
   */
  void GetEigenvalues(int request, vtkDoubleArray*);
  void GetEigenvalues(vtkDoubleArray*);
  double GetEigenvalue(int request, int i);
  double GetEigenvalue(int i);
  //@}

  //@{
  /**
   * Get the eigenvectors. The eigenvectors are ordered according to the magnitude of their
   * associated eigenvalues, sorted from largest to smallest. That is, eigenvector 0 corresponds
   * to the largest eigenvalue.
   * This function:
   * void GetEigenvectors(int request, vtkDoubleArray* eigenvectors)
   * does all of the work. The other functions are convenience functions that call this function
   * with default arguments. These functions are not valid unless Update() has been called and the
   * Derive option is turned on.
   */
  void GetEigenvectors(int request, vtkDoubleArray* eigenvectors);
  void GetEigenvectors(vtkDoubleArray* eigenvectors);
  void GetEigenvector(int i, vtkDoubleArray* eigenvector);
  void GetEigenvector(int request, int i, vtkDoubleArray* eigenvector);
  //@}

  //@{
  /**
   * This variable controls the dimensionality of output tuples in Assess operation.
   * Consider the case where you have requested a PCA on D columns.
   *
   * When set to vtkPCAStatistics::FULL_BASIS, the entire set of basis vectors
   * is used to derive new coordinates for each tuple being assessed.
   * In this mode, you are guaranteed to have output tuples of the same dimension
   * as the input tuples. (That dimension is D, so there will be D additional
   * columns added to the table for the request.)
   *
   * When set to vtkPCAStatistics::FIXED_BASIS_SIZE, only the first N basis vectors
   * are used to derive new coordinates for each tuple being assessed.
   * In this mode, you are guaranteed to have output tuples of dimension min(N,D).
   * You must set N prior to assessing data using the SetFixedBasisSize() method.
   * When N < D, this turns the PCA into a projection (instead of change of basis).
   *
   * When set to vtkPCAStatistics::FIXED_BASIS_ENERGY, the number of basis vectors
   * used to derive new coordinates for each tuple will be the minimum number
   * of columns N that satisfy
   * \f[
   * \frac{\sum_{i=1}^{N} \lambda_i}{\sum_{i=1}^{D} \lambda_i} \geq T
   * \f]
   * (The inequality follows the "at least T" wording of ProjectionType and the
   * "minimum energy" wording of FixedBasisEnergy; confirm the exact comparison
   * against the implementation.)
   * You must set T prior to assessing data using the SetFixedBasisEnergy() method.
   * When T < 1, this turns the PCA into a projection (instead of change of basis).
   *
   * By default BasisScheme is set to vtkPCAStatistics::FULL_BASIS.
   */
  vtkSetMacro(BasisScheme, int);
  vtkGetMacro(BasisScheme, int);
  virtual const char* GetBasisSchemeName(int schemeIndex);
  virtual void SetBasisSchemeByName(const char* schemeName);
  //@}

  //@{
  /**
   * The number of basis vectors to use. See SetBasisScheme() for more information.
   * When FixedBasisSize <= 0 (the default), the fixed basis size scheme is equivalent to the full
   * basis scheme.
   */
  vtkSetMacro(FixedBasisSize, int);
  vtkGetMacro(FixedBasisSize, int);
  //@}

  //@{
  /**
   * The minimum energy the new basis should use, as a fraction. See SetBasisScheme() for more
   * information. When FixedBasisEnergy >= 1 (the default), the fixed basis energy scheme is
   * equivalent to the full basis scheme.
   * The setter clamps the supplied value to the range [0, 1].
   */
  vtkSetClampMacro(FixedBasisEnergy, double, 0., 1.);
  vtkGetMacro(FixedBasisEnergy, double);
  //@}

  /**
   * A convenience method (in particular for access from other applications) to
   * set parameter values.
   * Return true if setting of requested parameter name was executed, false otherwise.
   */
  bool SetParameter(const char* parameter, int index, vtkVariant value) override;

protected:
  vtkPCAStatistics();
  ~vtkPCAStatistics() override;

  /**
   * This algorithm accepts a vtkTable containing normalization values for
   * its fourth input (port 3).
   * We override FillInputPortInformation to indicate this.
   */
  int FillInputPortInformation(int port, vtkInformation* info) override;

  /**
   * Execute the calculations required by the Derive option.
   */
  void Derive(vtkMultiBlockDataSet*) override;

  /**
   * Execute the calculations required by the Test option.
   */
  void Test(vtkTable*, vtkMultiBlockDataSet*, vtkTable*) override;

  /**
   * Execute the calculations required by the Assess option.
   */
  void Assess(vtkTable*, vtkMultiBlockDataSet*, vtkTable*) override;

  /**
   * Calculate p-value. This will be overridden using the object factory with an
   * R implementation if R is present.
   */
  virtual vtkDoubleArray* CalculatePValues(vtkIdTypeArray*, vtkDoubleArray*);

  /**
   * Provide the appropriate assessment functor.
   */
  void SelectAssessFunctor(vtkTable* inData, vtkDataObject* inMeta, vtkStringArray* rowNames,
    AssessFunctor*& dfunc) override;

  // Backing storage for the NormalizationScheme, BasisScheme, FixedBasisSize and
  // FixedBasisEnergy properties exposed through the Set/Get macros above.
  int NormalizationScheme;
  int BasisScheme;
  int FixedBasisSize;
  double FixedBasisEnergy;

  // Name tables for the two enumerations, each holding one entry more than the
  // number of valid schemes. Presumably consulted by the Get*SchemeName() /
  // Set*SchemeByName() methods -- confirm in the implementation file.
  static const char* BasisSchemeEnumNames[NUM_BASIS_SCHEMES + 1];
  static const char* NormalizationSchemeEnumNames[NUM_NORMALIZATION_SCHEMES + 1];

private:
  // Copying is disabled.
  vtkPCAStatistics(const vtkPCAStatistics&) = delete;
  void operator=(const vtkPCAStatistics&) = delete;
};
|
|
|
|
#endif // vtkPCAStatistics_h
|