using System; using CenterSpace.NMath.Core; using System.IO; namespace CenterSpace.NMath.Examples.CSharp { /// <summary> /// A .NET example in C# showing how to compute a consensus matrix averaging different MNF clusterings. /// </summary> /// <remarks> /// A Nonnegative Matrix Factorization (NMF) is an approximate factorization /// of a positive matrix v into a product of two matrices w and h: /// v ~ wh /// This factorization can by used to group, or cluster, the columns of v /// (the columns of v are usually refered to as "samples" ). NMF uses an /// iterative algorithm with random starting values for w and h. This, coupled /// with the fact that the factorization is not unique, means that if you cluster /// the columns of v using an NMF cluster several different times, you may get several /// different clusterings. The NMF consensus matrix is a way to average /// the possibly different clusterings, and is computed using the following process: /// /// Cluster the columns of v using NMF n times. Each NMF clustering will yield /// a "connectivity matrix". The connectivity matrix is a symmetric matrix /// whose i, jth entry is 1 if columns i and j of v were clustered together, /// and 0 if they were not. The "consensus matrix" is also a symmetric matrix /// whose i, jth entry is formed by taking the average of the i, jth entries of /// the n connectivity matrices. /// /// It is clear that each i, jth entry of the consensus matrix has a value between 0 /// (columns i and j were not clustered together on any of the n runs) and 1 (columns /// i and j were clustered together on all n runs). Thus the i, jth entry of a /// consensus matrix may be considered, in some sense, a "probability" that columns /// i and j belong to the same cluster. /// A consensus matrix C may also used to perform a hierarchical clustering of the /// columns of v by using as the distance function: /// /// distance between columns i and j = 1.0 - C[i,j] /// /// This is demonstrated in the example below. /// </remarks> class NMFConsensusMatrixExample { static void Main( string[] args ) { // Read in some data.. DataFrame data = DataFrame.Load( "nmf_data.dat", true, true, "\t", true ); // Extract the data as a DoubleMatrix. DoubleMatrix v = data.ToDoubleMatrix(); // Set the order of the NMF (this is the number of columns in w, where // v ~ wh int k = 3; // Set the number of runs or connectivity matrices to use to form the // consensus matrix. int numberOfRuns = 70; // Construct a consensus matrix using the "divergence" update // algorithm. var consensusMatrix = new NMFConsensusMatrix<NMFDivergenceUpdate>( v, data.ColumnHeaders, k, numberOfRuns ); Console.WriteLine(); // Print out the number of runs in which the NMF algorithm actually converged to an answer, and the // resulting consensus matrix. Console.WriteLine( "{0} runs out of {1} converged.", consensusMatrix.NumberOfConvergedRuns, numberOfRuns ); Console.WriteLine(); Console.WriteLine( "Consensus Matrix:" ); Console.WriteLine( consensusMatrix.ToTabDelimited( "G3" ) ); // Lets look at the first column and for each successive column print out the // "probability" that they are clustered together (well use the column // names from the data frame instead of column numbers). string label = consensusMatrix.Labels[0]; Console.WriteLine(); for ( int j = 1; j < consensusMatrix.Order; j++ ) { Console.WriteLine( "The \"probability\" that {0} is clustered with {1} is {2}", label, consensusMatrix.Labels[j], consensusMatrix[0, j] ); } // Perform a hierarchical cluster analysis using the consensus matrix // to define the distance function as described in the class description // above. // The cluster analysis class wants to cluster the rows of a matrix. Since we // are essentially clustering a bunch of column numbers, well provide a matrix // with one column and n rows where n is the number of columns of v (and the // order of of the consensus matrix). The column will contain the numbers 0 // to n - 1 (basically, were just clustering the numbers 0,...,n - 1). var itemNumbers = new DoubleMatrix( consensusMatrix.Order, 1, 0, 1 ); // The distance function object holds the consensus matrix C and returns the distance // between i and j as 1.0 - C[i,j] var distanceFunctionObject = new ConsensusMatrixDistance( consensusMatrix ); var clusterAnalysisDist = new Distance.Function( distanceFunctionObject.CaDistance ); var ca = new ClusterAnalysis( itemNumbers, clusterAnalysisDist ); // Form three clusters using the cluster analysis cut tree function and print them out. ClusterSet clusters = ca.CutTree( 3 ); Console.WriteLine(); for ( int clusterNumber = 0; clusterNumber < clusters.NumberOfClusters; clusterNumber++ ) { int[] members = clusters.Cluster( clusterNumber ); Console.Write( "Cluster number {0} contains: ", clusterNumber ); for ( int i = 0; i < members.Length; i++ ) { Console.Write( "{0} ", consensusMatrix.Labels[members[i]] ); } Console.WriteLine(); } Console.WriteLine(); Console.WriteLine( "Press Enter Key" ); Console.Read(); } } public class ConsensusMatrixDistance { private ConnectivityMatrix consensusMatrix; public ConsensusMatrixDistance( ConnectivityMatrix conn ) { consensusMatrix = conn; } public double CaDistance( DoubleVector data1, DoubleVector data2 ) { int i = (int) data1[0]; int j = (int) data2[0]; return 1.0 - consensusMatrix[i, j]; } } }← All NMath Code Examples