1 /*********************************/
2 /* Principal Components Analysis */
3 /*********************************/
5 /*********************************************************************/
6 /* Principal Components Analysis or the Karhunen-Loeve expansion is a
7 classical method for dimensionality reduction or exploratory data
8 analysis. One reference among many is: F. Murtagh and A. Heck,
9 Multivariate Data Analysis, Kluwer Academic, Dordrecht, 1987.
13 Phone: + 49 89 32006298 (work)
15 Earn/Bitnet: fionn@dgaeso51, fim@dgaipp1s, murtagh@stsci
17 Internet: murtagh@scivax.stsci.edu
19 F. Murtagh, Munich, 6 June 1989 */
20 /*********************************************************************/
28 #define SIGN(a, b) ( (b) < 0 ? -fabs(a) : fabs(a) )
30 /** Variance-covariance matrix: creation *****************************/
32 /* Create m * m covariance matrix from given n * m data matrix. */
33 void covcol(double** data, int n, int m, double** symmat)
38 /* Allocate storage for mean vector */
40 mean = (double*) malloc(m*sizeof(double));
42 /* Determine mean of column vectors of input data matrix */
44 for (j = 0; j < m; j++)
47 for (i = 0; i < n; i++)
49 mean[j] += data[i][j];
55 printf("\nMeans of column vectors:\n");
56 for (j = 0; j < m; j++) {
57 printf("%12.1f",mean[j]); } printf("\n");
60 /* Center the column vectors. */
62 for (i = 0; i < n; i++)
64 for (j = 0; j < m; j++)
66 data[i][j] -= mean[j];
70 /* Calculate the m * m covariance matrix. */
71 for (j1 = 0; j1 < m; j1++)
73 for (j2 = j1; j2 < m; j2++)
76 for (i = 0; i < n; i++)
78 symmat[j1][j2] += data[i][j1] * data[i][j2];
80 symmat[j2][j1] = symmat[j1][j2];
90 /** Error handler **************************************************/
92 void erhand(char* err_msg)
94 fprintf(stderr,"Run-time error:\n");
95 fprintf(stderr,"%s\n", err_msg);
96 fprintf(stderr,"Exiting to system.\n");
101 /** Reduce a real, symmetric matrix to a symmetric, tridiag. matrix. */
103 /* Householder reduction of matrix a to tridiagonal form.
104 Algorithm: Martin et al., Num. Math. 11, 181-195, 1968.
105 Ref: Smith et al., Matrix Eigensystem Routines -- EISPACK Guide
106 Springer-Verlag, 1976, pp. 489-494.
107 W H Press et al., Numerical Recipes in C, Cambridge U P,
108 1988, pp. 373-374. */
109 void tred2(double** a, int n, double* d, double* e)
112 double scale, hh, h, g, f;
114 for (i = n-1; i >= 1; i--)
120 for (k = 0; k <= l; k++)
121 scale += fabs(a[i][k]);
126 for (k = 0; k <= l; k++)
129 h += a[i][k] * a[i][k];
132 g = f>0 ? -sqrt(h) : sqrt(h);
137 for (j = 0; j <= l; j++)
141 for (k = 0; k <= j; k++)
142 g += a[j][k] * a[i][k];
143 for (k = j+1; k <= l; k++)
144 g += a[k][j] * a[i][k];
149 for (j = 0; j <= l; j++)
152 e[j] = g = e[j] - hh * f;
153 for (k = 0; k <= j; k++)
154 a[j][k] -= (f * e[k] + g * a[i][k]);
164 for (i = 0; i < n; i++)
169 for (j = 0; j <= l; j++)
172 for (k = 0; k <= l; k++)
173 g += a[i][k] * a[k][j];
174 for (k = 0; k <= l; k++)
175 a[k][j] -= g * a[k][i];
180 for (j = 0; j <= l; j++)
181 a[j][i] = a[i][j] = 0.0;
185 /** Tridiagonal QL algorithm -- Implicit **********************/
187 void tqli(double* d, double* e, int n, double** z)
189 int m, l, iter, i, k;
190 double s, r, p, g, f, dd, c, b;
192 for (i = 1; i < n; i++)
195 for (l = 0; l < n; l++)
200 for (m = l; m < n-1; m++)
202 dd = fabs(d[m]) + fabs(d[m+1]);
203 if (fabs(e[m]) + dd == dd) break;
207 if (iter++ == 30) erhand("No convergence in TLQI.");
208 g = (d[l+1] - d[l]) / (2.0 * e[l]);
209 r = sqrt((g * g) + 1.0);
210 g = d[m] - d[l] + e[l] / (g + SIGN(r, g));
213 for (i = m-1; i >= l; i--)
217 if (fabs(f) >= fabs(g))
220 r = sqrt((c * c) + 1.0);
227 r = sqrt((s * s) + 1.0);
232 r = (d[i] - g) * s + 2.0 * c * b;
236 for (k = 0; k < n; k++)
239 z[k][i+1] = s * z[k][i] + c * f;
240 z[k][i] = c * z[k][i] - s * f;
251 /* In place projection onto basis vectors */
252 void pca_project(double** data, int n, int m, int ncomponents)
255 double **symmat, **symmat2, *evals, *interm;
257 //TODO: assert ncomponents < m
259 symmat = (double**) malloc(m*sizeof(double*));
260 for (i = 0; i < m; i++)
261 symmat[i] = (double*) malloc(m*sizeof(double));
263 covcol(data, n, m, symmat);
265 /*********************************************************************
267 **********************************************************************/
269 /* Allocate storage for dummy and new vectors. */
270 evals = (double*) malloc(m*sizeof(double)); /* Storage alloc. for vector of eigenvalues */
271 interm = (double*) malloc(m*sizeof(double)); /* Storage alloc. for 'intermediate' vector */
272 //MALLOC_ARRAY(symmat2,m,m,double);
273 //for (i = 0; i < m; i++) {
274 // for (j = 0; j < m; j++) {
275 // symmat2[i][j] = symmat[i][j]; /* Needed below for col. projections */
278 tred2(symmat, m, evals, interm); /* Triangular decomposition */
279 tqli(evals, interm, m, symmat); /* Reduction of sym. trid. matrix */
280 /* evals now contains the eigenvalues,
281 columns of symmat now contain the associated eigenvectors. */
284 printf("\nEigenvalues:\n");
285 for (j = m-1; j >= 0; j--) {
286 printf("%18.5f\n", evals[j]); }
287 printf("\n(Eigenvalues should be strictly positive; limited\n");
288 printf("precision machine arithmetic may affect this.\n");
289 printf("Eigenvalues are often expressed as cumulative\n");
290 printf("percentages, representing the 'percentage variance\n");
291 printf("explained' by the associated axis or principal component.)\n");
293 printf("\nEigenvectors:\n");
294 printf("(First three; their definition in terms of original vbes.)\n");
295 for (j = 0; j < m; j++) {
296 for (i = 1; i <= 3; i++) {
297 printf("%12.4f", symmat[j][m-i]); }
301 /* Form projections of row-points on prin. components. */
302 /* Store in 'data', overwriting original data. */
303 for (i = 0; i < n; i++) {
304 for (j = 0; j < m; j++) {
305 interm[j] = data[i][j]; } /* data[i][j] will be overwritten */
306 for (k = 0; k < ncomponents; k++) {
308 for (k2 = 0; k2 < m; k2++) {
309 data[i][k] += interm[k2] * symmat[k2][m-k-1]; }
314 printf("\nProjections of row-points on first 3 prin. comps.:\n");
315 for (i = 0; i < n; i++) {
316 for (j = 0; j < 3; j++) {
317 printf("%12.4f", data[i][j]); }
321 /* Form projections of col.-points on first three prin. components. */
322 /* Store in 'symmat2', overwriting what was stored in this. */
323 //for (j = 0; j < m; j++) {
324 // for (k = 0; k < m; k++) {
325 // interm[k] = symmat2[j][k]; } /*symmat2[j][k] will be overwritten*/
326 // for (i = 0; i < 3; i++) {
327 // symmat2[j][i] = 0.0;
328 // for (k2 = 0; k2 < m; k2++) {
329 // symmat2[j][i] += interm[k2] * symmat[k2][m-i-1]; }
330 // if (evals[m-i-1] > 0.0005) /* Guard against zero eigenvalue */
331 // symmat2[j][i] /= sqrt(evals[m-i-1]); /* Rescale */
333 // symmat2[j][i] = 0.0; /* Standard kludge */
338 printf("\nProjections of column-points on first 3 prin. comps.:\n");
339 for (j = 0; j < m; j++) {
340 for (k = 0; k < 3; k++) {
341 printf("%12.4f", symmat2[j][k]); }
346 for (i = 0; i < m; i++)
349 //FREE_ARRAY(symmat2,m);