SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddscorrelate.c
Go to the documentation of this file.
1/**
2 * @file sddscorrelate.c
3 * @brief Computes and evaluates correlations among columns of data in SDDS files.
4 *
5 * @details
6 * This program reads an input SDDS file, calculates the correlation coefficients
7 * between specified or all numeric columns, and outputs the results to a new SDDS file.
8 * It supports options for rank-order correlation, standard deviation-based outlier removal,
9 * and column/row major data ordering.
10 *
11 * @section Usage
12 * ```
13 * sddscorrelate [<inputfile>] [<outputfile>]
14 * [-pipe=[input][,output]]
15 * [-columns=<list-of-names>]
16 * [-excludeColumns=<list-of-names>]
17 * [-withOnly=<name>]
18 * [-rankOrder]
19 * [-stDevOutlier[=limit=<factor>][,passes=<integer>]]
20 * [-majorOrder=row|column]
21 * ```
22 *
23 * @section Options
24 * | Option | Description |
25 * |-----------------------------------|-----------------------------------------------------------------------------|
26 * | `-pipe` | Use standard input/output for input/output. |
27 * | `-columns` | Specify columns to include in correlation analysis. |
28 * | `-excludeColumns` | Specify columns to exclude from analysis. |
29 * | `-withOnly` | Correlate only with the specified column. |
30 * | `-rankOrder` | Use rank-order (Spearman) correlation instead of linear (Pearson). |
31 * | `-stDevOutlier` | Remove outliers based on standard deviation. |
32 * | `-majorOrder` | Set data ordering to row-major or column-major. |
33 *
34 * @subsection Incompatibilities
35 * - `-columns` and `-excludeColumns` cannot be used together.
36 * - `-withOnly` is mutually exclusive with `-excludeColumns`.
37 *
38 * ### Features
39 * - Linear (Pearson) or Rank-Order (Spearman) correlation calculation.
40 * - Exclusion of specific columns or correlation with a single column.
41 * - Outlier detection and removal using standard deviation thresholds.
42 * - Customizable data ordering (row-major or column-major).
43 *
44 * ### Output
45 * The output SDDS file contains:
46 * - Correlation coefficients and significance for column pairs.
47 * - Number of points used in the correlation.
48 * - Parameters summarizing the correlation analysis.
49 *
50 * @copyright
51 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
52 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
53 *
54 * @license
55 * This file is distributed under the terms of the Software License Agreement
56 * found in the file LICENSE included with this distribution.
57 *
58 * @author
59 * M. Borland, C. Saunders, R. Soliday
60 */
61
62#include "mdb.h"
63#include "SDDS.h"
64#include "scan.h"
65#include "SDDSutils.h"
66#include <ctype.h>
67
68/* Enumeration for option types */
69enum option_type {
70 SET_COLUMNS,
71 SET_EXCLUDE,
72 SET_WITHONLY,
73 SET_PIPE,
74 SET_RANKORDER,
75 SET_STDEVOUTLIER,
76 SET_MAJOR_ORDER,
77 N_OPTIONS
78};
79
80char *option[N_OPTIONS] = {
81 "columns",
82 "excludecolumns",
83 "withonly",
84 "pipe",
85 "rankorder",
86 "stdevoutlier",
87 "majorOrder",
88};
89
90#define USAGE "sddscorrelate [<inputfile>] [<outputfile>]\n\
91 [-pipe=[input][,output]]\n\
92 [-columns=<list-of-names>]\n\
93 [-excludeColumns=<list-of-names>]\n\
94 [-withOnly=<name>]\n\
95 [-rankOrder]\n\
96 [-stDevOutlier[=limit=<factor>][,passes=<integer>]]\n\
97 [-majorOrder=row|column]\n\
98\n\
99Compute and evaluate correlations among columns of data.\n\
100\n\
101Options:\n\
102 -pipe=[input][,output] Use standard input/output as input and/or output.\n\
103 -columns=<list-of-names> Specify columns to include in correlation analysis.\n\
104 -excludeColumns=<list-of-names> Specify columns to exclude from correlation analysis.\n\
105 -withOnly=<name> Correlate only with the specified column.\n\
106 -rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).\n\
107 -stDevOutlier[=limit=<factor>][,passes=<integer>]\n\
108 Remove outliers based on standard deviation.\n\
109 -majorOrder=row|column Set data ordering to row-major or column-major.\n\
110\n\
111Program by Michael Borland. ("__DATE__ " "__TIME__ ", SVN revision: " SVN_VERSION ")\n"
112
113void replaceWithRank(double *data, int64_t n);
114double *findRank(double *data, int64_t n);
115void markStDevOutliers(double *data, double limit, long passes, short *keep, int64_t n);
116
117int main(int argc, char **argv) {
118 int iArg;
119 char **column, **excludeColumn, *withOnly;
120 long columns, excludeColumns;
121 char *input, *output;
122 SCANNED_ARG *scanned;
123 SDDS_DATASET SDDSin, SDDSout;
124 long i, j, row, count, readCode, rankOrder, iName1, iName2;
125 int64_t rows;
126 int32_t outlierStDevPasses;
127 double **data, correlation, significance, outlierStDevLimit;
128 double **rank;
129 short **accept;
130 char s[SDDS_MAXLINE];
131 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
132 short columnMajorOrder = -1;
133
135 argc = scanargs(&scanned, argc, argv);
136 if (argc < 2)
137 bomb(NULL, USAGE);
138
139 output = input = withOnly = NULL;
140 columns = excludeColumns = 0;
141 column = excludeColumn = NULL;
142 pipeFlags = 0;
143 rankOrder = 0;
144 outlierStDevPasses = 0;
145 outlierStDevLimit = 1.0;
146 rank = NULL;
147 accept = NULL;
148
149 for (iArg = 1; iArg < argc; iArg++) {
150 if (scanned[iArg].arg_type == OPTION) {
151 /* process options here */
152 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
153 case SET_MAJOR_ORDER:
154 majorOrderFlag = 0;
155 scanned[iArg].n_items--;
156 if (scanned[iArg].n_items > 0 &&
157 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
158 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
159 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
160 SDDS_Bomb("invalid -majorOrder syntax/values");
161 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
162 columnMajorOrder = 1;
163 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
164 columnMajorOrder = 0;
165 break;
166 case SET_COLUMNS:
167 if (columns)
168 SDDS_Bomb("only one -columns option may be given");
169 if (scanned[iArg].n_items < 2)
170 SDDS_Bomb("invalid -columns syntax");
171 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
172 for (i = 0; i < columns; i++)
173 column[i] = scanned[iArg].list[i + 1];
174 break;
175 case SET_EXCLUDE:
176 if (scanned[iArg].n_items < 2)
177 SDDS_Bomb("invalid -excludeColumns syntax");
178 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
179 break;
180 case SET_WITHONLY:
181 if (withOnly)
182 SDDS_Bomb("only one -withOnly option may be given");
183 if (scanned[iArg].n_items < 2)
184 SDDS_Bomb("invalid -withOnly syntax");
185 withOnly = scanned[iArg].list[1];
186 break;
187 case SET_PIPE:
188 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
189 SDDS_Bomb("invalid -pipe syntax");
190 break;
191 case SET_RANKORDER:
192 rankOrder = 1;
193 break;
194 case SET_STDEVOUTLIER:
195 scanned[iArg].n_items--;
196 outlierStDevPasses = 1;
197 outlierStDevLimit = 1.0;
198 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
199 "limit", SDDS_DOUBLE, &outlierStDevLimit, 1, 0,
200 "passes", SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
201 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
202 SDDS_Bomb("invalid -stdevOutlier syntax/values");
203 break;
204 default:
205 fprintf(stderr, "Error: unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
206 exit(EXIT_FAILURE);
207 break;
208 }
209 } else {
210 if (!input)
211 input = scanned[iArg].list[0];
212 else if (!output)
213 output = scanned[iArg].list[0];
214 else
215 SDDS_Bomb("too many filenames seen");
216 }
217 }
218
219 processFilenames("sddscorrelate", &input, &output, pipeFlags, 0, NULL);
220
221 if (!SDDS_InitializeInput(&SDDSin, input))
222 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
223
224 if (!columns)
225 columns = appendToStringArray(&column, columns, "*");
226 if (withOnly)
227 columns = appendToStringArray(&column, columns, withOnly);
228
229 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
230 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
231 SDDS_Bomb("no columns selected for correlation analysis");
232 }
233
234 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 0, NULL, "sddscorrelate output", output) ||
235 SDDS_DefineColumn(&SDDSout, "Correlate1Name", NULL, NULL, "Name of correlated quantity 1", NULL, SDDS_STRING, 0) < 0 ||
236 SDDS_DefineColumn(&SDDSout, "Correlate2Name", NULL, NULL, "Name of correlated quantity 2", NULL, SDDS_STRING, 0) < 0 ||
237 SDDS_DefineColumn(&SDDSout, "CorrelatePair", NULL, NULL, "Names of correlated quantities", NULL, SDDS_STRING, 0) < 0 ||
238 SDDS_DefineColumn(&SDDSout, "CorrelationCoefficient", "r", NULL, "Linear correlation coefficient", NULL, SDDS_DOUBLE, 0) < 0 ||
239 SDDS_DefineColumn(&SDDSout, "CorrelationSignificance", "P$br$n", NULL, "Linear correlation coefficient significance", NULL, SDDS_DOUBLE, 0) < 0 ||
240 SDDS_DefineColumn(&SDDSout, "CorrelationPoints", NULL, NULL, "Number of points used for correlation", NULL, SDDS_LONG, 0) < 0 ||
241 SDDS_DefineParameter(&SDDSout, "CorrelatedRows", NULL, NULL, "Number of data rows in correlation analysis", NULL, SDDS_LONG, NULL) < 0 ||
242 SDDS_DefineParameter(&SDDSout, "sddscorrelateInputFile", NULL, NULL, "Data file processed by sddscorrelate", NULL, SDDS_STRING, input ? input : "stdin") < 0 ||
243 SDDS_DefineParameter(&SDDSout, "sddscorrelateMode", NULL, NULL, NULL, NULL, SDDS_STRING, rankOrder ? "Rank-Order (Spearman)" : "Linear (Pearson)") < 0 ||
244 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierPasses", NULL, NULL, "Number of passes of standard-deviation outlier elimination applied", NULL, SDDS_LONG, &outlierStDevPasses) < 0 ||
245 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierLimit", NULL, NULL, "Standard-deviation outlier limit applied", NULL, SDDS_DOUBLE, &outlierStDevLimit) < 0) {
246 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
247 }
248
249 if (columnMajorOrder != -1)
250 SDDSout.layout.data_mode.column_major = columnMajorOrder;
251 else
252 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
253
254 if (!SDDS_WriteLayout(&SDDSout))
255 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
256
257 data = malloc(sizeof(*data) * columns);
258 if (!data ||
259 (rankOrder && !(rank = malloc(sizeof(*rank) * columns))) ||
260 !(accept = malloc(sizeof(*accept) * columns))) {
261 SDDS_Bomb("allocation failure");
262 }
263
264 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
265 if ((rows = SDDS_CountRowsOfInterest(&SDDSin)) < 3)
266 continue;
267 if (!SDDS_StartPage(&SDDSout, columns * (columns - 1) / 2) ||
268 !SDDS_SetParameters(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, "CorrelatedRows", rows, NULL)) {
269 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
270 }
271 for (i = 0; i < columns; i++) {
272 data[i] = SDDS_GetColumnInDoubles(&SDDSin, column[i]);
273 if (!data[i])
274 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
275 if (rankOrder)
276 rank[i] = findRank(data[i], rows);
277 if (outlierStDevPasses) {
278 accept[i] = malloc(sizeof(**accept) * rows);
279 if (!accept[i])
280 SDDS_Bomb("allocation failure");
281 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
282 } else {
283 accept[i] = NULL;
284 }
285 }
286 for (i = row = 0; i < columns; i++) {
287 for (j = i + 1; j < columns; j++) {
288 iName1 = i;
289 iName2 = j;
290 if (withOnly) {
291 if (strcmp(withOnly, column[i]) == 0) {
292 iName1 = j;
293 iName2 = i;
294 } else if (strcmp(withOnly, column[j]) == 0) {
295 iName1 = i;
296 iName2 = j;
297 } else {
298 continue;
299 }
300 }
301 correlation = linearCorrelationCoefficient(rankOrder ? rank[i] : data[i],
302 rankOrder ? rank[j] : data[j],
303 accept[i], accept[j], rows, &count);
304 significance = linearCorrelationSignificance(correlation, count);
305 snprintf(s, sizeof(s), "%s.%s", column[iName1], column[iName2]);
306 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, row++,
307 0, column[iName1],
308 1, column[iName2],
309 2, s,
310 3, correlation,
311 4, significance,
312 5, count,
313 -1)) {
314 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
315 }
316 }
317 }
318 for (i = 0; i < columns; i++) {
319 free(data[i]);
320 if (rankOrder)
321 free(rank[i]);
322 if (accept[i])
323 free(accept[i]);
324 }
325 if (!SDDS_WritePage(&SDDSout))
326 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
327 }
328
329 free(data);
330 if (rankOrder)
331 free(rank);
332 free(accept);
333
334 if (!SDDS_Terminate(&SDDSin)) {
335 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
336 exit(EXIT_FAILURE);
337 }
338 if (!SDDS_Terminate(&SDDSout)) {
339 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
340 exit(EXIT_FAILURE);
341 }
342
343 return EXIT_SUCCESS;
344}
345
346void markStDevOutliers(double *data, double limit, long passes, short *keep, int64_t n) {
347 double sum1, sum2, variance, mean, absLimit;
348 long pass;
349 int64_t i, summed, kept;
350
351 for (i = 0; i < n; i++)
352 keep[i] = 1;
353 kept = n;
354 for (pass = 0; pass < passes && kept; pass++) {
355 sum1 = 0.0;
356 summed = 0;
357 for (i = 0; i < n; i++) {
358 if (keep[i]) {
359 sum1 += data[i];
360 summed += 1;
361 }
362 }
363 if (summed < 2)
364 return;
365 mean = sum1 / summed;
366 sum2 = 0.0;
367 for (i = 0; i < n; i++) {
368 if (keep[i])
369 sum2 += sqr(data[i] - mean);
370 }
371 variance = sum2 / summed;
372 if (variance > 0.0) {
373 absLimit = limit * sqrt(variance);
374 for (i = 0; i < n; i++) {
375 if (keep[i] && fabs(data[i] - mean) > absLimit) {
376 keep[i] = 0;
377 kept--;
378 }
379 }
380 }
381 }
382}
383
384typedef struct {
385 double data;
386 long originalIndex;
387} DATAnINDEX;
388
389int compareData(const void *d1, const void *d2) {
390 double diff = ((DATAnINDEX *)d1)->data - ((DATAnINDEX *)d2)->data;
391 if (diff < 0)
392 return -1;
393 else if (diff > 0)
394 return 1;
395 else
396 return 0;
397}
398
399double *findRank(double *data, int64_t n) {
400 double *rank = malloc(sizeof(*rank) * n);
401 if (!rank)
402 return NULL;
403 for (int64_t i = 0; i < n; i++)
404 rank[i] = data[i];
405 replaceWithRank(rank, n);
406 return rank;
407}
408
409void replaceWithRank(double *data, int64_t n) {
410 static DATAnINDEX *indexedData = NULL;
411 int64_t i, j, iStart, iEnd;
412
413 indexedData = SDDS_Realloc(indexedData, sizeof(*indexedData) * n);
414 for (i = 0; i < n; i++) {
415 indexedData[i].data = data[i];
416 indexedData[i].originalIndex = i;
417 }
418 qsort(indexedData, n, sizeof(*indexedData), compareData);
419 for (i = 0; i < n; i++)
420 data[indexedData[i].originalIndex] = (double)i;
421 for (i = 0; i < n - 1; i++) {
422 if (data[i] == data[i + 1]) {
423 iStart = i;
424 for (j = i + 2; j < n; j++) {
425 if (data[j] != data[i])
426 break;
427 }
428 iEnd = j - 1;
429 double averageRank = (iStart + iEnd) / 2.0;
430 for (j = iStart; j <= iEnd; j++)
431 data[indexedData[j].originalIndex] = averageRank;
432 i = iEnd;
433 }
434 }
435}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_DefineParameter1(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, void *fixed_value)
Defines a data parameter with a fixed numerical value.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
Utility functions for SDDS dataset manipulation and string array operations.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double linearCorrelationSignificance(double r, long rows)
Compute the statistical significance of a linear correlation coefficient.
Definition lincorr.c:79
double linearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count)
Compute the linear correlation coefficient for two data sets.
Definition lincorr.c:32
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.