SDDSlib
Loading...
Searching...
No Matches
sddscorrelate.c
Go to the documentation of this file.
1/**
2 * @file sddscorrelate.c
3 * @brief Computes and evaluates correlations among columns of data in SDDS files.
4 *
5 * This program reads an input SDDS file, calculates the correlation coefficients
6 * between specified or all numeric columns, and outputs the results to a new SDDS file.
7 * It supports options for rank-order correlation, standard deviation-based outlier removal,
8 * and column/row major data ordering.
9 *
10 * ### Features:
11 * - Linear (Pearson) or Rank-Order (Spearman) correlation calculation.
12 * - Exclusion of specific columns or correlation with a single column.
13 * - Outlier detection and removal using standard deviation thresholds.
14 * - Customizable data ordering (row-major or column-major).
15 *
16 * ### Options:
17 * - `-pipe=[input][,output]` : Use standard input/output for input/output.
18 * - `-columns=<list-of-names>` : Specify columns to include in correlation analysis.
19 * - `-excludeColumns=<list-of-names>` : Specify columns to exclude from analysis.
20 * - `-withOnly=<name>` : Correlate only with the specified column.
21 * - `-rankOrder` : Use rank-order (Spearman) correlation instead of linear (Pearson).
22 * - `-stDevOutlier[=limit=<factor>][,passes=<integer>]` : Remove outliers based on standard deviation.
23 * - `-majorOrder=row|column` : Set data ordering to row-major or column-major.
24 *
25 * ### Output:
26 * The output SDDS file contains:
27 * - Correlation coefficients and significance for column pairs.
28 * - Number of points used in the correlation.
29 * - Parameters summarizing the correlation analysis.
30 *
31 * @copyright
32 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
33 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
34 *
35 * @license
36 * This file is distributed under the terms of the Software License Agreement
37 * found in the file LICENSE included with this distribution.
38 *
39 * @author M. Borland, C. Saunders, R. Soliday
40 */
41
42#include "mdb.h"
43#include "SDDS.h"
44#include "scan.h"
45#include "SDDSutils.h"
46#include <ctype.h>
47
48/* Enumeration for option types */
49enum option_type {
50 SET_COLUMNS,
51 SET_EXCLUDE,
52 SET_WITHONLY,
53 SET_PIPE,
54 SET_RANKORDER,
55 SET_STDEVOUTLIER,
56 SET_MAJOR_ORDER,
57 N_OPTIONS
58};
59
60char *option[N_OPTIONS] = {
61 "columns",
62 "excludecolumns",
63 "withonly",
64 "pipe",
65 "rankorder",
66 "stdevoutlier",
67 "majorOrder",
68};
69
70#define USAGE "Usage: sddscorrelate [OPTIONS] [<inputfile>] [<outputfile>]\n\
71\n\
72Compute and evaluate correlations among columns of data.\n\
73\n\
74Options:\n\
75 -pipe=[input][,output] Use standard input/output as input and/or output.\n\
76 -columns=<list-of-names> Specify columns to include in correlation analysis.\n\
77 -excludeColumns=<list-of-names> Specify columns to exclude from correlation analysis.\n\
78 -withOnly=<name> Correlate only with the specified column.\n\
79 -rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).\n\
80 -stDevOutlier[=limit=<factor>][,passes=<integer>]\n\
81 Remove outliers based on standard deviation.\n\
82 -majorOrder=row|column Set data ordering to row-major or column-major.\n\
83\n\
84Program by Michael Borland. ("__DATE__ " "__TIME__ ", SVN revision: " SVN_VERSION ")\n"
85
86void replaceWithRank(double *data, int64_t n);
87double *findRank(double *data, int64_t n);
88void markStDevOutliers(double *data, double limit, long passes, short *keep, int64_t n);
89
90int main(int argc, char **argv) {
91 int iArg;
92 char **column, **excludeColumn, *withOnly;
93 long columns, excludeColumns;
94 char *input, *output;
95 SCANNED_ARG *scanned;
96 SDDS_DATASET SDDSin, SDDSout;
97 long i, j, row, count, readCode, rankOrder, iName1, iName2;
98 int64_t rows;
99 int32_t outlierStDevPasses;
100 double **data, correlation, significance, outlierStDevLimit;
101 double **rank;
102 short **accept;
103 char s[SDDS_MAXLINE];
104 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
105 short columnMajorOrder = -1;
106
108 argc = scanargs(&scanned, argc, argv);
109 if (argc < 2)
110 bomb(NULL, USAGE);
111
112 output = input = withOnly = NULL;
113 columns = excludeColumns = 0;
114 column = excludeColumn = NULL;
115 pipeFlags = 0;
116 rankOrder = 0;
117 outlierStDevPasses = 0;
118 outlierStDevLimit = 1.0;
119 rank = NULL;
120 accept = NULL;
121
122 for (iArg = 1; iArg < argc; iArg++) {
123 if (scanned[iArg].arg_type == OPTION) {
124 /* process options here */
125 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
126 case SET_MAJOR_ORDER:
127 majorOrderFlag = 0;
128 scanned[iArg].n_items--;
129 if (scanned[iArg].n_items > 0 &&
130 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
131 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
132 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
133 SDDS_Bomb("invalid -majorOrder syntax/values");
134 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
135 columnMajorOrder = 1;
136 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
137 columnMajorOrder = 0;
138 break;
139 case SET_COLUMNS:
140 if (columns)
141 SDDS_Bomb("only one -columns option may be given");
142 if (scanned[iArg].n_items < 2)
143 SDDS_Bomb("invalid -columns syntax");
144 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
145 for (i = 0; i < columns; i++)
146 column[i] = scanned[iArg].list[i + 1];
147 break;
148 case SET_EXCLUDE:
149 if (scanned[iArg].n_items < 2)
150 SDDS_Bomb("invalid -excludeColumns syntax");
151 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
152 break;
153 case SET_WITHONLY:
154 if (withOnly)
155 SDDS_Bomb("only one -withOnly option may be given");
156 if (scanned[iArg].n_items < 2)
157 SDDS_Bomb("invalid -withOnly syntax");
158 withOnly = scanned[iArg].list[1];
159 break;
160 case SET_PIPE:
161 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
162 SDDS_Bomb("invalid -pipe syntax");
163 break;
164 case SET_RANKORDER:
165 rankOrder = 1;
166 break;
167 case SET_STDEVOUTLIER:
168 scanned[iArg].n_items--;
169 outlierStDevPasses = 1;
170 outlierStDevLimit = 1.0;
171 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
172 "limit", SDDS_DOUBLE, &outlierStDevLimit, 1, 0,
173 "passes", SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
174 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
175 SDDS_Bomb("invalid -stdevOutlier syntax/values");
176 break;
177 default:
178 fprintf(stderr, "Error: unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
179 exit(EXIT_FAILURE);
180 break;
181 }
182 } else {
183 if (!input)
184 input = scanned[iArg].list[0];
185 else if (!output)
186 output = scanned[iArg].list[0];
187 else
188 SDDS_Bomb("too many filenames seen");
189 }
190 }
191
192 processFilenames("sddscorrelate", &input, &output, pipeFlags, 0, NULL);
193
194 if (!SDDS_InitializeInput(&SDDSin, input))
195 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
196
197 if (!columns)
198 columns = appendToStringArray(&column, columns, "*");
199 if (withOnly)
200 columns = appendToStringArray(&column, columns, withOnly);
201
202 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
203 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
204 SDDS_Bomb("no columns selected for correlation analysis");
205 }
206
207 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 0, NULL, "sddscorrelate output", output) ||
208 SDDS_DefineColumn(&SDDSout, "Correlate1Name", NULL, NULL, "Name of correlated quantity 1", NULL, SDDS_STRING, 0) < 0 ||
209 SDDS_DefineColumn(&SDDSout, "Correlate2Name", NULL, NULL, "Name of correlated quantity 2", NULL, SDDS_STRING, 0) < 0 ||
210 SDDS_DefineColumn(&SDDSout, "CorrelatePair", NULL, NULL, "Names of correlated quantities", NULL, SDDS_STRING, 0) < 0 ||
211 SDDS_DefineColumn(&SDDSout, "CorrelationCoefficient", "r", NULL, "Linear correlation coefficient", NULL, SDDS_DOUBLE, 0) < 0 ||
212 SDDS_DefineColumn(&SDDSout, "CorrelationSignificance", "P$br$n", NULL, "Linear correlation coefficient significance", NULL, SDDS_DOUBLE, 0) < 0 ||
213 SDDS_DefineColumn(&SDDSout, "CorrelationPoints", NULL, NULL, "Number of points used for correlation", NULL, SDDS_LONG, 0) < 0 ||
214 SDDS_DefineParameter(&SDDSout, "CorrelatedRows", NULL, NULL, "Number of data rows in correlation analysis", NULL, SDDS_LONG, NULL) < 0 ||
215 SDDS_DefineParameter(&SDDSout, "sddscorrelateInputFile", NULL, NULL, "Data file processed by sddscorrelate", NULL, SDDS_STRING, input ? input : "stdin") < 0 ||
216 SDDS_DefineParameter(&SDDSout, "sddscorrelateMode", NULL, NULL, NULL, NULL, SDDS_STRING, rankOrder ? "Rank-Order (Spearman)" : "Linear (Pearson)") < 0 ||
217 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierPasses", NULL, NULL, "Number of passes of standard-deviation outlier elimination applied", NULL, SDDS_LONG, &outlierStDevPasses) < 0 ||
218 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierLimit", NULL, NULL, "Standard-deviation outlier limit applied", NULL, SDDS_DOUBLE, &outlierStDevLimit) < 0) {
219 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
220 }
221
222 if (columnMajorOrder != -1)
223 SDDSout.layout.data_mode.column_major = columnMajorOrder;
224 else
225 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
226
227 if (!SDDS_WriteLayout(&SDDSout))
228 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
229
230 data = malloc(sizeof(*data) * columns);
231 if (!data ||
232 (rankOrder && !(rank = malloc(sizeof(*rank) * columns))) ||
233 !(accept = malloc(sizeof(*accept) * columns))) {
234 SDDS_Bomb("allocation failure");
235 }
236
237 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
238 if ((rows = SDDS_CountRowsOfInterest(&SDDSin)) < 3)
239 continue;
240 if (!SDDS_StartPage(&SDDSout, columns * (columns - 1) / 2) ||
241 !SDDS_SetParameters(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, "CorrelatedRows", rows, NULL)) {
242 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
243 }
244 for (i = 0; i < columns; i++) {
245 data[i] = SDDS_GetColumnInDoubles(&SDDSin, column[i]);
246 if (!data[i])
247 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
248 if (rankOrder)
249 rank[i] = findRank(data[i], rows);
250 if (outlierStDevPasses) {
251 accept[i] = malloc(sizeof(**accept) * rows);
252 if (!accept[i])
253 SDDS_Bomb("allocation failure");
254 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
255 } else {
256 accept[i] = NULL;
257 }
258 }
259 for (i = row = 0; i < columns; i++) {
260 for (j = i + 1; j < columns; j++) {
261 iName1 = i;
262 iName2 = j;
263 if (withOnly) {
264 if (strcmp(withOnly, column[i]) == 0) {
265 iName1 = j;
266 iName2 = i;
267 } else if (strcmp(withOnly, column[j]) == 0) {
268 iName1 = i;
269 iName2 = j;
270 } else {
271 continue;
272 }
273 }
274 correlation = linearCorrelationCoefficient(rankOrder ? rank[i] : data[i],
275 rankOrder ? rank[j] : data[j],
276 accept[i], accept[j], rows, &count);
277 significance = linearCorrelationSignificance(correlation, count);
278 snprintf(s, sizeof(s), "%s.%s", column[iName1], column[iName2]);
279 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, row++,
280 0, column[iName1],
281 1, column[iName2],
282 2, s,
283 3, correlation,
284 4, significance,
285 5, count,
286 -1)) {
287 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
288 }
289 }
290 }
291 for (i = 0; i < columns; i++) {
292 free(data[i]);
293 if (rankOrder)
294 free(rank[i]);
295 if (accept[i])
296 free(accept[i]);
297 }
298 if (!SDDS_WritePage(&SDDSout))
299 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
300 }
301
302 free(data);
303 if (rankOrder)
304 free(rank);
305 free(accept);
306
307 if (!SDDS_Terminate(&SDDSin)) {
308 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
309 exit(EXIT_FAILURE);
310 }
311 if (!SDDS_Terminate(&SDDSout)) {
312 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
313 exit(EXIT_FAILURE);
314 }
315
316 return EXIT_SUCCESS;
317}
318
319void markStDevOutliers(double *data, double limit, long passes, short *keep, int64_t n) {
320 double sum1, sum2, variance, mean, absLimit;
321 long pass;
322 int64_t i, summed, kept;
323
324 for (i = 0; i < n; i++)
325 keep[i] = 1;
326 kept = n;
327 for (pass = 0; pass < passes && kept; pass++) {
328 sum1 = 0.0;
329 summed = 0;
330 for (i = 0; i < n; i++) {
331 if (keep[i]) {
332 sum1 += data[i];
333 summed += 1;
334 }
335 }
336 if (summed < 2)
337 return;
338 mean = sum1 / summed;
339 sum2 = 0.0;
340 for (i = 0; i < n; i++) {
341 if (keep[i])
342 sum2 += sqr(data[i] - mean);
343 }
344 variance = sum2 / summed;
345 if (variance > 0.0) {
346 absLimit = limit * sqrt(variance);
347 for (i = 0; i < n; i++) {
348 if (keep[i] && fabs(data[i] - mean) > absLimit) {
349 keep[i] = 0;
350 kept--;
351 }
352 }
353 }
354 }
355}
356
357typedef struct {
358 double data;
359 long originalIndex;
360} DATAnINDEX;
361
362int compareData(const void *d1, const void *d2) {
363 double diff = ((DATAnINDEX *)d1)->data - ((DATAnINDEX *)d2)->data;
364 if (diff < 0)
365 return -1;
366 else if (diff > 0)
367 return 1;
368 else
369 return 0;
370}
371
372double *findRank(double *data, int64_t n) {
373 double *rank = malloc(sizeof(*rank) * n);
374 if (!rank)
375 return NULL;
376 for (int64_t i = 0; i < n; i++)
377 rank[i] = data[i];
378 replaceWithRank(rank, n);
379 return rank;
380}
381
382void replaceWithRank(double *data, int64_t n) {
383 static DATAnINDEX *indexedData = NULL;
384 int64_t i, j, iStart, iEnd;
385
386 indexedData = SDDS_Realloc(indexedData, sizeof(*indexedData) * n);
387 for (i = 0; i < n; i++) {
388 indexedData[i].data = data[i];
389 indexedData[i].originalIndex = i;
390 }
391 qsort(indexedData, n, sizeof(*indexedData), compareData);
392 for (i = 0; i < n; i++)
393 data[indexedData[i].originalIndex] = (double)i;
394 for (i = 0; i < n - 1; i++) {
395 if (data[i] == data[i + 1]) {
396 iStart = i;
397 for (j = i + 2; j < n; j++) {
398 if (data[j] != data[i])
399 break;
400 }
401 iEnd = j - 1;
402 double averageRank = (iStart + iEnd) / 2.0;
403 for (j = iStart; j <= iEnd; j++)
404 data[indexedData[j].originalIndex] = averageRank;
405 i = iEnd;
406 }
407 }
408}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_DefineParameter1(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, void *fixed_value)
Defines a data parameter with a fixed numerical value.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double linearCorrelationSignificance(double r, long rows)
Compute the statistical significance of a linear correlation coefficient.
Definition lincorr.c:79
double linearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count)
Compute the linear correlation coefficient for two data sets.
Definition lincorr.c:32
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.