SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddscorrelate.c File Reference

Detailed Description

Computes and evaluates correlations among columns of data in SDDS files.

This program reads an input SDDS file, calculates the correlation coefficients between specified or all numeric columns, and outputs the results to a new SDDS file. It supports options for rank-order correlation, standard deviation-based outlier removal, and column/row major data ordering.

Usage

sddscorrelate [<inputfile>] [<outputfile>]
[-pipe=[input][,output]]
[-columns=<list-of-names>]
[-excludeColumns=<list-of-names>]
[-withOnly=<name>]
[-rankOrder]
[-stDevOutlier[=limit=<factor>][,passes=<integer>]]
[-majorOrder=row|column]

Options

Option Description
-pipe Use standard input/output for input/output.
-columns Specify columns to include in correlation analysis.
-excludeColumns Specify columns to exclude from analysis.
-withOnly Correlate only with the specified column.
-rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).
-stDevOutlier Remove outliers based on standard deviation.
-majorOrder Set data ordering to row-major or column-major.

Incompatibilities

  • -columns and -excludeColumns cannot be used together.
  • -withOnly is mutually exclusive with -excludeColumns.

Features

  • Linear (Pearson) or Rank-Order (Spearman) correlation calculation.
  • Exclusion of specific columns or correlation with a single column.
  • Outlier detection and removal using standard deviation thresholds.
  • Customizable data ordering (row-major or column-major).

Output

The output SDDS file contains:

  • Correlation coefficients and significance for column pairs.
  • Number of points used in the correlation.
  • Parameters summarizing the correlation analysis.
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
M. Borland, C. Saunders, R. Soliday

Definition in file sddscorrelate.c.

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"
#include "SDDSutils.h"
#include <ctype.h>

Go to the source code of this file.

Functions

void replaceWithRank (double *data, int64_t n)
 
double * findRank (double *data, int64_t n)
 
void markStDevOutliers (double *data, double limit, long passes, short *keep, int64_t n)
 
int main (int argc, char **argv)
 
int compareData (const void *d1, const void *d2)
 

Function Documentation

◆ compareData()

int compareData ( const void * d1,
const void * d2 )

Definition at line 389 of file sddscorrelate.c.

389 {
390 double diff = ((DATAnINDEX *)d1)->data - ((DATAnINDEX *)d2)->data;
391 if (diff < 0)
392 return -1;
393 else if (diff > 0)
394 return 1;
395 else
396 return 0;
397}

◆ findRank()

double * findRank ( double * data,
int64_t n )

Definition at line 399 of file sddscorrelate.c.

399 {
400 double *rank = malloc(sizeof(*rank) * n);
401 if (!rank)
402 return NULL;
403 for (int64_t i = 0; i < n; i++)
404 rank[i] = data[i];
405 replaceWithRank(rank, n);
406 return rank;
407}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 117 of file sddscorrelate.c.

117 {
118 int iArg;
119 char **column, **excludeColumn, *withOnly;
120 long columns, excludeColumns;
121 char *input, *output;
122 SCANNED_ARG *scanned;
123 SDDS_DATASET SDDSin, SDDSout;
124 long i, j, row, count, readCode, rankOrder, iName1, iName2;
125 int64_t rows;
126 int32_t outlierStDevPasses;
127 double **data, correlation, significance, outlierStDevLimit;
128 double **rank;
129 short **accept;
130 char s[SDDS_MAXLINE];
131 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
132 short columnMajorOrder = -1;
133
135 argc = scanargs(&scanned, argc, argv);
136 if (argc < 2)
137 bomb(NULL, USAGE);
138
139 output = input = withOnly = NULL;
140 columns = excludeColumns = 0;
141 column = excludeColumn = NULL;
142 pipeFlags = 0;
143 rankOrder = 0;
144 outlierStDevPasses = 0;
145 outlierStDevLimit = 1.0;
146 rank = NULL;
147 accept = NULL;
148
149 for (iArg = 1; iArg < argc; iArg++) {
150 if (scanned[iArg].arg_type == OPTION) {
151 /* process options here */
152 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
153 case SET_MAJOR_ORDER:
154 majorOrderFlag = 0;
155 scanned[iArg].n_items--;
156 if (scanned[iArg].n_items > 0 &&
157 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
158 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
159 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
160 SDDS_Bomb("invalid -majorOrder syntax/values");
161 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
162 columnMajorOrder = 1;
163 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
164 columnMajorOrder = 0;
165 break;
166 case SET_COLUMNS:
167 if (columns)
168 SDDS_Bomb("only one -columns option may be given");
169 if (scanned[iArg].n_items < 2)
170 SDDS_Bomb("invalid -columns syntax");
171 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
172 for (i = 0; i < columns; i++)
173 column[i] = scanned[iArg].list[i + 1];
174 break;
175 case SET_EXCLUDE:
176 if (scanned[iArg].n_items < 2)
177 SDDS_Bomb("invalid -excludeColumns syntax");
178 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
179 break;
180 case SET_WITHONLY:
181 if (withOnly)
182 SDDS_Bomb("only one -withOnly option may be given");
183 if (scanned[iArg].n_items < 2)
184 SDDS_Bomb("invalid -withOnly syntax");
185 withOnly = scanned[iArg].list[1];
186 break;
187 case SET_PIPE:
188 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
189 SDDS_Bomb("invalid -pipe syntax");
190 break;
191 case SET_RANKORDER:
192 rankOrder = 1;
193 break;
194 case SET_STDEVOUTLIER:
195 scanned[iArg].n_items--;
196 outlierStDevPasses = 1;
197 outlierStDevLimit = 1.0;
198 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
199 "limit", SDDS_DOUBLE, &outlierStDevLimit, 1, 0,
200 "passes", SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
201 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
202 SDDS_Bomb("invalid -stdevOutlier syntax/values");
203 break;
204 default:
205 fprintf(stderr, "Error: unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
206 exit(EXIT_FAILURE);
207 break;
208 }
209 } else {
210 if (!input)
211 input = scanned[iArg].list[0];
212 else if (!output)
213 output = scanned[iArg].list[0];
214 else
215 SDDS_Bomb("too many filenames seen");
216 }
217 }
218
219 processFilenames("sddscorrelate", &input, &output, pipeFlags, 0, NULL);
220
221 if (!SDDS_InitializeInput(&SDDSin, input))
222 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
223
224 if (!columns)
225 columns = appendToStringArray(&column, columns, "*");
226 if (withOnly)
227 columns = appendToStringArray(&column, columns, withOnly);
228
229 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
230 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
231 SDDS_Bomb("no columns selected for correlation analysis");
232 }
233
234 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 0, NULL, "sddscorrelate output", output) ||
235 SDDS_DefineColumn(&SDDSout, "Correlate1Name", NULL, NULL, "Name of correlated quantity 1", NULL, SDDS_STRING, 0) < 0 ||
236 SDDS_DefineColumn(&SDDSout, "Correlate2Name", NULL, NULL, "Name of correlated quantity 2", NULL, SDDS_STRING, 0) < 0 ||
237 SDDS_DefineColumn(&SDDSout, "CorrelatePair", NULL, NULL, "Names of correlated quantities", NULL, SDDS_STRING, 0) < 0 ||
238 SDDS_DefineColumn(&SDDSout, "CorrelationCoefficient", "r", NULL, "Linear correlation coefficient", NULL, SDDS_DOUBLE, 0) < 0 ||
239 SDDS_DefineColumn(&SDDSout, "CorrelationSignificance", "P$br$n", NULL, "Linear correlation coefficient significance", NULL, SDDS_DOUBLE, 0) < 0 ||
240 SDDS_DefineColumn(&SDDSout, "CorrelationPoints", NULL, NULL, "Number of points used for correlation", NULL, SDDS_LONG, 0) < 0 ||
241 SDDS_DefineParameter(&SDDSout, "CorrelatedRows", NULL, NULL, "Number of data rows in correlation analysis", NULL, SDDS_LONG, NULL) < 0 ||
242 SDDS_DefineParameter(&SDDSout, "sddscorrelateInputFile", NULL, NULL, "Data file processed by sddscorrelate", NULL, SDDS_STRING, input ? input : "stdin") < 0 ||
243 SDDS_DefineParameter(&SDDSout, "sddscorrelateMode", NULL, NULL, NULL, NULL, SDDS_STRING, rankOrder ? "Rank-Order (Spearman)" : "Linear (Pearson)") < 0 ||
244 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierPasses", NULL, NULL, "Number of passes of standard-deviation outlier elimination applied", NULL, SDDS_LONG, &outlierStDevPasses) < 0 ||
245 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierLimit", NULL, NULL, "Standard-deviation outlier limit applied", NULL, SDDS_DOUBLE, &outlierStDevLimit) < 0) {
246 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
247 }
248
249 if (columnMajorOrder != -1)
250 SDDSout.layout.data_mode.column_major = columnMajorOrder;
251 else
252 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
253
254 if (!SDDS_WriteLayout(&SDDSout))
255 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
256
257 data = malloc(sizeof(*data) * columns);
258 if (!data ||
259 (rankOrder && !(rank = malloc(sizeof(*rank) * columns))) ||
260 !(accept = malloc(sizeof(*accept) * columns))) {
261 SDDS_Bomb("allocation failure");
262 }
263
264 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
265 if ((rows = SDDS_CountRowsOfInterest(&SDDSin)) < 3)
266 continue;
267 if (!SDDS_StartPage(&SDDSout, columns * (columns - 1) / 2) ||
268 !SDDS_SetParameters(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, "CorrelatedRows", rows, NULL)) {
269 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
270 }
271 for (i = 0; i < columns; i++) {
272 data[i] = SDDS_GetColumnInDoubles(&SDDSin, column[i]);
273 if (!data[i])
274 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
275 if (rankOrder)
276 rank[i] = findRank(data[i], rows);
277 if (outlierStDevPasses) {
278 accept[i] = malloc(sizeof(**accept) * rows);
279 if (!accept[i])
280 SDDS_Bomb("allocation failure");
281 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
282 } else {
283 accept[i] = NULL;
284 }
285 }
286 for (i = row = 0; i < columns; i++) {
287 for (j = i + 1; j < columns; j++) {
288 iName1 = i;
289 iName2 = j;
290 if (withOnly) {
291 if (strcmp(withOnly, column[i]) == 0) {
292 iName1 = j;
293 iName2 = i;
294 } else if (strcmp(withOnly, column[j]) == 0) {
295 iName1 = i;
296 iName2 = j;
297 } else {
298 continue;
299 }
300 }
301 correlation = linearCorrelationCoefficient(rankOrder ? rank[i] : data[i],
302 rankOrder ? rank[j] : data[j],
303 accept[i], accept[j], rows, &count);
304 significance = linearCorrelationSignificance(correlation, count);
305 snprintf(s, sizeof(s), "%s.%s", column[iName1], column[iName2]);
306 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, row++,
307 0, column[iName1],
308 1, column[iName2],
309 2, s,
310 3, correlation,
311 4, significance,
312 5, count,
313 -1)) {
314 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
315 }
316 }
317 }
318 for (i = 0; i < columns; i++) {
319 free(data[i]);
320 if (rankOrder)
321 free(rank[i]);
322 if (accept[i])
323 free(accept[i]);
324 }
325 if (!SDDS_WritePage(&SDDSout))
326 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
327 }
328
329 free(data);
330 if (rankOrder)
331 free(rank);
332 free(accept);
333
334 if (!SDDS_Terminate(&SDDSin)) {
335 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
336 exit(EXIT_FAILURE);
337 }
338 if (!SDDS_Terminate(&SDDSout)) {
339 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
340 exit(EXIT_FAILURE);
341 }
342
343 return EXIT_SUCCESS;
344}
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_DefineParameter1(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, void *fixed_value)
Defines a data parameter with a fixed numerical value.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double linearCorrelationSignificance(double r, long rows)
Compute the statistical significance of a linear correlation coefficient.
Definition lincorr.c:79
double linearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count)
Compute the linear correlation coefficient for two data sets.
Definition lincorr.c:32
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.

◆ markStDevOutliers()

void markStDevOutliers ( double * data,
double limit,
long passes,
short * keep,
int64_t n )

Definition at line 346 of file sddscorrelate.c.

346 {
347 double sum1, sum2, variance, mean, absLimit;
348 long pass;
349 int64_t i, summed, kept;
350
351 for (i = 0; i < n; i++)
352 keep[i] = 1;
353 kept = n;
354 for (pass = 0; pass < passes && kept; pass++) {
355 sum1 = 0.0;
356 summed = 0;
357 for (i = 0; i < n; i++) {
358 if (keep[i]) {
359 sum1 += data[i];
360 summed += 1;
361 }
362 }
363 if (summed < 2)
364 return;
365 mean = sum1 / summed;
366 sum2 = 0.0;
367 for (i = 0; i < n; i++) {
368 if (keep[i])
369 sum2 += sqr(data[i] - mean);
370 }
371 variance = sum2 / summed;
372 if (variance > 0.0) {
373 absLimit = limit * sqrt(variance);
374 for (i = 0; i < n; i++) {
375 if (keep[i] && fabs(data[i] - mean) > absLimit) {
376 keep[i] = 0;
377 kept--;
378 }
379 }
380 }
381 }
382}

◆ replaceWithRank()

void replaceWithRank ( double * data,
int64_t n )

Definition at line 409 of file sddscorrelate.c.

409 {
410 static DATAnINDEX *indexedData = NULL;
411 int64_t i, j, iStart, iEnd;
412
413 indexedData = SDDS_Realloc(indexedData, sizeof(*indexedData) * n);
414 for (i = 0; i < n; i++) {
415 indexedData[i].data = data[i];
416 indexedData[i].originalIndex = i;
417 }
418 qsort(indexedData, n, sizeof(*indexedData), compareData);
419 for (i = 0; i < n; i++)
420 data[indexedData[i].originalIndex] = (double)i;
421 for (i = 0; i < n - 1; i++) {
422 if (data[i] == data[i + 1]) {
423 iStart = i;
424 for (j = i + 2; j < n; j++) {
425 if (data[j] != data[i])
426 break;
427 }
428 iEnd = j - 1;
429 double averageRank = (iStart + iEnd) / 2.0;
430 for (j = iStart; j <= iEnd; j++)
431 data[indexedData[j].originalIndex] = averageRank;
432 i = iEnd;
433 }
434 }
435}
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677