SDDSlib
Loading...
Searching...
No Matches
sddscorrelate.c File Reference

Computes and evaluates correlations among columns of data in SDDS files. More...

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"
#include "SDDSutils.h"
#include <ctype.h>

Go to the source code of this file.

Classes

struct  DATAnINDEX
 

Macros

#define USAGE   "Usage: sddscorrelate [OPTIONS] [<inputfile>] [<outputfile>]\n\\n\Compute and evaluate correlations among columns of data.\n\\n\Options:\n\ -pipe=[input][,output] Use standard input/output as input and/or output.\n\ -columns=<list-of-names> Specify columns to include in correlation analysis.\n\ -excludeColumns=<list-of-names> Specify columns to exclude from correlation analysis.\n\ -withOnly=<name> Correlate only with the specified column.\n\ -rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).\n\ -stDevOutlier[=limit=<factor>][,passes=<integer>]\n\ Remove outliers based on standard deviation.\n\ -majorOrder=row|column Set data ordering to row-major or column-major.\n\\n\Program by Michael Borland. ("__DATE__ " "__TIME__ ", SVN revision: " SVN_VERSION ")\n"
 

Enumerations

enum  option_type {
  SET_COLUMNS , SET_EXCLUDE , SET_WITHONLY , SET_PIPE ,
  SET_RANKORDER , SET_STDEVOUTLIER , SET_MAJOR_ORDER , N_OPTIONS
}
 

Functions

void replaceWithRank (double *data, int64_t n)
 
double * findRank (double *data, int64_t n)
 
void markStDevOutliers (double *data, double limit, long passes, short *keep, int64_t n)
 
int main (int argc, char **argv)
 
int compareData (const void *d1, const void *d2)
 

Variables

char * option [N_OPTIONS]
 

Detailed Description

Computes and evaluates correlations among columns of data in SDDS files.

This program reads an input SDDS file, calculates the correlation coefficients between specified or all numeric columns, and outputs the results to a new SDDS file. It supports options for rank-order correlation, standard deviation-based outlier removal, and column/row major data ordering.

Features:

  • Linear (Pearson) or Rank-Order (Spearman) correlation calculation.
  • Exclusion of specific columns or correlation with a single column.
  • Outlier detection and removal using standard deviation thresholds.
  • Customizable data ordering (row-major or column-major).

Options:

  • -pipe=[input][,output] : Use standard input/output for input/output.
  • -columns=<list-of-names> : Specify columns to include in correlation analysis.
  • -excludeColumns=<list-of-names> : Specify columns to exclude from analysis.
  • -withOnly=<name> : Correlate only with the specified column.
  • -rankOrder : Use rank-order (Spearman) correlation instead of linear (Pearson).
  • -stDevOutlier[=limit=<factor>][,passes=<integer>] : Remove outliers based on standard deviation.
  • -majorOrder=row|column : Set data ordering to row-major or column-major.

Output:

The output SDDS file contains:

  • Correlation coefficients and significance for column pairs.
  • Number of points used in the correlation.
  • Parameters summarizing the correlation analysis.
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
M. Borland, C. Saunders, R. Soliday

Definition in file sddscorrelate.c.

Macro Definition Documentation

◆ USAGE

#define USAGE   "Usage: sddscorrelate [OPTIONS] [<inputfile>] [<outputfile>]\n\\n\Compute and evaluate correlations among columns of data.\n\\n\Options:\n\ -pipe=[input][,output] Use standard input/output as input and/or output.\n\ -columns=<list-of-names> Specify columns to include in correlation analysis.\n\ -excludeColumns=<list-of-names> Specify columns to exclude from correlation analysis.\n\ -withOnly=<name> Correlate only with the specified column.\n\ -rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).\n\ -stDevOutlier[=limit=<factor>][,passes=<integer>]\n\ Remove outliers based on standard deviation.\n\ -majorOrder=row|column Set data ordering to row-major or column-major.\n\\n\Program by Michael Borland. ("__DATE__ " "__TIME__ ", SVN revision: " SVN_VERSION ")\n"

Definition at line 70 of file sddscorrelate.c.

Enumeration Type Documentation

◆ option_type

enum option_type

Definition at line 49 of file sddscorrelate.c.

49 {
50 SET_COLUMNS,
51 SET_EXCLUDE,
52 SET_WITHONLY,
53 SET_PIPE,
54 SET_RANKORDER,
55 SET_STDEVOUTLIER,
56 SET_MAJOR_ORDER,
57 N_OPTIONS
58};

Function Documentation

◆ compareData()

int compareData ( const void * d1,
const void * d2 )

Definition at line 362 of file sddscorrelate.c.

362 {
363 double diff = ((DATAnINDEX *)d1)->data - ((DATAnINDEX *)d2)->data;
364 if (diff < 0)
365 return -1;
366 else if (diff > 0)
367 return 1;
368 else
369 return 0;
370}

◆ findRank()

double * findRank ( double * data,
int64_t n )

Definition at line 372 of file sddscorrelate.c.

372 {
373 double *rank = malloc(sizeof(*rank) * n);
374 if (!rank)
375 return NULL;
376 for (int64_t i = 0; i < n; i++)
377 rank[i] = data[i];
378 replaceWithRank(rank, n);
379 return rank;
380}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 90 of file sddscorrelate.c.

90 {
91 int iArg;
92 char **column, **excludeColumn, *withOnly;
93 long columns, excludeColumns;
94 char *input, *output;
95 SCANNED_ARG *scanned;
96 SDDS_DATASET SDDSin, SDDSout;
97 long i, j, row, count, readCode, rankOrder, iName1, iName2;
98 int64_t rows;
99 int32_t outlierStDevPasses;
100 double **data, correlation, significance, outlierStDevLimit;
101 double **rank;
102 short **accept;
103 char s[SDDS_MAXLINE];
104 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
105 short columnMajorOrder = -1;
106
108 argc = scanargs(&scanned, argc, argv);
109 if (argc < 2)
110 bomb(NULL, USAGE);
111
112 output = input = withOnly = NULL;
113 columns = excludeColumns = 0;
114 column = excludeColumn = NULL;
115 pipeFlags = 0;
116 rankOrder = 0;
117 outlierStDevPasses = 0;
118 outlierStDevLimit = 1.0;
119 rank = NULL;
120 accept = NULL;
121
122 for (iArg = 1; iArg < argc; iArg++) {
123 if (scanned[iArg].arg_type == OPTION) {
124 /* process options here */
125 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
126 case SET_MAJOR_ORDER:
127 majorOrderFlag = 0;
128 scanned[iArg].n_items--;
129 if (scanned[iArg].n_items > 0 &&
130 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
131 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
132 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
133 SDDS_Bomb("invalid -majorOrder syntax/values");
134 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
135 columnMajorOrder = 1;
136 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
137 columnMajorOrder = 0;
138 break;
139 case SET_COLUMNS:
140 if (columns)
141 SDDS_Bomb("only one -columns option may be given");
142 if (scanned[iArg].n_items < 2)
143 SDDS_Bomb("invalid -columns syntax");
144 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
145 for (i = 0; i < columns; i++)
146 column[i] = scanned[iArg].list[i + 1];
147 break;
148 case SET_EXCLUDE:
149 if (scanned[iArg].n_items < 2)
150 SDDS_Bomb("invalid -excludeColumns syntax");
151 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
152 break;
153 case SET_WITHONLY:
154 if (withOnly)
155 SDDS_Bomb("only one -withOnly option may be given");
156 if (scanned[iArg].n_items < 2)
157 SDDS_Bomb("invalid -withOnly syntax");
158 withOnly = scanned[iArg].list[1];
159 break;
160 case SET_PIPE:
161 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
162 SDDS_Bomb("invalid -pipe syntax");
163 break;
164 case SET_RANKORDER:
165 rankOrder = 1;
166 break;
167 case SET_STDEVOUTLIER:
168 scanned[iArg].n_items--;
169 outlierStDevPasses = 1;
170 outlierStDevLimit = 1.0;
171 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
172 "limit", SDDS_DOUBLE, &outlierStDevLimit, 1, 0,
173 "passes", SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
174 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
175 SDDS_Bomb("invalid -stdevOutlier syntax/values");
176 break;
177 default:
178 fprintf(stderr, "Error: unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
179 exit(EXIT_FAILURE);
180 break;
181 }
182 } else {
183 if (!input)
184 input = scanned[iArg].list[0];
185 else if (!output)
186 output = scanned[iArg].list[0];
187 else
188 SDDS_Bomb("too many filenames seen");
189 }
190 }
191
192 processFilenames("sddscorrelate", &input, &output, pipeFlags, 0, NULL);
193
194 if (!SDDS_InitializeInput(&SDDSin, input))
195 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
196
197 if (!columns)
198 columns = appendToStringArray(&column, columns, "*");
199 if (withOnly)
200 columns = appendToStringArray(&column, columns, withOnly);
201
202 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
203 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
204 SDDS_Bomb("no columns selected for correlation analysis");
205 }
206
207 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 0, NULL, "sddscorrelate output", output) ||
208 SDDS_DefineColumn(&SDDSout, "Correlate1Name", NULL, NULL, "Name of correlated quantity 1", NULL, SDDS_STRING, 0) < 0 ||
209 SDDS_DefineColumn(&SDDSout, "Correlate2Name", NULL, NULL, "Name of correlated quantity 2", NULL, SDDS_STRING, 0) < 0 ||
210 SDDS_DefineColumn(&SDDSout, "CorrelatePair", NULL, NULL, "Names of correlated quantities", NULL, SDDS_STRING, 0) < 0 ||
211 SDDS_DefineColumn(&SDDSout, "CorrelationCoefficient", "r", NULL, "Linear correlation coefficient", NULL, SDDS_DOUBLE, 0) < 0 ||
212 SDDS_DefineColumn(&SDDSout, "CorrelationSignificance", "P$br$n", NULL, "Linear correlation coefficient significance", NULL, SDDS_DOUBLE, 0) < 0 ||
213 SDDS_DefineColumn(&SDDSout, "CorrelationPoints", NULL, NULL, "Number of points used for correlation", NULL, SDDS_LONG, 0) < 0 ||
214 SDDS_DefineParameter(&SDDSout, "CorrelatedRows", NULL, NULL, "Number of data rows in correlation analysis", NULL, SDDS_LONG, NULL) < 0 ||
215 SDDS_DefineParameter(&SDDSout, "sddscorrelateInputFile", NULL, NULL, "Data file processed by sddscorrelate", NULL, SDDS_STRING, input ? input : "stdin") < 0 ||
216 SDDS_DefineParameter(&SDDSout, "sddscorrelateMode", NULL, NULL, NULL, NULL, SDDS_STRING, rankOrder ? "Rank-Order (Spearman)" : "Linear (Pearson)") < 0 ||
217 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierPasses", NULL, NULL, "Number of passes of standard-deviation outlier elimination applied", NULL, SDDS_LONG, &outlierStDevPasses) < 0 ||
218 SDDS_DefineParameter1(&SDDSout, "sddscorrelateStDevOutlierLimit", NULL, NULL, "Standard-deviation outlier limit applied", NULL, SDDS_DOUBLE, &outlierStDevLimit) < 0) {
219 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
220 }
221
222 if (columnMajorOrder != -1)
223 SDDSout.layout.data_mode.column_major = columnMajorOrder;
224 else
225 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
226
227 if (!SDDS_WriteLayout(&SDDSout))
228 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
229
230 data = malloc(sizeof(*data) * columns);
231 if (!data ||
232 (rankOrder && !(rank = malloc(sizeof(*rank) * columns))) ||
233 !(accept = malloc(sizeof(*accept) * columns))) {
234 SDDS_Bomb("allocation failure");
235 }
236
237 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
238 if ((rows = SDDS_CountRowsOfInterest(&SDDSin)) < 3)
239 continue;
240 if (!SDDS_StartPage(&SDDSout, columns * (columns - 1) / 2) ||
241 !SDDS_SetParameters(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, "CorrelatedRows", rows, NULL)) {
242 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
243 }
244 for (i = 0; i < columns; i++) {
245 data[i] = SDDS_GetColumnInDoubles(&SDDSin, column[i]);
246 if (!data[i])
247 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
248 if (rankOrder)
249 rank[i] = findRank(data[i], rows);
250 if (outlierStDevPasses) {
251 accept[i] = malloc(sizeof(**accept) * rows);
252 if (!accept[i])
253 SDDS_Bomb("allocation failure");
254 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
255 } else {
256 accept[i] = NULL;
257 }
258 }
259 for (i = row = 0; i < columns; i++) {
260 for (j = i + 1; j < columns; j++) {
261 iName1 = i;
262 iName2 = j;
263 if (withOnly) {
264 if (strcmp(withOnly, column[i]) == 0) {
265 iName1 = j;
266 iName2 = i;
267 } else if (strcmp(withOnly, column[j]) == 0) {
268 iName1 = i;
269 iName2 = j;
270 } else {
271 continue;
272 }
273 }
274 correlation = linearCorrelationCoefficient(rankOrder ? rank[i] : data[i],
275 rankOrder ? rank[j] : data[j],
276 accept[i], accept[j], rows, &count);
277 significance = linearCorrelationSignificance(correlation, count);
278 snprintf(s, sizeof(s), "%s.%s", column[iName1], column[iName2]);
279 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, row++,
280 0, column[iName1],
281 1, column[iName2],
282 2, s,
283 3, correlation,
284 4, significance,
285 5, count,
286 -1)) {
287 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
288 }
289 }
290 }
291 for (i = 0; i < columns; i++) {
292 free(data[i]);
293 if (rankOrder)
294 free(rank[i]);
295 if (accept[i])
296 free(accept[i]);
297 }
298 if (!SDDS_WritePage(&SDDSout))
299 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
300 }
301
302 free(data);
303 if (rankOrder)
304 free(rank);
305 free(accept);
306
307 if (!SDDS_Terminate(&SDDSin)) {
308 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
309 exit(EXIT_FAILURE);
310 }
311 if (!SDDS_Terminate(&SDDSout)) {
312 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
313 exit(EXIT_FAILURE);
314 }
315
316 return EXIT_SUCCESS;
317}
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_DefineParameter1(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, void *fixed_value)
Defines a data parameter with a fixed numerical value.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double linearCorrelationSignificance(double r, long rows)
Compute the statistical significance of a linear correlation coefficient.
Definition lincorr.c:79
double linearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count)
Compute the linear correlation coefficient for two data sets.
Definition lincorr.c:32
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.

◆ markStDevOutliers()

void markStDevOutliers ( double * data,
double limit,
long passes,
short * keep,
int64_t n )

Definition at line 319 of file sddscorrelate.c.

319 {
320 double sum1, sum2, variance, mean, absLimit;
321 long pass;
322 int64_t i, summed, kept;
323
324 for (i = 0; i < n; i++)
325 keep[i] = 1;
326 kept = n;
327 for (pass = 0; pass < passes && kept; pass++) {
328 sum1 = 0.0;
329 summed = 0;
330 for (i = 0; i < n; i++) {
331 if (keep[i]) {
332 sum1 += data[i];
333 summed += 1;
334 }
335 }
336 if (summed < 2)
337 return;
338 mean = sum1 / summed;
339 sum2 = 0.0;
340 for (i = 0; i < n; i++) {
341 if (keep[i])
342 sum2 += sqr(data[i] - mean);
343 }
344 variance = sum2 / summed;
345 if (variance > 0.0) {
346 absLimit = limit * sqrt(variance);
347 for (i = 0; i < n; i++) {
348 if (keep[i] && fabs(data[i] - mean) > absLimit) {
349 keep[i] = 0;
350 kept--;
351 }
352 }
353 }
354 }
355}

◆ replaceWithRank()

void replaceWithRank ( double * data,
int64_t n )

Definition at line 382 of file sddscorrelate.c.

382 {
383 static DATAnINDEX *indexedData = NULL;
384 int64_t i, j, iStart, iEnd;
385
386 indexedData = SDDS_Realloc(indexedData, sizeof(*indexedData) * n);
387 for (i = 0; i < n; i++) {
388 indexedData[i].data = data[i];
389 indexedData[i].originalIndex = i;
390 }
391 qsort(indexedData, n, sizeof(*indexedData), compareData);
392 for (i = 0; i < n; i++)
393 data[indexedData[i].originalIndex] = (double)i;
394 for (i = 0; i < n - 1; i++) {
395 if (data[i] == data[i + 1]) {
396 iStart = i;
397 for (j = i + 2; j < n; j++) {
398 if (data[j] != data[i])
399 break;
400 }
401 iEnd = j - 1;
402 double averageRank = (iStart + iEnd) / 2.0;
403 for (j = iStart; j <= iEnd; j++)
404 data[indexedData[j].originalIndex] = averageRank;
405 i = iEnd;
406 }
407 }
408}
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677

Variable Documentation

◆ option

char* option[N_OPTIONS]
Initial value:
= {
"columns",
"excludecolumns",
"withonly",
"pipe",
"rankorder",
"stdevoutlier",
"majorOrder",
}

Definition at line 60 of file sddscorrelate.c.

60 {
61 "columns",
62 "excludecolumns",
63 "withonly",
64 "pipe",
65 "rankorder",
66 "stdevoutlier",
67 "majorOrder",
68};