80char *option[N_OPTIONS] = {
90#define USAGE "sddscorrelate [<inputfile>] [<outputfile>]\n\
91 [-pipe=[input][,output]]\n\
92 [-columns=<list-of-names>]\n\
93 [-excludeColumns=<list-of-names>]\n\
96 [-stDevOutlier[=limit=<factor>][,passes=<integer>]]\n\
97 [-majorOrder=row|column]\n\
99Compute and evaluate correlations among columns of data.\n\
102 -pipe=[input][,output] Use standard input/output as input and/or output.\n\
103 -columns=<list-of-names> Specify columns to include in correlation analysis.\n\
104 -excludeColumns=<list-of-names> Specify columns to exclude from correlation analysis.\n\
105 -withOnly=<name> Correlate only with the specified column.\n\
106 -rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).\n\
107 -stDevOutlier[=limit=<factor>][,passes=<integer>]\n\
108 Remove outliers based on standard deviation.\n\
109 -majorOrder=row|column Set data ordering to row-major or column-major.\n\
111Program by Michael Borland. ("__DATE__ " "__TIME__ ", SVN revision: " SVN_VERSION ")\n"
113void replaceWithRank(
double *data, int64_t n);
114double *findRank(
double *data, int64_t n);
115void markStDevOutliers(
double *data,
double limit,
long passes,
short *keep, int64_t n);
117int main(
int argc,
char **argv) {
119 char **column, **excludeColumn, *withOnly;
120 long columns, excludeColumns;
121 char *input, *output;
122 SCANNED_ARG *scanned;
124 long i, j, row, count, readCode, rankOrder, iName1, iName2;
126 int32_t outlierStDevPasses;
127 double **data, correlation, significance, outlierStDevLimit;
130 char s[SDDS_MAXLINE];
131 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
132 short columnMajorOrder = -1;
135 argc =
scanargs(&scanned, argc, argv);
139 output = input = withOnly = NULL;
140 columns = excludeColumns = 0;
141 column = excludeColumn = NULL;
144 outlierStDevPasses = 0;
145 outlierStDevLimit = 1.0;
149 for (iArg = 1; iArg < argc; iArg++) {
150 if (scanned[iArg].arg_type == OPTION) {
152 switch (
match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
153 case SET_MAJOR_ORDER:
155 scanned[iArg].n_items--;
156 if (scanned[iArg].n_items > 0 &&
157 (!
scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
158 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
159 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
160 SDDS_Bomb(
"invalid -majorOrder syntax/values");
161 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
162 columnMajorOrder = 1;
163 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
164 columnMajorOrder = 0;
168 SDDS_Bomb(
"only one -columns option may be given");
169 if (scanned[iArg].n_items < 2)
171 column =
tmalloc(
sizeof(*column) * (columns = scanned[iArg].n_items - 1));
172 for (i = 0; i < columns; i++)
173 column[i] = scanned[iArg].list[i + 1];
176 if (scanned[iArg].n_items < 2)
177 SDDS_Bomb(
"invalid -excludeColumns syntax");
178 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
182 SDDS_Bomb(
"only one -withOnly option may be given");
183 if (scanned[iArg].n_items < 2)
185 withOnly = scanned[iArg].list[1];
188 if (!
processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
194 case SET_STDEVOUTLIER:
195 scanned[iArg].n_items--;
196 outlierStDevPasses = 1;
197 outlierStDevLimit = 1.0;
198 if (!
scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
200 "passes",
SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
201 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
202 SDDS_Bomb(
"invalid -stdevOutlier syntax/values");
205 fprintf(stderr,
"Error: unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
211 input = scanned[iArg].list[0];
213 output = scanned[iArg].list[0];
225 columns = appendToStringArray(&column, columns,
"*");
227 columns = appendToStringArray(&column, columns, withOnly);
229 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
231 SDDS_Bomb(
"no columns selected for correlation analysis");
239 SDDS_DefineColumn(&SDDSout,
"CorrelationSignificance",
"P$br$n", NULL,
"Linear correlation coefficient significance", NULL,
SDDS_DOUBLE, 0) < 0 ||
240 SDDS_DefineColumn(&SDDSout,
"CorrelationPoints", NULL, NULL,
"Number of points used for correlation", NULL,
SDDS_LONG, 0) < 0 ||
242 SDDS_DefineParameter(&SDDSout,
"sddscorrelateInputFile", NULL, NULL,
"Data file processed by sddscorrelate", NULL,
SDDS_STRING, input ? input :
"stdin") < 0 ||
243 SDDS_DefineParameter(&SDDSout,
"sddscorrelateMode", NULL, NULL, NULL, NULL,
SDDS_STRING, rankOrder ?
"Rank-Order (Spearman)" :
"Linear (Pearson)") < 0 ||
244 SDDS_DefineParameter1(&SDDSout,
"sddscorrelateStDevOutlierPasses", NULL, NULL,
"Number of passes of standard-deviation outlier elimination applied", NULL,
SDDS_LONG, &outlierStDevPasses) < 0 ||
245 SDDS_DefineParameter1(&SDDSout,
"sddscorrelateStDevOutlierLimit", NULL, NULL,
"Standard-deviation outlier limit applied", NULL,
SDDS_DOUBLE, &outlierStDevLimit) < 0) {
249 if (columnMajorOrder != -1)
250 SDDSout.layout.data_mode.column_major = columnMajorOrder;
252 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
257 data = malloc(
sizeof(*data) * columns);
259 (rankOrder && !(rank = malloc(
sizeof(*rank) * columns))) ||
260 !(accept = malloc(
sizeof(*accept) * columns))) {
268 !
SDDS_SetParameters(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE,
"CorrelatedRows", rows, NULL)) {
271 for (i = 0; i < columns; i++) {
276 rank[i] = findRank(data[i], rows);
277 if (outlierStDevPasses) {
278 accept[i] = malloc(
sizeof(**accept) * rows);
281 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
286 for (i = row = 0; i < columns; i++) {
287 for (j = i + 1; j < columns; j++) {
291 if (strcmp(withOnly, column[i]) == 0) {
294 }
else if (strcmp(withOnly, column[j]) == 0) {
302 rankOrder ? rank[j] : data[j],
303 accept[i], accept[j], rows, &count);
305 snprintf(s,
sizeof(s),
"%s.%s", column[iName1], column[iName2]);
318 for (i = 0; i < columns; i++) {
346void markStDevOutliers(
double *data,
double limit,
long passes,
short *keep, int64_t n) {
347 double sum1, sum2, variance, mean, absLimit;
349 int64_t i, summed, kept;
351 for (i = 0; i < n; i++)
354 for (pass = 0; pass < passes && kept; pass++) {
357 for (i = 0; i < n; i++) {
365 mean = sum1 / summed;
367 for (i = 0; i < n; i++) {
369 sum2 += sqr(data[i] - mean);
371 variance = sum2 / summed;
372 if (variance > 0.0) {
373 absLimit = limit * sqrt(variance);
374 for (i = 0; i < n; i++) {
375 if (keep[i] && fabs(data[i] - mean) > absLimit) {
389int compareData(
const void *d1,
const void *d2) {
399double *findRank(
double *data, int64_t n) {
400 double *rank = malloc(
sizeof(*rank) * n);
403 for (int64_t i = 0; i < n; i++)
405 replaceWithRank(rank, n);
409void replaceWithRank(
double *data, int64_t n) {
411 int64_t i, j, iStart, iEnd;
413 indexedData =
SDDS_Realloc(indexedData,
sizeof(*indexedData) * n);
414 for (i = 0; i < n; i++) {
415 indexedData[i].data = data[i];
416 indexedData[i].originalIndex = i;
418 qsort(indexedData, n,
sizeof(*indexedData), compareData);
419 for (i = 0; i < n; i++)
420 data[indexedData[i].originalIndex] = (
double)i;
421 for (i = 0; i < n - 1; i++) {
422 if (data[i] == data[i + 1]) {
424 for (j = i + 2; j < n; j++) {
425 if (data[j] != data[i])
429 double averageRank = (iStart + iEnd) / 2.0;
430 for (j = iStart; j <= iEnd; j++)
431 data[indexedData[j].originalIndex] = averageRank;
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_DefineParameter1(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, void *fixed_value)
Defines a data parameter with a fixed numerical value.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_STRING
Identifier for the string data type.
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
#define SDDS_DOUBLE
Identifier for the double data type.
Utility functions for SDDS dataset manipulation and string array operations.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double linearCorrelationSignificance(double r, long rows)
Compute the statistical significance of a linear correlation coefficient.
double linearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count)
Compute the linear correlation coefficient for two data sets.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.