60char *option[N_OPTIONS] = {
70#define USAGE "Usage: sddscorrelate [OPTIONS] [<inputfile>] [<outputfile>]\n\
72Compute and evaluate correlations among columns of data.\n\
75 -pipe=[input][,output] Use standard input/output as input and/or output.\n\
76 -columns=<list-of-names> Specify columns to include in correlation analysis.\n\
77 -excludeColumns=<list-of-names> Specify columns to exclude from correlation analysis.\n\
78 -withOnly=<name> Correlate only with the specified column.\n\
79 -rankOrder Use rank-order (Spearman) correlation instead of linear (Pearson).\n\
80 -stDevOutlier[=limit=<factor>][,passes=<integer>]\n\
81 Remove outliers based on standard deviation.\n\
82 -majorOrder=row|column Set data ordering to row-major or column-major.\n\
84Program by Michael Borland. ("__DATE__ " "__TIME__ ", SVN revision: " SVN_VERSION ")\n"
86void replaceWithRank(
double *data, int64_t n);
87double *findRank(
double *data, int64_t n);
88void markStDevOutliers(
double *data,
double limit,
long passes,
short *keep, int64_t n);
90int main(
int argc,
char **argv) {
92 char **column, **excludeColumn, *withOnly;
93 long columns, excludeColumns;
97 long i, j, row, count, readCode, rankOrder, iName1, iName2;
99 int32_t outlierStDevPasses;
100 double **data, correlation, significance, outlierStDevLimit;
103 char s[SDDS_MAXLINE];
104 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
105 short columnMajorOrder = -1;
108 argc =
scanargs(&scanned, argc, argv);
112 output = input = withOnly = NULL;
113 columns = excludeColumns = 0;
114 column = excludeColumn = NULL;
117 outlierStDevPasses = 0;
118 outlierStDevLimit = 1.0;
122 for (iArg = 1; iArg < argc; iArg++) {
123 if (scanned[iArg].arg_type == OPTION) {
125 switch (
match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
126 case SET_MAJOR_ORDER:
128 scanned[iArg].n_items--;
129 if (scanned[iArg].n_items > 0 &&
130 (!
scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
131 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
132 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
133 SDDS_Bomb(
"invalid -majorOrder syntax/values");
134 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
135 columnMajorOrder = 1;
136 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
137 columnMajorOrder = 0;
141 SDDS_Bomb(
"only one -columns option may be given");
142 if (scanned[iArg].n_items < 2)
144 column =
tmalloc(
sizeof(*column) * (columns = scanned[iArg].n_items - 1));
145 for (i = 0; i < columns; i++)
146 column[i] = scanned[iArg].list[i + 1];
149 if (scanned[iArg].n_items < 2)
150 SDDS_Bomb(
"invalid -excludeColumns syntax");
151 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
155 SDDS_Bomb(
"only one -withOnly option may be given");
156 if (scanned[iArg].n_items < 2)
158 withOnly = scanned[iArg].list[1];
161 if (!
processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
167 case SET_STDEVOUTLIER:
168 scanned[iArg].n_items--;
169 outlierStDevPasses = 1;
170 outlierStDevLimit = 1.0;
171 if (!
scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
173 "passes",
SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
174 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
175 SDDS_Bomb(
"invalid -stdevOutlier syntax/values");
178 fprintf(stderr,
"Error: unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
184 input = scanned[iArg].list[0];
186 output = scanned[iArg].list[0];
198 columns = appendToStringArray(&column, columns,
"*");
200 columns = appendToStringArray(&column, columns, withOnly);
202 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
204 SDDS_Bomb(
"no columns selected for correlation analysis");
212 SDDS_DefineColumn(&SDDSout,
"CorrelationSignificance",
"P$br$n", NULL,
"Linear correlation coefficient significance", NULL,
SDDS_DOUBLE, 0) < 0 ||
213 SDDS_DefineColumn(&SDDSout,
"CorrelationPoints", NULL, NULL,
"Number of points used for correlation", NULL,
SDDS_LONG, 0) < 0 ||
215 SDDS_DefineParameter(&SDDSout,
"sddscorrelateInputFile", NULL, NULL,
"Data file processed by sddscorrelate", NULL,
SDDS_STRING, input ? input :
"stdin") < 0 ||
216 SDDS_DefineParameter(&SDDSout,
"sddscorrelateMode", NULL, NULL, NULL, NULL,
SDDS_STRING, rankOrder ?
"Rank-Order (Spearman)" :
"Linear (Pearson)") < 0 ||
217 SDDS_DefineParameter1(&SDDSout,
"sddscorrelateStDevOutlierPasses", NULL, NULL,
"Number of passes of standard-deviation outlier elimination applied", NULL,
SDDS_LONG, &outlierStDevPasses) < 0 ||
218 SDDS_DefineParameter1(&SDDSout,
"sddscorrelateStDevOutlierLimit", NULL, NULL,
"Standard-deviation outlier limit applied", NULL,
SDDS_DOUBLE, &outlierStDevLimit) < 0) {
222 if (columnMajorOrder != -1)
223 SDDSout.layout.data_mode.column_major = columnMajorOrder;
225 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
230 data = malloc(
sizeof(*data) * columns);
232 (rankOrder && !(rank = malloc(
sizeof(*rank) * columns))) ||
233 !(accept = malloc(
sizeof(*accept) * columns))) {
241 !
SDDS_SetParameters(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE,
"CorrelatedRows", rows, NULL)) {
244 for (i = 0; i < columns; i++) {
249 rank[i] = findRank(data[i], rows);
250 if (outlierStDevPasses) {
251 accept[i] = malloc(
sizeof(**accept) * rows);
254 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
259 for (i = row = 0; i < columns; i++) {
260 for (j = i + 1; j < columns; j++) {
264 if (strcmp(withOnly, column[i]) == 0) {
267 }
else if (strcmp(withOnly, column[j]) == 0) {
275 rankOrder ? rank[j] : data[j],
276 accept[i], accept[j], rows, &count);
278 snprintf(s,
sizeof(s),
"%s.%s", column[iName1], column[iName2]);
291 for (i = 0; i < columns; i++) {
319void markStDevOutliers(
double *data,
double limit,
long passes,
short *keep, int64_t n) {
320 double sum1, sum2, variance, mean, absLimit;
322 int64_t i, summed, kept;
324 for (i = 0; i < n; i++)
327 for (pass = 0; pass < passes && kept; pass++) {
330 for (i = 0; i < n; i++) {
338 mean = sum1 / summed;
340 for (i = 0; i < n; i++) {
342 sum2 += sqr(data[i] - mean);
344 variance = sum2 / summed;
345 if (variance > 0.0) {
346 absLimit = limit * sqrt(variance);
347 for (i = 0; i < n; i++) {
348 if (keep[i] && fabs(data[i] - mean) > absLimit) {
362int compareData(
const void *d1,
const void *d2) {
372double *findRank(
double *data, int64_t n) {
373 double *rank = malloc(
sizeof(*rank) * n);
376 for (int64_t i = 0; i < n; i++)
378 replaceWithRank(rank, n);
382void replaceWithRank(
double *data, int64_t n) {
384 int64_t i, j, iStart, iEnd;
386 indexedData =
SDDS_Realloc(indexedData,
sizeof(*indexedData) * n);
387 for (i = 0; i < n; i++) {
388 indexedData[i].data = data[i];
389 indexedData[i].originalIndex = i;
391 qsort(indexedData, n,
sizeof(*indexedData), compareData);
392 for (i = 0; i < n; i++)
393 data[indexedData[i].originalIndex] = (
double)i;
394 for (i = 0; i < n - 1; i++) {
395 if (data[i] == data[i + 1]) {
397 for (j = i + 2; j < n; j++) {
398 if (data[j] != data[i])
402 double averageRank = (iStart + iEnd) / 2.0;
403 for (j = iStart; j <= iEnd; j++)
404 data[indexedData[j].originalIndex] = averageRank;
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_DefineParameter1(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, void *fixed_value)
Defines a data parameter with a fixed numerical value.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_STRING
Identifier for the string data type.
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
#define SDDS_DOUBLE
Identifier for the double data type.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double linearCorrelationSignificance(double r, long rows)
Compute the statistical significance of a linear correlation coefficient.
double linearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count)
Compute the linear correlation coefficient for two data sets.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.