SDDSlib
Loading...
Searching...
No Matches
sddsshiftcor.c
Go to the documentation of this file.
1/**
2 * @file sddsshiftcor.c
3 * @brief A program to perform correlation analysis on shifted data columns in SDDS files.
4 *
5 * This program reads data from an SDDS file and calculates the correlation coefficient
6 * between a specified column and all other selected numeric columns with a shifting mechanism.
7 * It outputs the results in a new SDDS file, including the computed correlation values.
8 *
9 * ### Features:
10 * - Supports input and output SDDS files, including pipe input/output.
11 * - Calculates correlation coefficients with data shifts.
12 * - Allows filtering of columns via inclusion or exclusion options.
13 * - Identifies and handles standard deviation outliers.
14 * - Supports rank-order correlation analysis.
15 * - Provides options for verbose output and major order selection (row/column).
16 *
17 * ### Options:
18 * - `-columns=<list-of-names>`: Specifies the columns to process.
19 * - `-excludeColumns=<list-of-names>`: Excludes specific columns from processing.
20 * - `-with=<name>`: Specifies the column to correlate with.
21 * - `-scan=start=<startShift>,end=<endShift>,delta=<deltaShift>`: Configures the range and step of shifts.
22 * - `-stDevOutlier[=limit=<factor>][,passes=<integer>]`: Marks and excludes outliers based on standard deviation.
23 * - `-rankOrder`: Uses rank-order correlation instead of linear correlation.
24 * - `-majorOrder=row|column`: Sets the major data order for output.
25 * - `-verbose`: Enables verbose output for detailed progress information.
26 * - `-pipe=[input][,output]`: Enables piping for input/output files.
27 *
28 * ### Usage:
29 * ```
30 * sddsshiftcor [-pipe=[input][,output]] [<inputfile>] [<outputfile>] -with=<name>
31 * [-scan=start=<startShift>,end=<endShift>,delta=<deltaShift>]
32 * [-columns=<list-of-names>] [-excludeColumns=<list-of-names>]
33 * [-rankOrder] [-stDevOutlier[=limit=<factor>][,passes=<integer>]] [-verbose] [-majorOrder=row|column]
34 * ```
35 *
36 * ### Example:
37 * ```
38 * sddsshiftcor input.sdds output.sdds -with=ColumnA -scan=start=0,end=5,delta=1 -columns=ColumnB,ColumnC
39 * ```
40 *
41 * @copyright
42 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
43 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
44 *
45 * @license
46 * This file is distributed under the terms of the Software License Agreement
47 * found in the file LICENSE included with this distribution.
48 *
49 * @author M. Borland, R. Soliday, H. Shang
50 */
51
52#include "mdb.h"
53#include "SDDS.h"
54#include "scan.h"
55#include "SDDSutils.h"
56#include <ctype.h>
57
58/* Enumeration for option types */
59enum option_type {
60 SET_COLUMNS,
61 SET_EXCLUDE,
62 SET_WITH,
63 SET_PIPE,
64 SET_RANKORDER,
65 SET_STDEVOUTLIER,
66 SET_SCAN,
67 SET_VERBOSE,
68 SET_MAJOR_ORDER,
69 N_OPTIONS
70};
71
72char *option[N_OPTIONS] = {
73 "columns",
74 "excludecolumns",
75 "with",
76 "pipe",
77 "rankorder",
78 "stdevoutlier",
79 "scan",
80 "verbose",
81 "majorOrder",
82};
83
84#define USAGE \
85 "sddsshiftcor [-pipe=[input][,output]] [<inputfile>] [<outputfile>] -with=<name>\n" \
86 " [-scan=start=<startShift>,end=<endShift>,delta=<deltaShift>]\n" \
87 " [-columns=<list-of-names>] [-excludeColumns=<list-of-names>]\n" \
88 " [-rankOrder] [-stDevOutlier[=limit=<factor>][,passes=<integer>]]\n" \
89 " [-verbose] [-majorOrder=row|column]\n\n" \
90 "Program by Michael Borland. (\"" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"
91
92void replaceWithRank(double *data, long n);
93double *findRank(double *data, int64_t n);
94void markStDevOutliers(double *data, double limit, long passes, short *keep, int64_t n);
95void setupOutputFile(SDDS_DATASET *SDDSout, char *output, char **column, long columns, short columnMajorOrder);
96
97int main(int argc, char **argv) {
98 int iArg;
99 char **column, **excludeColumn, *withOnly;
100 long columns, excludeColumns;
101 char *input, *output, buffer[16384];
102 SCANNED_ARG *scanned;
103 SDDS_DATASET SDDSin, SDDSout;
104 long i, count, readCode, rankOrder;
105 int64_t rows;
106 int32_t outlierStDevPasses;
107 double **data, correlation, outlierStDevLimit;
108 double **rank;
109 short **accept;
110 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
111 int32_t startShift, endShift, deltaShift;
112 long outputRows, iWith, outputRow, shiftAmount, verbose;
113 short columnMajorOrder = -1;
114
116 argc = scanargs(&scanned, argc, argv);
117 if (argc < 2)
118 bomb(NULL, USAGE);
119
120 output = input = withOnly = NULL;
121 columns = excludeColumns = 0;
122 column = excludeColumn = NULL;
123 pipeFlags = 0;
124 rankOrder = 0;
125 outlierStDevPasses = 0;
126 outlierStDevLimit = 1.0;
127 rank = NULL;
128 accept = NULL;
129 startShift = -(endShift = 10);
130 deltaShift = 1;
131 verbose = 0;
132
133 for (iArg = 1; iArg < argc; iArg++) {
134 if (scanned[iArg].arg_type == OPTION) {
135 /* process options here */
136 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
137 case SET_MAJOR_ORDER:
138 majorOrderFlag = 0;
139 scanned[iArg].n_items--;
140 if (scanned[iArg].n_items > 0 &&
141 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
142 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
143 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
144 SDDS_Bomb("invalid -majorOrder syntax/values");
145 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
146 columnMajorOrder = 1;
147 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
148 columnMajorOrder = 0;
149 break;
150 case SET_COLUMNS:
151 if (columns)
152 SDDS_Bomb("only one -columns option may be given");
153 if (scanned[iArg].n_items < 2)
154 SDDS_Bomb("invalid -columns syntax");
155 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
156 for (i = 0; i < columns; i++)
157 column[i] = scanned[iArg].list[i + 1];
158 break;
159 case SET_EXCLUDE:
160 if (scanned[iArg].n_items < 2)
161 SDDS_Bomb("invalid -excludeColumns syntax");
162 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
163 break;
164 case SET_WITH:
165 if (withOnly)
166 SDDS_Bomb("only one -withOnly option may be given");
167 if (scanned[iArg].n_items < 2)
168 SDDS_Bomb("invalid -withOnly syntax");
169 withOnly = scanned[iArg].list[1];
170 break;
171 case SET_PIPE:
172 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
173 SDDS_Bomb("invalid -pipe syntax");
174 break;
175 case SET_RANKORDER:
176 rankOrder = 1;
177 break;
178 case SET_STDEVOUTLIER:
179 scanned[iArg].n_items--;
180 outlierStDevPasses = 1;
181 outlierStDevLimit = 1.0;
182 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
183 "limit", SDDS_DOUBLE, &outlierStDevLimit, 1, 0,
184 "passes", SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
185 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
186 SDDS_Bomb("invalid -stdevOutlier syntax/values");
187 break;
188 case SET_SCAN:
189 scanned[iArg].n_items--;
190 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
191 "start", SDDS_LONG, &startShift, 1, 0,
192 "end", SDDS_LONG, &endShift, 1, 0,
193 "delta", SDDS_LONG, &deltaShift, 1, 0, NULL) ||
194 startShift >= endShift || deltaShift <= 0 || (endShift - startShift) < deltaShift)
195 SDDS_Bomb("invalid -scan syntax/values");
196 break;
197 case SET_VERBOSE:
198 verbose = 1;
199 break;
200 default:
201 fprintf(stderr, "error: unknown/ambiguous option: %s\n", scanned[iArg].list[0]);
202 exit(1);
203 break;
204 }
205 } else {
206 if (!input)
207 input = scanned[iArg].list[0];
208 else if (!output)
209 output = scanned[iArg].list[0];
210 else
211 SDDS_Bomb("too many filenames seen");
212 }
213 }
214
215 processFilenames("sddsshiftcor", &input, &output, pipeFlags, 0, NULL);
216
217 if (!SDDS_InitializeInput(&SDDSin, input))
218 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
219
220 if (!columns)
221 columns = appendToStringArray(&column, columns, "*");
222 if (withOnly)
223 columns = appendToStringArray(&column, columns, withOnly);
224
225 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
226 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
227 SDDS_Bomb("no columns selected for correlation analysis");
228 }
229 if (columnMajorOrder == -1)
230 columnMajorOrder = SDDSin.layout.data_mode.column_major;
231
232 setupOutputFile(&SDDSout, output, column, columns, columnMajorOrder);
233
234 if (!(data = (double **)malloc(sizeof(*data) * columns)) ||
235 (rankOrder && !(rank = (double **)malloc(sizeof(*rank) * columns))) ||
236 !(accept = (short **)malloc(sizeof(*accept) * columns)))
237 SDDS_Bomb("allocation failure");
238 outputRows = (endShift - startShift) / deltaShift;
239 iWith = -1;
240 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
241 if ((rows = SDDS_CountRowsOfInterest(&SDDSin)) < (endShift - startShift))
242 continue;
243 if (!SDDS_StartPage(&SDDSout, outputRows + 1))
244 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
245 for (i = 0; i < columns; i++) {
246 if (strcmp(withOnly, column[i]) == 0)
247 iWith = i;
248 if (!(data[i] = SDDS_GetColumnInDoubles(&SDDSin, column[i])))
249 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
250 if (rankOrder)
251 rank[i] = findRank(data[i], rows);
252 if (outlierStDevPasses) {
253 if (!(accept[i] = (short *)malloc(sizeof(**accept) * rows)))
254 SDDS_Bomb("allocation failure");
255 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
256 } else
257 accept[i] = NULL;
258 }
259 if (iWith < 0)
260 SDDS_Bomb("-with column not found");
261 outputRow = 0;
262 for (shiftAmount = startShift; shiftAmount <= endShift; shiftAmount += deltaShift) {
263 if (verbose)
264 fprintf(stderr, "Working on shift of %ld\n", shiftAmount);
265 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, outputRow, "ShiftAmount", shiftAmount, NULL))
266 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
267 for (i = 0; i < columns; i++) {
269 rankOrder ? rank[i] : data[i],
270 rankOrder ? rank[iWith] : data[iWith],
271 accept[i],
272 accept[iWith],
273 rows,
274 &count,
275 shiftAmount);
276 sprintf(buffer, "%sShiftedCor", column[i]);
277 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, outputRow, buffer, correlation, NULL))
278 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
279 }
280 outputRow++;
281 }
282 for (i = 0; i < columns; i++) {
283 free(data[i]);
284 if (rankOrder)
285 free(rank[i]);
286 if (accept[i])
287 free(accept[i]);
288 }
289 if (!SDDS_WritePage(&SDDSout))
290 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
291 }
292 if (SDDS_Terminate(&SDDSin) != 1) {
293 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
294 return 1;
295 }
296 if (SDDS_Terminate(&SDDSout) != 1) {
297 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
298 return 1;
299 }
300
301 return 0;
302}
303
304void setupOutputFile(SDDS_DATASET *SDDSout, char *output, char **column, long columns, short columnMajorOrder) {
305 long i;
306 char buffer[16384];
307 if (!SDDS_InitializeOutput(SDDSout, SDDS_BINARY, 0, NULL, NULL, output) ||
308 !SDDS_DefineSimpleColumn(SDDSout, "ShiftAmount", NULL, SDDS_LONG))
309 SDDS_Bomb("unable to open output file");
310 SDDSout->layout.data_mode.column_major = columnMajorOrder;
311
312 for (i = 0; i < columns; i++) {
313 sprintf(buffer, "%sShiftedCor", column[i]);
314 if (!SDDS_DefineSimpleColumn(SDDSout, buffer, NULL, SDDS_DOUBLE))
315 SDDS_Bomb("unable to set up column definitions");
316 }
317 if (!SDDS_WriteLayout(SDDSout))
318 SDDS_Bomb("unable to set up output file");
319}
320
321void markStDevOutliers(double *data, double limit, long passes, short *keep, int64_t n) {
322 double sum1, sum2, variance, mean, absLimit;
323 long pass;
324 int64_t i, summed, kept;
325
326 for (i = 0; i < n; i++)
327 keep[i] = 1;
328 kept = n;
329 for (pass = 0; pass < passes && kept; pass++) {
330 sum1 = sum2 = 0.0;
331 summed = 0;
332 for (i = 0; i < n; i++) {
333 if (keep[i]) {
334 summed++;
335 sum1 += data[i];
336 sum2 += data[i] * data[i];
337 }
338 }
339 if (summed < 2)
340 return;
341 mean = sum1 / summed;
342 sum2 /= summed;
343 if ((variance = sum2 - mean * mean) > 0.0) {
344 absLimit = limit * sqrt(variance);
345 for (i = 0; i < n; i++) {
346 if (keep[i] && fabs(data[i] - mean) > absLimit) {
347 keep[i] = 0;
348 kept--;
349 }
350 }
351 }
352 }
353}
354
355typedef struct {
356 double data;
357 long originalIndex;
358} DATAnINDEX;
359
360int compareData(const void *d1, const void *d2) {
361 double diff;
362 diff = ((DATAnINDEX *)d1)->data - ((DATAnINDEX *)d2)->data;
363 return (diff == 0.0) ? 0 : (diff < 0.0 ? -1 : 1);
364}
365
366double *findRank(double *data, int64_t n) {
367 int64_t i;
368 double *rank;
369
370 rank = (double *)malloc(sizeof(*rank) * n);
371 if (!rank)
372 return NULL;
373 for (i = 0; i < n; i++)
374 rank[i] = data[i];
375 replaceWithRank(rank, n);
376 return rank;
377}
378
379void replaceWithRank(double *data, long n) {
380 static DATAnINDEX *indexedData = NULL;
381 long i, j, iStart, iEnd;
382
383 indexedData = SDDS_Realloc(indexedData, sizeof(*indexedData) * n);
384 for (i = 0; i < n; i++) {
385 indexedData[i].data = data[i];
386 indexedData[i].originalIndex = i;
387 }
388 qsort((void *)indexedData, n, sizeof(*indexedData), compareData);
389 for (i = 0; i < n; i++)
390 data[indexedData[i].originalIndex] = (double)i;
391 for (i = 0; i < n - 1; i++) {
392 if (data[i] == data[i + 1]) {
393 iStart = i;
394 for (j = i + 2; j < n; j++) {
395 if (data[j - 1] != data[j])
396 break;
397 }
398 iEnd = j - 1;
399 double average = (iStart + iEnd) / 2.0;
400 for (j = iStart; j <= iEnd; j++)
401 data[indexedData[j].originalIndex] = average;
402 i = iEnd;
403 }
404 }
405}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_DefineSimpleColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *unit, int32_t type)
Defines a simple data column within the SDDS dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double shiftedLinearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count, long shift)
Compute the linear correlation coefficient between two data sets with a given shift.
Definition lincorr.c:102
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.