SDDSlib
Loading...
Searching...
No Matches
sddsshiftcor.c File Reference

A program to perform correlation analysis on shifted data columns in SDDS files. More...

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"
#include "SDDSutils.h"
#include <ctype.h>

Go to the source code of this file.

Classes

struct  DATAnINDEX
 

Macros

#define USAGE
 

Enumerations

enum  option_type {
  SET_COLUMNS , SET_EXCLUDE , SET_WITH , SET_PIPE ,
  SET_RANKORDER , SET_STDEVOUTLIER , SET_SCAN , SET_VERBOSE ,
  SET_MAJOR_ORDER , N_OPTIONS
}
 

Functions

void replaceWithRank (double *data, long n)
 
double * findRank (double *data, int64_t n)
 
void markStDevOutliers (double *data, double limit, long passes, short *keep, int64_t n)
 
void setupOutputFile (SDDS_DATASET *SDDSout, char *output, char **column, long columns, short columnMajorOrder)
 
int main (int argc, char **argv)
 
int compareData (const void *d1, const void *d2)
 

Variables

char * option [N_OPTIONS]
 

Detailed Description

A program to perform correlation analysis on shifted data columns in SDDS files.

This program reads data from an SDDS file and calculates the correlation coefficient between a specified column and all other selected numeric columns with a shifting mechanism. It outputs the results in a new SDDS file, including the computed correlation values.

Features:

  • Supports input and output SDDS files, including pipe input/output.
  • Calculates correlation coefficients with data shifts.
  • Allows filtering of columns via inclusion or exclusion options.
  • Identifies and handles standard deviation outliers.
  • Supports rank-order correlation analysis.
  • Provides options for verbose output and major order selection (row/column).

Options:

  • -columns=<list-of-names>: Specifies the columns to process.
  • -excludeColumns=<list-of-names>: Excludes specific columns from processing.
  • -with=<name>: Specifies the column to correlate with.
  • -scan=start=<startShift>,end=<endShift>,delta=<deltaShift>: Configures the range and step of shifts.
  • -stDevOutlier[=limit=<factor>][,passes=<integer>]: Marks and excludes outliers based on standard deviation.
  • -rankOrder: Uses rank-order correlation instead of linear correlation.
  • -majorOrder=row|column: Sets the major data order for output.
  • -verbose: Enables verbose output for detailed progress information.
  • -pipe=[input][,output]: Enables piping for input/output files.

Usage:

sddsshiftcor [-pipe=[input][,output]] [<inputfile>] [<outputfile>] -with=<name>
[-scan=start=<startShift>,end=<endShift>,delta=<deltaShift>]
[-columns=<list-of-names>] [-excludeColumns=<list-of-names>]
[-rankOrder] [-stDevOutlier[=limit=<factor>][,passes=<integer>]] [-verbose] [-majorOrder=row|column]

Example:

sddsshiftcor input.sdds output.sdds -with=ColumnA -scan=start=0,end=5,delta=1 -columns=ColumnB,ColumnC
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
M. Borland, R. Soliday, H. Shang

Definition in file sddsshiftcor.c.

Macro Definition Documentation

◆ USAGE

#define USAGE
Value:
"sddsshiftcor [-pipe=[input][,output]] [<inputfile>] [<outputfile>] -with=<name>\n" \
" [-scan=start=<startShift>,end=<endShift>,delta=<deltaShift>]\n" \
" [-columns=<list-of-names>] [-excludeColumns=<list-of-names>]\n" \
" [-rankOrder] [-stDevOutlier[=limit=<factor>][,passes=<integer>]]\n" \
" [-verbose] [-majorOrder=row|column]\n\n" \
"Program by Michael Borland. (\"" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"

Definition at line 84 of file sddsshiftcor.c.

84#define USAGE \
85 "sddsshiftcor [-pipe=[input][,output]] [<inputfile>] [<outputfile>] -with=<name>\n" \
86 " [-scan=start=<startShift>,end=<endShift>,delta=<deltaShift>]\n" \
87 " [-columns=<list-of-names>] [-excludeColumns=<list-of-names>]\n" \
88 " [-rankOrder] [-stDevOutlier[=limit=<factor>][,passes=<integer>]]\n" \
89 " [-verbose] [-majorOrder=row|column]\n\n" \
90 "Program by Michael Borland. (\"" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"

Enumeration Type Documentation

◆ option_type

enum option_type

Definition at line 59 of file sddsshiftcor.c.

59 {
60 SET_COLUMNS,
61 SET_EXCLUDE,
62 SET_WITH,
63 SET_PIPE,
64 SET_RANKORDER,
65 SET_STDEVOUTLIER,
66 SET_SCAN,
67 SET_VERBOSE,
68 SET_MAJOR_ORDER,
69 N_OPTIONS
70};

Function Documentation

◆ compareData()

int compareData ( const void * d1,
const void * d2 )

Definition at line 360 of file sddsshiftcor.c.

360 {
361 double diff;
362 diff = ((DATAnINDEX *)d1)->data - ((DATAnINDEX *)d2)->data;
363 return (diff == 0.0) ? 0 : (diff < 0.0 ? -1 : 1);
364}

◆ findRank()

double * findRank ( double * data,
int64_t n )

Definition at line 366 of file sddsshiftcor.c.

366 {
367 int64_t i;
368 double *rank;
369
370 rank = (double *)malloc(sizeof(*rank) * n);
371 if (!rank)
372 return NULL;
373 for (i = 0; i < n; i++)
374 rank[i] = data[i];
375 replaceWithRank(rank, n);
376 return rank;
377}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 97 of file sddsshiftcor.c.

97 {
98 int iArg;
99 char **column, **excludeColumn, *withOnly;
100 long columns, excludeColumns;
101 char *input, *output, buffer[16384];
102 SCANNED_ARG *scanned;
103 SDDS_DATASET SDDSin, SDDSout;
104 long i, count, readCode, rankOrder;
105 int64_t rows;
106 int32_t outlierStDevPasses;
107 double **data, correlation, outlierStDevLimit;
108 double **rank;
109 short **accept;
110 unsigned long pipeFlags, dummyFlags, majorOrderFlag;
111 int32_t startShift, endShift, deltaShift;
112 long outputRows, iWith, outputRow, shiftAmount, verbose;
113 short columnMajorOrder = -1;
114
116 argc = scanargs(&scanned, argc, argv);
117 if (argc < 2)
118 bomb(NULL, USAGE);
119
120 output = input = withOnly = NULL;
121 columns = excludeColumns = 0;
122 column = excludeColumn = NULL;
123 pipeFlags = 0;
124 rankOrder = 0;
125 outlierStDevPasses = 0;
126 outlierStDevLimit = 1.0;
127 rank = NULL;
128 accept = NULL;
129 startShift = -(endShift = 10);
130 deltaShift = 1;
131 verbose = 0;
132
133 for (iArg = 1; iArg < argc; iArg++) {
134 if (scanned[iArg].arg_type == OPTION) {
135 /* process options here */
136 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
137 case SET_MAJOR_ORDER:
138 majorOrderFlag = 0;
139 scanned[iArg].n_items--;
140 if (scanned[iArg].n_items > 0 &&
141 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
142 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
143 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
144 SDDS_Bomb("invalid -majorOrder syntax/values");
145 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
146 columnMajorOrder = 1;
147 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
148 columnMajorOrder = 0;
149 break;
150 case SET_COLUMNS:
151 if (columns)
152 SDDS_Bomb("only one -columns option may be given");
153 if (scanned[iArg].n_items < 2)
154 SDDS_Bomb("invalid -columns syntax");
155 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
156 for (i = 0; i < columns; i++)
157 column[i] = scanned[iArg].list[i + 1];
158 break;
159 case SET_EXCLUDE:
160 if (scanned[iArg].n_items < 2)
161 SDDS_Bomb("invalid -excludeColumns syntax");
162 moveToStringArray(&excludeColumn, &excludeColumns, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
163 break;
164 case SET_WITH:
165 if (withOnly)
166 SDDS_Bomb("only one -withOnly option may be given");
167 if (scanned[iArg].n_items < 2)
168 SDDS_Bomb("invalid -withOnly syntax");
169 withOnly = scanned[iArg].list[1];
170 break;
171 case SET_PIPE:
172 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
173 SDDS_Bomb("invalid -pipe syntax");
174 break;
175 case SET_RANKORDER:
176 rankOrder = 1;
177 break;
178 case SET_STDEVOUTLIER:
179 scanned[iArg].n_items--;
180 outlierStDevPasses = 1;
181 outlierStDevLimit = 1.0;
182 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
183 "limit", SDDS_DOUBLE, &outlierStDevLimit, 1, 0,
184 "passes", SDDS_LONG, &outlierStDevPasses, 1, 0, NULL) ||
185 outlierStDevPasses <= 0 || outlierStDevLimit <= 0.0)
186 SDDS_Bomb("invalid -stdevOutlier syntax/values");
187 break;
188 case SET_SCAN:
189 scanned[iArg].n_items--;
190 if (!scanItemList(&dummyFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
191 "start", SDDS_LONG, &startShift, 1, 0,
192 "end", SDDS_LONG, &endShift, 1, 0,
193 "delta", SDDS_LONG, &deltaShift, 1, 0, NULL) ||
194 startShift >= endShift || deltaShift <= 0 || (endShift - startShift) < deltaShift)
195 SDDS_Bomb("invalid -scan syntax/values");
196 break;
197 case SET_VERBOSE:
198 verbose = 1;
199 break;
200 default:
201 fprintf(stderr, "error: unknown/ambiguous option: %s\n", scanned[iArg].list[0]);
202 exit(1);
203 break;
204 }
205 } else {
206 if (!input)
207 input = scanned[iArg].list[0];
208 else if (!output)
209 output = scanned[iArg].list[0];
210 else
211 SDDS_Bomb("too many filenames seen");
212 }
213 }
214
215 processFilenames("sddsshiftcor", &input, &output, pipeFlags, 0, NULL);
216
217 if (!SDDS_InitializeInput(&SDDSin, input))
218 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
219
220 if (!columns)
221 columns = appendToStringArray(&column, columns, "*");
222 if (withOnly)
223 columns = appendToStringArray(&column, columns, withOnly);
224
225 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
226 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
227 SDDS_Bomb("no columns selected for correlation analysis");
228 }
229 if (columnMajorOrder == -1)
230 columnMajorOrder = SDDSin.layout.data_mode.column_major;
231
232 setupOutputFile(&SDDSout, output, column, columns, columnMajorOrder);
233
234 if (!(data = (double **)malloc(sizeof(*data) * columns)) ||
235 (rankOrder && !(rank = (double **)malloc(sizeof(*rank) * columns))) ||
236 !(accept = (short **)malloc(sizeof(*accept) * columns)))
237 SDDS_Bomb("allocation failure");
238 outputRows = (endShift - startShift) / deltaShift;
239 iWith = -1;
240 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
241 if ((rows = SDDS_CountRowsOfInterest(&SDDSin)) < (endShift - startShift))
242 continue;
243 if (!SDDS_StartPage(&SDDSout, outputRows + 1))
244 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
245 for (i = 0; i < columns; i++) {
246 if (strcmp(withOnly, column[i]) == 0)
247 iWith = i;
248 if (!(data[i] = SDDS_GetColumnInDoubles(&SDDSin, column[i])))
249 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
250 if (rankOrder)
251 rank[i] = findRank(data[i], rows);
252 if (outlierStDevPasses) {
253 if (!(accept[i] = (short *)malloc(sizeof(**accept) * rows)))
254 SDDS_Bomb("allocation failure");
255 markStDevOutliers(data[i], outlierStDevLimit, outlierStDevPasses, accept[i], rows);
256 } else
257 accept[i] = NULL;
258 }
259 if (iWith < 0)
260 SDDS_Bomb("-with column not found");
261 outputRow = 0;
262 for (shiftAmount = startShift; shiftAmount <= endShift; shiftAmount += deltaShift) {
263 if (verbose)
264 fprintf(stderr, "Working on shift of %ld\n", shiftAmount);
265 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, outputRow, "ShiftAmount", shiftAmount, NULL))
266 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
267 for (i = 0; i < columns; i++) {
269 rankOrder ? rank[i] : data[i],
270 rankOrder ? rank[iWith] : data[iWith],
271 accept[i],
272 accept[iWith],
273 rows,
274 &count,
275 shiftAmount);
276 sprintf(buffer, "%sShiftedCor", column[i]);
277 if (!SDDS_SetRowValues(&SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE, outputRow, buffer, correlation, NULL))
278 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
279 }
280 outputRow++;
281 }
282 for (i = 0; i < columns; i++) {
283 free(data[i]);
284 if (rankOrder)
285 free(rank[i]);
286 if (accept[i])
287 free(accept[i]);
288 }
289 if (!SDDS_WritePage(&SDDSout))
290 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
291 }
292 if (SDDS_Terminate(&SDDSin) != 1) {
293 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
294 return 1;
295 }
296 if (SDDS_Terminate(&SDDSout) != 1) {
297 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
298 return 1;
299 }
300
301 return 0;
302}
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double shiftedLinearCorrelationCoefficient(double *data1, double *data2, short *accept1, short *accept2, long rows, long *count, long shift)
Compute the linear correlation coefficient between two data sets with a given shift.
Definition lincorr.c:102
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.

◆ markStDevOutliers()

void markStDevOutliers ( double * data,
double limit,
long passes,
short * keep,
int64_t n )

Definition at line 321 of file sddsshiftcor.c.

321 {
322 double sum1, sum2, variance, mean, absLimit;
323 long pass;
324 int64_t i, summed, kept;
325
326 for (i = 0; i < n; i++)
327 keep[i] = 1;
328 kept = n;
329 for (pass = 0; pass < passes && kept; pass++) {
330 sum1 = sum2 = 0.0;
331 summed = 0;
332 for (i = 0; i < n; i++) {
333 if (keep[i]) {
334 summed++;
335 sum1 += data[i];
336 sum2 += data[i] * data[i];
337 }
338 }
339 if (summed < 2)
340 return;
341 mean = sum1 / summed;
342 sum2 /= summed;
343 if ((variance = sum2 - mean * mean) > 0.0) {
344 absLimit = limit * sqrt(variance);
345 for (i = 0; i < n; i++) {
346 if (keep[i] && fabs(data[i] - mean) > absLimit) {
347 keep[i] = 0;
348 kept--;
349 }
350 }
351 }
352 }
353}

◆ replaceWithRank()

void replaceWithRank ( double * data,
long n )

Definition at line 379 of file sddsshiftcor.c.

379 {
380 static DATAnINDEX *indexedData = NULL;
381 long i, j, iStart, iEnd;
382
383 indexedData = SDDS_Realloc(indexedData, sizeof(*indexedData) * n);
384 for (i = 0; i < n; i++) {
385 indexedData[i].data = data[i];
386 indexedData[i].originalIndex = i;
387 }
388 qsort((void *)indexedData, n, sizeof(*indexedData), compareData);
389 for (i = 0; i < n; i++)
390 data[indexedData[i].originalIndex] = (double)i;
391 for (i = 0; i < n - 1; i++) {
392 if (data[i] == data[i + 1]) {
393 iStart = i;
394 for (j = i + 2; j < n; j++) {
395 if (data[j - 1] != data[j])
396 break;
397 }
398 iEnd = j - 1;
399 double average = (iStart + iEnd) / 2.0;
400 for (j = iStart; j <= iEnd; j++)
401 data[indexedData[j].originalIndex] = average;
402 i = iEnd;
403 }
404 }
405}
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677

◆ setupOutputFile()

void setupOutputFile ( SDDS_DATASET * SDDSout,
char * output,
char ** column,
long columns,
short columnMajorOrder )

Definition at line 304 of file sddsshiftcor.c.

304 {
305 long i;
306 char buffer[16384];
307 if (!SDDS_InitializeOutput(SDDSout, SDDS_BINARY, 0, NULL, NULL, output) ||
308 !SDDS_DefineSimpleColumn(SDDSout, "ShiftAmount", NULL, SDDS_LONG))
309 SDDS_Bomb("unable to open output file");
310 SDDSout->layout.data_mode.column_major = columnMajorOrder;
311
312 for (i = 0; i < columns; i++) {
313 sprintf(buffer, "%sShiftedCor", column[i]);
314 if (!SDDS_DefineSimpleColumn(SDDSout, buffer, NULL, SDDS_DOUBLE))
315 SDDS_Bomb("unable to set up column definitions");
316 }
317 if (!SDDS_WriteLayout(SDDSout))
318 SDDS_Bomb("unable to set up output file");
319}
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_DefineSimpleColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *unit, int32_t type)
Defines a simple data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.

Variable Documentation

◆ option

char* option[N_OPTIONS]
Initial value:
= {
"columns",
"excludecolumns",
"with",
"pipe",
"rankorder",
"stdevoutlier",
"scan",
"verbose",
"majorOrder",
}

Definition at line 72 of file sddsshiftcor.c.

72 {
73 "columns",
74 "excludecolumns",
75 "with",
76 "pipe",
77 "rankorder",
78 "stdevoutlier",
79 "scan",
80 "verbose",
81 "majorOrder",
82};