SDDSlib
Loading...
Searching...
No Matches
sddsnormalize.c
Go to the documentation of this file.
/**
 * @file sddsnormalize.c
 * @brief A program for SDDS-format column normalization.
 *
 * This program normalizes the specified columns of an SDDS file according to the user-defined mode.
 * Each selected column is divided by a factor computed from its own data on every page,
 * such as its minimum, maximum, RMS, etc. A factor of zero leaves the column unchanged.
 *
 * ### Features:
 * - Normalize columns using various modes like RMS, standard deviation, sum, etc.
 * - Optional multi-threaded computation of the RMS and standard-deviation factors.
 * - Custom suffixes for normalized column names.
 * - Exclusion of specific columns using wildcard patterns.
 *
 * ### Usage:
 * `sddsnormalize [<inputfile>] [<outputfile>]`
 * - `-pipe=[input][,output]`
 * - `-columns=[mode=<mode>,][suffix=<string>,][exclude=<wildcardString>,][functionOf=<columnName>,]<columnName>[,...]`
 * - `-threads=<number>`
 * - `-majorOrder=row|column`
 *
 * ### Options:
 * - **mode**: Specifies the normalization mode (default: largest). Available options include:
 *   - minimum
 *   - maximum
 *   - largest
 *   - signedlargest
 *   - spread
 *   - rms
 *   - standarddeviation
 *   - sum
 *   - area
 *   - average
 * - **suffix**: Specifies a suffix for the normalized column. If omitted, the original column is replaced.
 * - **exclude**: Excludes columns matching a pattern from normalization.
 * - **functionOf**: Names the independent-variable column for the `area` mode (required by that mode).
 * - **threads**: Specifies the number of threads for normalization.
 * - **majorOrder**: Specifies the output data order (row or column).
 *
 * @copyright
 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
 *
 * @license
 * This file is distributed under the terms of the Software License Agreement
 * found in the file LICENSE included with this distribution.
 *
 * @author M. Borland, R. Soliday, H. Shang
 */
48
#include "mdb.h"
#include "SDDS.h"
#include "scan.h"
#include <ctype.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
53
/* Command-line options recognized by sddsnormalize.
 * Each enumerator is the index of its keyword in option[], which is what
 * match_string() returns in main(); N_OPTIONS is the number of keywords.
 */
enum option_type {
  CLO_COLUMNS,
  CLO_PIPE,
  CLO_MAJOR_ORDER,
  CLO_THREADS,
  N_OPTIONS
};

/* Option keywords, indexed by enum option_type.  Designated initializers keep
 * each keyword attached to its enumerator even if the enum is reordered.
 */
char *option[N_OPTIONS] = {
  [CLO_COLUMNS] = "columns",
  [CLO_PIPE] = "pipe",
  [CLO_MAJOR_ORDER] = "majorOrder",
  [CLO_THREADS] = "threads",
};
69
70static char *USAGE =
71 "Usage: sddsnormalize [<inputfile>] [<outputfile>] \n"
72 " [-pipe=[input][,output]] \n"
73 " -columns=[mode=<mode>,][suffix=<string>,][exclude=<wildcardString>,]<columnName>[,...] \n"
74 " [-threads=<number>] \n"
75 " [-majorOrder=row|column] \n\n"
76 "Options:\n"
77 " <mode> Specifies the normalization mode. Available modes are:\n"
78 " minimum, maximum, largest, signedlargest,\n"
79 " spread, rms, standarddeviation, sum, area, or average.\n"
80 " - minimum : Use the minimum value as the normalization factor.\n"
81 " - maximum : Use the maximum value as the normalization factor.\n"
82 " - largest : Use the larger of |min| or |max| (default).\n"
83 " - signedlargest: Use the largest value with its sign retained.\n"
84 " - spread : Use (max - min) as the normalization factor.\n"
85 " - rms : Use the root-mean-square of the values.\n"
86 " - standarddeviation: Use the n-1 weighted standard deviation.\n"
87 " - sum : Use the sum of all values.\n"
88 " - area : Use the area under the curve (requires functionOf).\n"
89 " - average : Use the average of all values.\n"
90 " <string> Specifies a suffix to append to the column name for the normalized output.\n"
91 " If omitted, the original column is replaced.\n"
92 " <wildcardString> Excludes columns matching the wildcard pattern from normalization.\n"
93 " <columnName> Specifies the column(s) to normalize. Multiple columns can be separated by commas.\n"
94 " <number> Specifies the number of threads to use for normalization.\n"
95 " row|column Specifies the major order for data processing.\n\n"
96 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
97
/* Normalization modes.  Each NORM_* value is the index of its keyword in
 * normMode[], which is how the mode=<mode> qualifier is decoded in main().
 */
#define NORM_MINIMUM 0
#define NORM_MAXIMUM 1
#define NORM_LARGEST 2
#define NORM_SLARGEST 3
#define NORM_SPREAD 4
#define NORM_RMS 5
#define NORM_STDEV 6
#define NORM_SUM 7
#define NORM_AREA 8
#define NORM_AVERAGE 9
#define NORM_OPTIONS 10

/* Mode keywords, indexed by the NORM_* constants above. */
static char *normMode[NORM_OPTIONS] = {
  [NORM_MINIMUM] = "minimum",
  [NORM_MAXIMUM] = "maximum",
  [NORM_LARGEST] = "largest",
  [NORM_SLARGEST] = "signedlargest",
  [NORM_SPREAD] = "spread",
  [NORM_RMS] = "rms",
  [NORM_STDEV] = "standarddeviation",
  [NORM_SUM] = "sum",
  [NORM_AREA] = "area",
  [NORM_AVERAGE] = "average",
};
122
/* One user request to normalize, i.e. one -columns option as typed.
 * The FL_* bits record which optional qualifiers were present.
 */
#define FL_SUFFIX_GIVEN 0x0001U
#define FL_MODE_GIVEN 0x0002U
#define FL_FUNCOF_GIVEN 0x0004U
typedef struct
{
  unsigned long flags;  /* FL_* bits for the qualifiers that were given */
  char *suffix;         /* text appended to form the output column name */
  char **source;        /* column names/patterns listed in the option */
  char *exclude;        /* pattern of columns to leave out, or NULL */
  char *functionOf;     /* independent-variable column for area mode */
  long sources;         /* number of entries in source[] */
  long mode;            /* one of the NORM_* constants */
} NORM_REQUEST;
133
/* Individual specification for one column, made from the user's request
 * after expanding wildcards and lists.
 */
typedef struct
{
  unsigned long flags;  /* FL_* bits copied from the originating request */
  char *source;         /* input column to read */
  char *target;         /* output column to write; same pointer as source when no suffix was given */
  char *functionOf;     /* independent-variable column for area mode, or NULL */
  long mode;            /* one of the NORM_* constants */
} NORM_SPEC;
143
144long resolveColumnNames(SDDS_DATASET *SDDSin, NORM_REQUEST *normRequest, long normRequests, NORM_SPEC **normSpecRet, long *normSpecsRet);
145
146int main(int argc, char **argv) {
147 int iArg;
148 NORM_REQUEST *normRequest;
149 NORM_SPEC *normSpec;
150 long normRequests, normSpecs, i, readCode;
151 int64_t j, rows;
152 char *input, *output, *modeString;
153 unsigned long pipeFlags, majorOrderFlag;
154 SCANNED_ARG *scanned;
155 SDDS_DATASET SDDSin, SDDSout;
156 double *data, *funcOfData, factor, min, max;
157 short columnMajorOrder = -1;
158 int threads = 1;
159
161 argc = scanargs(&scanned, argc, argv);
162 if (argc < 3)
163 bomb(NULL, USAGE);
164
165 output = input = NULL;
166 pipeFlags = 0;
167 normRequest = NULL;
168 normSpec = NULL;
169 normRequests = normSpecs = 0;
170
171 for (iArg = 1; iArg < argc; iArg++) {
172 if (scanned[iArg].arg_type == OPTION) {
173 /* process options here */
174 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
175 case CLO_MAJOR_ORDER:
176 majorOrderFlag = 0;
177 scanned[iArg].n_items--;
178 if (scanned[iArg].n_items > 0 && (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0, "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER, "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
179 SDDS_Bomb("invalid -majorOrder syntax/values");
180 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
181 columnMajorOrder = 1;
182 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
183 columnMajorOrder = 0;
184 break;
185 case CLO_COLUMNS:
186 if (!(normRequest = SDDS_Realloc(normRequest, sizeof(*normRequest) * (normRequests + 1))))
187 SDDS_Bomb("memory allocation failure");
188 normRequest[normRequests].exclude = normRequest[normRequests].suffix = NULL;
189 if (!scanItemList(&normRequest[normRequests].flags,
190 scanned[iArg].list, &scanned[iArg].n_items,
191 SCANITEMLIST_UNKNOWN_VALUE_OK | SCANITEMLIST_REMOVE_USED_ITEMS |
192 SCANITEMLIST_IGNORE_VALUELESS,
193 "mode", SDDS_STRING, &modeString, 1, FL_MODE_GIVEN,
194 "suffix", SDDS_STRING, &normRequest[normRequests].suffix, 1, FL_SUFFIX_GIVEN,
195 "functionof", SDDS_STRING, &normRequest[normRequests].functionOf, 1, FL_FUNCOF_GIVEN,
196 "exclude", SDDS_STRING, &normRequest[normRequests].exclude, 1, 0, NULL))
197 SDDS_Bomb("invalid -columns syntax");
198 if (normRequest[normRequests].flags & FL_MODE_GIVEN) {
199 if ((normRequest[normRequests].mode = match_string(modeString, normMode, NORM_OPTIONS, 0)) < 0)
200 SDDS_Bomb("invalid -columns syntax: unknown mode");
201 } else
202 normRequest[normRequests].mode = NORM_LARGEST;
203 if (scanned[iArg].n_items < 1)
204 SDDS_Bomb("invalid -columns syntax: no columns listed");
205 normRequest[normRequests].source = scanned[iArg].list + 1;
206 normRequest[normRequests].sources = scanned[iArg].n_items - 1;
207 normRequests++;
208 break;
209 case CLO_THREADS:
210 if (scanned[iArg].n_items != 2 ||
211 !sscanf(scanned[iArg].list[1], "%d", &threads) || threads < 1)
212 SDDS_Bomb("invalid -threads syntax");
213 break;
214 case CLO_PIPE:
215 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
216 SDDS_Bomb("invalid -pipe syntax");
217 break;
218 default:
219 fprintf(stderr, "error: unknown/ambiguous option: %s\n", scanned[iArg].list[0]);
220 exit(EXIT_FAILURE);
221 break;
222 }
223 } else {
224 if (!input)
225 input = scanned[iArg].list[0];
226 else if (!output)
227 output = scanned[iArg].list[0];
228 else
229 SDDS_Bomb("too many filenames seen");
230 }
231 }
232
233 processFilenames("sddsnormalize", &input, &output, pipeFlags, 0, NULL);
234
235 if (!normRequests)
236 SDDS_Bomb("supply the names of columns to normalize with the -columns option");
237
238 if (!SDDS_InitializeInput(&SDDSin, input))
239 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
240
241 if (!resolveColumnNames(&SDDSin, normRequest, normRequests, &normSpec, &normSpecs))
242 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
243
244 if (!normSpecs)
245 SDDS_Bomb("no columns selected for normalization");
246
247 if (!SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
248 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
249 if (columnMajorOrder != -1)
250 SDDSout.layout.data_mode.column_major = columnMajorOrder;
251 else
252 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
253 for (i = 0; i < normSpecs; i++) {
254 if (normSpec[i].flags & FL_SUFFIX_GIVEN) {
255 if (!SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, normSpec[i].source, normSpec[i].target) ||
256 !SDDS_ChangeColumnInformation(&SDDSout, "units", "", SDDS_BY_NAME, normSpec[i].target))
257 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
258 } else if (!SDDS_ChangeColumnInformation(&SDDSout, "units", "Normalized", SDDS_BY_NAME, normSpec[i].target))
259 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
260 }
261
262 if (!SDDS_WriteLayout(&SDDSout))
263 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
264
265 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
266 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
267 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
268 if ((rows = SDDS_RowCount(&SDDSin))) {
269 for (i = 0; i < normSpecs; i++) {
270 if (!(data = SDDS_GetColumnInDoubles(&SDDSin, normSpec[i].source)))
271 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
272 funcOfData = NULL;
273 if (normSpec[i].functionOf &&
274 !(funcOfData = SDDS_GetColumnInDoubles(&SDDSin, normSpec[i].functionOf)))
275 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
276 if (!find_min_max(&min, &max, data, rows))
277 min = max = 1;
278 switch (normSpec[i].mode) {
279 case NORM_RMS:
280 factor = rmsValueThreaded(data, rows, threads);
281 break;
282 case NORM_STDEV:
283 factor = standardDeviationThreaded(data, rows, threads);
284 break;
285 case NORM_MINIMUM:
286 factor = min;
287 break;
288 case NORM_MAXIMUM:
289 factor = max;
290 break;
291 case NORM_LARGEST:
292 min = fabs(min);
293 max = fabs(max);
294 factor = MAX(min, max);
295 break;
296 case NORM_SLARGEST:
297 if (fabs(min) > fabs(max))
298 factor = min;
299 else
300 factor = max;
301 break;
302 case NORM_SPREAD:
303 factor = max - min;
304 break;
305 case NORM_SUM:
306 for (j = factor = 0; j < rows; j++)
307 factor += data[j];
308 break;
309 case NORM_AREA:
310 if (!funcOfData)
311 SDDS_Bomb("functionOf qualifier must be given for area normalization");
312 trapazoidIntegration(funcOfData, data, rows, &factor);
313 break;
314 case NORM_AVERAGE:
315 for (j = factor = 0; j < rows; j++)
316 factor += data[j];
317 factor /= rows;
318 break;
319 default:
320 SDDS_Bomb("Invalid normalization mode---programming error");
321 break;
322 }
323 if (funcOfData)
324 free(funcOfData);
325 if (factor)
326 for (j = 0; j < rows; j++)
327 data[j] /= factor;
328 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, data, rows, normSpec[i].target))
329 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
330 free(data);
331 }
332 }
333 if (!SDDS_WritePage(&SDDSout))
334 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
335 }
336 if (!SDDS_Terminate(&SDDSin)) {
337 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
338 return EXIT_FAILURE;
339 }
340 if (!SDDS_Terminate(&SDDSout)) {
341 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
342 return EXIT_FAILURE;
343 }
344
345 return EXIT_SUCCESS;
346}
347
348long resolveColumnNames(SDDS_DATASET *SDDSin, NORM_REQUEST *normRequest, long normRequests, NORM_SPEC **normSpecRet, long *normSpecsRet) {
349 long i, j;
350 int32_t columns;
351 char **column, buffer[1024];
352 long normSpecs = 0;
353 NORM_SPEC *normSpec = NULL;
354
355 for (i = 0; i < normRequests; i++) {
356 SDDS_SetColumnFlags(SDDSin, 0);
357 if (normRequest[i].flags & FL_SUFFIX_GIVEN) {
358 if (!normRequest[i].suffix || !strlen(normRequest[i].suffix)) {
359 SDDS_SetError("resolveColumnNames: missing or blank suffix");
360 return 0;
361 }
362 }
363 for (j = 0; j < normRequest[i].sources; j++) {
364 if (!SDDS_SetColumnsOfInterest(SDDSin, SDDS_MATCH_STRING, normRequest[i].source[j], SDDS_OR)) {
365 SDDS_SetError("resolveColumnNames: SDDS_SetColumnsOfInterest error");
366 return 0;
367 }
368 }
369 if (normRequest[i].exclude &&
370 !SDDS_SetColumnsOfInterest(SDDSin, SDDS_MATCH_STRING, normRequest[i].exclude, SDDS_NEGATE_MATCH | SDDS_AND)) {
371 SDDS_SetError("resolveColumnNames: SDDS_SetColumnsOfInterest error");
372 return 0;
373 }
374 if (!(column = SDDS_GetColumnNames(SDDSin, &columns)) || columns == 0) {
375 sprintf(buffer, "No match for column list: ");
376 for (j = 0; j < normRequest[i].sources; j++) {
377 strcat(buffer, normRequest[i].source[j]);
378 if (j != normRequest[i].sources - 1)
379 strcat(buffer, ", ");
380 }
381 SDDS_SetError(buffer);
382 return 0;
383 }
384 if (!(normSpec = SDDS_Realloc(normSpec, sizeof(*normSpec) * (normSpecs + columns)))) {
385 SDDS_SetError("resolveColumnNames: Memory allocation failure");
386 return 0;
387 }
388 for (j = 0; j < columns; j++) {
389 normSpec[j + normSpecs].source = column[j];
390 normSpec[j + normSpecs].mode = normRequest[i].mode;
391 normSpec[j + normSpecs].flags = normRequest[i].flags;
392 normSpec[j + normSpecs].functionOf = NULL;
393 if (normRequest[i].flags & FL_FUNCOF_GIVEN) {
394 if (!SDDS_CopyString(&normSpec[j + normSpecs].functionOf, normRequest[i].functionOf)) {
395 SDDS_SetError("resolveColumnNames: Memory allocation failure");
396 return 0;
397 }
398 }
399 normSpec[j + normSpecs].target = NULL;
400 if (normRequest[i].flags & FL_SUFFIX_GIVEN) {
401 sprintf(buffer, "%s%s", normSpec[j + normSpecs].source, normRequest[i].suffix);
402 if (!SDDS_CopyString(&normSpec[j + normSpecs].target, buffer)) {
403 SDDS_SetError("resolveColumnNames: Memory allocation failure");
404 return 0;
405 }
406 } else
407 normSpec[j + normSpecs].target = normSpec[j + normSpecs].source;
408 }
409 normSpecs += columns;
410 }
411 *normSpecRet = normSpec;
412 *normSpecsRet = normSpecs;
413 return 1;
414}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_SetColumnsOfInterest(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
Sets the acceptance flags for columns based on specified naming criteria.
int32_t SDDS_SetColumnFlags(SDDS_DATASET *SDDS_dataset, int32_t column_flag_value)
Sets the acceptance flags for all columns in the current data table of a data set.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_ChangeColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Modifies a specific field in a column definition within the SDDS dataset.
Definition SDDS_info.c:364
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_SetError(char *error_text)
Records an error message in the SDDS error stack.
Definition SDDS_utils.c:379
char ** SDDS_GetColumnNames(SDDS_DATASET *SDDS_dataset, int32_t *number)
Retrieves the names of all columns in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
int32_t SDDS_CopyString(char **target, const char *source)
Copies a source string to a target string with memory allocation.
Definition SDDS_utils.c:856
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
double standardDeviationThreaded(double *x, long n, long numThreads)
Calculates the standard deviation of an array of doubles using multiple threads.
Definition moments.c:51
double rmsValueThreaded(double *y, long n, long numThreads)
Calculates the RMS (Root Mean Square) value of an array of doubles using multiple threads.
Definition moments.c:597
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
long trapazoidIntegration(double *x, double *y, long n, double *integral)
Computes the integral of a dataset using the trapezoidal rule.
Definition trapInteg.c:29