SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsnormalize.c
Go to the documentation of this file.
1/**
2 * @file sddsnormalize.c
3 * @brief A program for SDDS-format column normalization.
4 *
5 * @details
6 * This program normalizes the specified columns of an SDDS file based on user-defined options.
7 * It provides various statistical modes for normalization, supports multithreading, and
8 * offers flexibility in data handling with features like custom suffixes and column exclusions.
9 *
10 * @section Usage
11 * ```
12 * sddsnormalize [<inputfile>] [<outputfile>]
13 * [-pipe=[input][,output]]
14 * -columns=[mode=<mode>,][suffix=<string>,][exclude=<wildcardString>,][functionOf=<columnName>,]<columnName>[,...]
15 * [-threads=<number>]
16 * [-majorOrder=row|column]
17 * ```
18 *
19 * @section Options
20 * | Required | Description |
21 * |---------------------------------------|---------------------------------------------------------------------------------------|
22 * | `-columns` | Specifies the columns to normalize and their modes of normalization. |
23 *
24 * | Optional | Description |
25 * |---------------------------------------|---------------------------------------------------------------------------------------|
26 * | `-pipe` | Specifies whether the input/output is piped. |
27 * | `-threads` | Defines the number of threads for parallel normalization. |
28 * | `-majorOrder` | Sets the processing order as row-major or column-major. |
29 *
30 * | Column Modes | Description |
31 * |---------------------------------------|---------------------------------------------------------------------------------------|
32 * | `minimum` | Use the minimum value as the normalization factor. |
33 * | `maximum` | Use the maximum value as the normalization factor. |
34 * | `largest` | Use the larger of \|min\| or \|max\| (default). |
35 * | `signedlargest` | Use the largest value with its sign retained. |
36 * | `spread` | Use (max - min) as the normalization factor. |
37 * | `rms` | Use the root-mean-square of the values. |
38 * | `standarddeviation` | Use the n-1 weighted standard deviation. |
39 * | `sum` | Use the sum of all values. |
40 * | `area` | Use the area under the curve (requires functionOf). |
41 * | `average` | Use the average of all values. |
42 *
43 * @subsection spec_req Specific Requirements
44 * - For the `area` mode in `-columns`, the `functionOf` qualifier must be provided.
45 * - The `-columns` mode defaults to `largest` if not specified.
46 *
47 * @copyright
48 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
49 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
50 *
51 * @license
52 * This file is distributed under the terms of the Software License Agreement
53 * found in the file LICENSE included with this distribution.
54 *
55 * @author
56 * M. Borland, R. Soliday, H. Shang
57 */
58
59#include "mdb.h"
60#include "SDDS.h"
61#include "scan.h"
62#include <ctype.h>
63
/* Command-line options understood by sddsnormalize.  Each enumerator is the
 * index of the corresponding keyword in option[] below. */
enum option_type {
  CLO_COLUMNS = 0,
  CLO_PIPE,
  CLO_MAJOR_ORDER,
  CLO_THREADS,
  N_OPTIONS /* count of options; not an option itself */
};

/* Option keywords, indexed by enum option_type. */
char *option[N_OPTIONS] = {
  [CLO_COLUMNS] = "columns",
  [CLO_PIPE] = "pipe",
  [CLO_MAJOR_ORDER] = "majorOrder",
  [CLO_THREADS] = "threads",
};
79
80static char *USAGE =
81 "Usage: sddsnormalize [<inputfile>] [<outputfile>] \n"
82 " [-pipe=[input][,output]] \n"
83 " -columns=[mode=<mode>,][suffix=<string>,][exclude=<wildcardString>,][functionOf=<columnName>,]<columnName>[,...] \n"
84 " [-threads=<number>] \n"
85 " [-majorOrder=row|column] \n\n"
86 "Options:\n"
87 " <mode> Specifies the normalization mode. Available modes are:\n"
88 " minimum, maximum, largest, signedlargest,\n"
89 " spread, rms, standarddeviation, sum, area, or average.\n"
90 " - minimum : Use the minimum value as the normalization factor.\n"
91 " - maximum : Use the maximum value as the normalization factor.\n"
92 " - largest : Use the larger of |min| or |max| (default).\n"
93 " - signedlargest: Use the largest value with its sign retained.\n"
94 " - spread : Use (max - min) as the normalization factor.\n"
95 " - rms : Use the root-mean-square of the values.\n"
96 " - standarddeviation: Use the n-1 weighted standard deviation.\n"
97 " - sum : Use the sum of all values.\n"
98 " - area : Use the area under the curve (requires functionOf).\n"
99 " - average : Use the average of all values.\n"
100 " <string> Specifies a suffix to append to the column name for the normalized output.\n"
101 " If omitted, the original column is replaced.\n"
102 " <wildcardString> Excludes columns matching the wildcard pattern from normalization.\n"
103 " <columnName> Specifies the column(s) to normalize. Multiple columns can be separated by commas.\n"
104 " <number> Specifies the number of threads to use for normalization.\n"
105 " row|column Specifies the major order for data processing.\n\n"
106 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
107
/* Normalization modes.  Each value is the index of the matching keyword in
 * normMode[], which is what match_string() is given to decode mode=<mode>. */
enum {
  NORM_MINIMUM = 0,
  NORM_MAXIMUM = 1,
  NORM_LARGEST = 2,
  NORM_SLARGEST = 3,
  NORM_SPREAD = 4,
  NORM_RMS = 5,
  NORM_STDEV = 6,
  NORM_SUM = 7,
  NORM_AREA = 8,
  NORM_AVERAGE = 9,
  NORM_OPTIONS = 10 /* number of modes */
};

/* Mode keywords, indexed by the NORM_* constants. */
static char *normMode[NORM_OPTIONS] = {
  [NORM_MINIMUM] = "minimum",
  [NORM_MAXIMUM] = "maximum",
  [NORM_LARGEST] = "largest",
  [NORM_SLARGEST] = "signedlargest",
  [NORM_SPREAD] = "spread",
  [NORM_RMS] = "rms",
  [NORM_STDEV] = "standarddeviation",
  [NORM_SUM] = "sum",
  [NORM_AREA] = "area",
  [NORM_AVERAGE] = "average",
};
132
/* Flag bits recording which qualifiers were present in a -columns option. */
#define FL_SUFFIX_GIVEN 0x0001U
#define FL_MODE_GIVEN 0x0002U
#define FL_FUNCOF_GIVEN 0x0004U

/* One user request to normalize, i.e. one -columns option as typed on the
 * command line, before wildcards and lists are expanded. */
typedef struct
{
  unsigned long flags; /* combination of FL_*_GIVEN bits */
  char *suffix;        /* suffix=<string> value, or NULL */
  char **source;       /* column names / wildcard patterns listed in the option */
  char *exclude;       /* exclude=<wildcardString> value, or NULL */
  char *functionOf;    /* functionOf=<columnName> value; meaningful when FL_FUNCOF_GIVEN is set */
  long sources;        /* number of entries in source[] */
  long mode;           /* one of the NORM_* mode codes */
} NORM_REQUEST;

/* Specification for normalizing a single column, built from a user's request
 * once its wildcards and lists have been expanded. */
typedef struct
{
  unsigned long flags; /* flag bits copied from the originating request */
  char *source;        /* column that supplies the data */
  char *target;        /* column that receives the normalized data */
  char *functionOf;    /* independent-variable column, or NULL */
  long mode;           /* normalization mode code */
} NORM_SPEC;
153
154long resolveColumnNames(SDDS_DATASET *SDDSin, NORM_REQUEST *normRequest, long normRequests, NORM_SPEC **normSpecRet, long *normSpecsRet);
155
156int main(int argc, char **argv) {
157 int iArg;
158 NORM_REQUEST *normRequest;
159 NORM_SPEC *normSpec;
160 long normRequests, normSpecs, i, readCode;
161 int64_t j, rows;
162 char *input, *output, *modeString;
163 unsigned long pipeFlags, majorOrderFlag;
164 SCANNED_ARG *scanned;
165 SDDS_DATASET SDDSin, SDDSout;
166 double *data, *funcOfData, factor, min, max;
167 short columnMajorOrder = -1;
168 int threads = 1;
169
171 argc = scanargs(&scanned, argc, argv);
172 if (argc < 3)
173 bomb(NULL, USAGE);
174
175 output = input = NULL;
176 pipeFlags = 0;
177 normRequest = NULL;
178 normSpec = NULL;
179 normRequests = normSpecs = 0;
180
181 for (iArg = 1; iArg < argc; iArg++) {
182 if (scanned[iArg].arg_type == OPTION) {
183 /* process options here */
184 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
185 case CLO_MAJOR_ORDER:
186 majorOrderFlag = 0;
187 scanned[iArg].n_items--;
188 if (scanned[iArg].n_items > 0 && (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0, "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER, "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
189 SDDS_Bomb("invalid -majorOrder syntax/values");
190 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
191 columnMajorOrder = 1;
192 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
193 columnMajorOrder = 0;
194 break;
195 case CLO_COLUMNS:
196 if (!(normRequest = SDDS_Realloc(normRequest, sizeof(*normRequest) * (normRequests + 1))))
197 SDDS_Bomb("memory allocation failure");
198 normRequest[normRequests].exclude = normRequest[normRequests].suffix = NULL;
199 if (!scanItemList(&normRequest[normRequests].flags,
200 scanned[iArg].list, &scanned[iArg].n_items,
201 SCANITEMLIST_UNKNOWN_VALUE_OK | SCANITEMLIST_REMOVE_USED_ITEMS |
202 SCANITEMLIST_IGNORE_VALUELESS,
203 "mode", SDDS_STRING, &modeString, 1, FL_MODE_GIVEN,
204 "suffix", SDDS_STRING, &normRequest[normRequests].suffix, 1, FL_SUFFIX_GIVEN,
205 "functionof", SDDS_STRING, &normRequest[normRequests].functionOf, 1, FL_FUNCOF_GIVEN,
206 "exclude", SDDS_STRING, &normRequest[normRequests].exclude, 1, 0, NULL))
207 SDDS_Bomb("invalid -columns syntax");
208 if (normRequest[normRequests].flags & FL_MODE_GIVEN) {
209 if ((normRequest[normRequests].mode = match_string(modeString, normMode, NORM_OPTIONS, 0)) < 0)
210 SDDS_Bomb("invalid -columns syntax: unknown mode");
211 } else
212 normRequest[normRequests].mode = NORM_LARGEST;
213 if (scanned[iArg].n_items < 1)
214 SDDS_Bomb("invalid -columns syntax: no columns listed");
215 normRequest[normRequests].source = scanned[iArg].list + 1;
216 normRequest[normRequests].sources = scanned[iArg].n_items - 1;
217 normRequests++;
218 break;
219 case CLO_THREADS:
220 if (scanned[iArg].n_items != 2 ||
221 !sscanf(scanned[iArg].list[1], "%d", &threads) || threads < 1)
222 SDDS_Bomb("invalid -threads syntax");
223 break;
224 case CLO_PIPE:
225 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
226 SDDS_Bomb("invalid -pipe syntax");
227 break;
228 default:
229 fprintf(stderr, "error: unknown/ambiguous option: %s\n", scanned[iArg].list[0]);
230 exit(EXIT_FAILURE);
231 break;
232 }
233 } else {
234 if (!input)
235 input = scanned[iArg].list[0];
236 else if (!output)
237 output = scanned[iArg].list[0];
238 else
239 SDDS_Bomb("too many filenames seen");
240 }
241 }
242
243 processFilenames("sddsnormalize", &input, &output, pipeFlags, 0, NULL);
244
245 if (!normRequests)
246 SDDS_Bomb("supply the names of columns to normalize with the -columns option");
247
248 if (!SDDS_InitializeInput(&SDDSin, input))
249 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
250
251 if (!resolveColumnNames(&SDDSin, normRequest, normRequests, &normSpec, &normSpecs))
252 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
253
254 if (!normSpecs)
255 SDDS_Bomb("no columns selected for normalization");
256
257 if (!SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
258 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
259 if (columnMajorOrder != -1)
260 SDDSout.layout.data_mode.column_major = columnMajorOrder;
261 else
262 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
263 for (i = 0; i < normSpecs; i++) {
264 if (normSpec[i].flags & FL_SUFFIX_GIVEN) {
265 if (!SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, normSpec[i].source, normSpec[i].target) ||
266 !SDDS_ChangeColumnInformation(&SDDSout, "units", "", SDDS_BY_NAME, normSpec[i].target))
267 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
268 } else if (!SDDS_ChangeColumnInformation(&SDDSout, "units", "Normalized", SDDS_BY_NAME, normSpec[i].target))
269 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
270 }
271
272 if (!SDDS_WriteLayout(&SDDSout))
273 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
274
275 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
276 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
277 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
278 if ((rows = SDDS_RowCount(&SDDSin))) {
279 for (i = 0; i < normSpecs; i++) {
280 if (!(data = SDDS_GetColumnInDoubles(&SDDSin, normSpec[i].source)))
281 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
282 funcOfData = NULL;
283 if (normSpec[i].functionOf &&
284 !(funcOfData = SDDS_GetColumnInDoubles(&SDDSin, normSpec[i].functionOf)))
285 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
286 if (!find_min_max(&min, &max, data, rows))
287 min = max = 1;
288 switch (normSpec[i].mode) {
289 case NORM_RMS:
290 factor = rmsValueThreaded(data, rows, threads);
291 break;
292 case NORM_STDEV:
293 factor = standardDeviationThreaded(data, rows, threads);
294 break;
295 case NORM_MINIMUM:
296 factor = min;
297 break;
298 case NORM_MAXIMUM:
299 factor = max;
300 break;
301 case NORM_LARGEST:
302 min = fabs(min);
303 max = fabs(max);
304 factor = MAX(min, max);
305 break;
306 case NORM_SLARGEST:
307 if (fabs(min) > fabs(max))
308 factor = min;
309 else
310 factor = max;
311 break;
312 case NORM_SPREAD:
313 factor = max - min;
314 break;
315 case NORM_SUM:
316 for (j = factor = 0; j < rows; j++)
317 factor += data[j];
318 break;
319 case NORM_AREA:
320 if (!funcOfData)
321 SDDS_Bomb("functionOf qualifier must be given for area normalization");
322 trapazoidIntegration(funcOfData, data, rows, &factor);
323 break;
324 case NORM_AVERAGE:
325 for (j = factor = 0; j < rows; j++)
326 factor += data[j];
327 factor /= rows;
328 break;
329 default:
330 SDDS_Bomb("Invalid normalization mode---programming error");
331 break;
332 }
333 if (funcOfData)
334 free(funcOfData);
335 if (factor)
336 for (j = 0; j < rows; j++)
337 data[j] /= factor;
338 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, data, rows, normSpec[i].target))
339 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
340 free(data);
341 }
342 }
343 if (!SDDS_WritePage(&SDDSout))
344 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
345 }
346 if (!SDDS_Terminate(&SDDSin)) {
347 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
348 return EXIT_FAILURE;
349 }
350 if (!SDDS_Terminate(&SDDSout)) {
351 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
352 return EXIT_FAILURE;
353 }
354
355 return EXIT_SUCCESS;
356}
357
358long resolveColumnNames(SDDS_DATASET *SDDSin, NORM_REQUEST *normRequest, long normRequests, NORM_SPEC **normSpecRet, long *normSpecsRet) {
359 long i, j;
360 int32_t columns;
361 char **column, buffer[1024];
362 long normSpecs = 0;
363 NORM_SPEC *normSpec = NULL;
364
365 for (i = 0; i < normRequests; i++) {
366 SDDS_SetColumnFlags(SDDSin, 0);
367 if (normRequest[i].flags & FL_SUFFIX_GIVEN) {
368 if (!normRequest[i].suffix || !strlen(normRequest[i].suffix)) {
369 SDDS_SetError("resolveColumnNames: missing or blank suffix");
370 return 0;
371 }
372 }
373 for (j = 0; j < normRequest[i].sources; j++) {
374 if (!SDDS_SetColumnsOfInterest(SDDSin, SDDS_MATCH_STRING, normRequest[i].source[j], SDDS_OR)) {
375 SDDS_SetError("resolveColumnNames: SDDS_SetColumnsOfInterest error");
376 return 0;
377 }
378 }
379 if (normRequest[i].exclude &&
380 !SDDS_SetColumnsOfInterest(SDDSin, SDDS_MATCH_STRING, normRequest[i].exclude, SDDS_NEGATE_MATCH | SDDS_AND)) {
381 SDDS_SetError("resolveColumnNames: SDDS_SetColumnsOfInterest error");
382 return 0;
383 }
384 if (!(column = SDDS_GetColumnNames(SDDSin, &columns)) || columns == 0) {
385 sprintf(buffer, "No match for column list: ");
386 for (j = 0; j < normRequest[i].sources; j++) {
387 strcat(buffer, normRequest[i].source[j]);
388 if (j != normRequest[i].sources - 1)
389 strcat(buffer, ", ");
390 }
391 SDDS_SetError(buffer);
392 return 0;
393 }
394 if (!(normSpec = SDDS_Realloc(normSpec, sizeof(*normSpec) * (normSpecs + columns)))) {
395 SDDS_SetError("resolveColumnNames: Memory allocation failure");
396 return 0;
397 }
398 for (j = 0; j < columns; j++) {
399 normSpec[j + normSpecs].source = column[j];
400 normSpec[j + normSpecs].mode = normRequest[i].mode;
401 normSpec[j + normSpecs].flags = normRequest[i].flags;
402 normSpec[j + normSpecs].functionOf = NULL;
403 if (normRequest[i].flags & FL_FUNCOF_GIVEN) {
404 if (!SDDS_CopyString(&normSpec[j + normSpecs].functionOf, normRequest[i].functionOf)) {
405 SDDS_SetError("resolveColumnNames: Memory allocation failure");
406 return 0;
407 }
408 }
409 normSpec[j + normSpecs].target = NULL;
410 if (normRequest[i].flags & FL_SUFFIX_GIVEN) {
411 sprintf(buffer, "%s%s", normSpec[j + normSpecs].source, normRequest[i].suffix);
412 if (!SDDS_CopyString(&normSpec[j + normSpecs].target, buffer)) {
413 SDDS_SetError("resolveColumnNames: Memory allocation failure");
414 return 0;
415 }
416 } else
417 normSpec[j + normSpecs].target = normSpec[j + normSpecs].source;
418 }
419 normSpecs += columns;
420 }
421 *normSpecRet = normSpec;
422 *normSpecsRet = normSpecs;
423 return 1;
424}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_SetColumnsOfInterest(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
Sets the acceptance flags for columns based on specified naming criteria.
int32_t SDDS_SetColumnFlags(SDDS_DATASET *SDDS_dataset, int32_t column_flag_value)
Sets the acceptance flags for all columns in the current data table of a data set.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_ChangeColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Modifies a specific field in a column definition within the SDDS dataset.
Definition SDDS_info.c:364
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_SetError(char *error_text)
Records an error message in the SDDS error stack.
Definition SDDS_utils.c:379
char ** SDDS_GetColumnNames(SDDS_DATASET *SDDS_dataset, int32_t *number)
Retrieves the names of all columns in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
int32_t SDDS_CopyString(char **target, const char *source)
Copies a source string to a target string with memory allocation.
Definition SDDS_utils.c:856
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
double standardDeviationThreaded(double *x, long n, long numThreads)
Calculates the standard deviation of an array of doubles using multiple threads.
Definition moments.c:51
double rmsValueThreaded(double *y, long n, long numThreads)
Calculates the RMS (Root Mean Square) value of an array of doubles using multiple threads.
Definition moments.c:597
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
long trapazoidIntegration(double *x, double *y, long n, double *integral)
Computes the integral of a dataset using the trapezoidal rule.
Definition trapInteg.c:29