SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddssnap2grid.c
Go to the documentation of this file.
1/**
2 * @file sddssnap2grid.c
3 * @brief Snap data columns to a regular grid in SDDS files.
4 *
5 * @details
6 * This program processes SDDS files to adjust specified data columns
7 * so that their values align to a regular grid. The program supports
8 * options for controlling the bin size, adjustment factor, and initial grid
9 * spacing guess. It also optionally generates parameters describing the grid
10 * such as minimum, maximum, interval, and dimension.
11 *
12 * @section Usage
13 * ```
14 * sddssnap2grid <inputfile> <outputfile>
15 * [-pipe=[input][,output]]
16 * -column=<name>,[{maximumBins=<value>|binFactor=<value>|deltaGuess=<value>}][,adjustFactor=<value>]
17 * [-makeParameters]
18 * [-verbose]
19 * ```
20 *
21 * @section Options
22 * | Required | Description |
23 * |---------------------------------------|---------------------------------------------------------------------------------------|
24 * | `-column` | Specify the column name and bin settings (e.g., binFactor, maximumBins, deltaGuess). |
25 *
26 * | Optional | Description |
27 * |---------------------------------------|---------------------------------------------------------------------------------------|
28 * | `-pipe` | Standard SDDS Toolkit pipe option for input/output. |
29 * | `-makeParameters` | Store grid information as parameters in the output file. |
30 * | `-verbose` | Display detailed information during processing. |
31 *
32 * @subsection Incompatibilities
33 * - `-column`:
34 * - Only one of `maximumBins` or `binFactor` or `deltaGuess` may be specified.
35 *
36 * @subsection Requirements
37 * - For `-column`:
38 * - `adjustFactor` must be in the range (0, 1).
39 * - `deltaGuess` must be positive.
40 * - If `binFactor` is provided, it must be greater than or equal to 1.
41 *
42 * @copyright
43 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
44 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
45 *
46 * @license
47 * This file is distributed under the terms of the Software License Agreement
48 * found in the file LICENSE included with this distribution.
49 *
50 * @authors
51 * M. Borland, R. Soliday
52 */
53
54#include "mdb.h"
55#include "SDDS.h"
56#include "scan.h"
57#include "SDDSutils.h"
58#include <ctype.h>
59
60/* Enumeration for option types */
61enum option_type {
62 CLO_PIPE,
63 CLO_COLUMN,
64 CLO_VERBOSE,
65 CLO_MAKE_PARAMETERS,
66 N_OPTIONS
67};
68
69typedef struct {
70 unsigned long flags;
71 char *name;
72 int32_t maximumBins, binFactor;
73 double deltaGuess, adjustFactor;
75
76char *option[N_OPTIONS] = {
77 "pipe",
78 "column",
79 "verbose",
80 "makeparameters"
81};
82
83#define COLUMN_MAXIMUM_BINS 0x01UL
84#define COLUMN_BIN_FACTOR 0x02UL
85#define COLUMN_DELTA_GUESS 0x04UL
86#define COLUMN_ADJUST_FACTOR 0x08UL
87
88static char *USAGE =
89 "sddssnap2grid [<inputfile>] [<outputfile>] [-pipe=[input][,output]]\n"
90 " -column=<name>,[{maximumBins=<value>|binFactor=<value>|deltaGuess=<value>}][,adjustFactor=<value>]\n"
91 " [-column=...]\n"
92 " [-makeParameters] [-verbose]\n"
93 "\n"
94 "Options:\n"
95 " -pipe Standard SDDS Toolkit pipe option.\n"
96 " -column Specify the name of a column to modify for equispaced values.\n"
97 " The default mode uses binFactor = 10, meaning the maximum number\n"
98 " of bins is 10 times the number of data points. The algorithm works as follows:\n"
99 " 1. Bin the data with the maximum number of bins.\n"
100 " 2. If no two adjacent bins are populated, use this grouping to compute\n"
101 " centroids for each subset, providing delta values.\n"
102 " 3. If two adjacent bins are populated, multiply the number of bins by\n"
103 " adjustFactor (default: 0.9) and repeat the process.\n"
104 " Alternatively, you can provide a guess for the grid spacing;\n"
105 " the algorithm will use 1/10 of this as the initial bin size.\n"
106 " -makeParameters\n"
107 " Store grid parameters in the output file as parameters named\n"
108 " <name>Minimum, <name>Maximum, <name>Interval, and <name>Dimension.\n"
109 " -verbose Report the computed deltas and the number of grid points.\n"
110 "\n"
111 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
112
113void SnapDataToGrid(double *data, int64_t rows, COLUMN_TO_SNAP *column, int verbose, double *min, double *max, int64_t *points);
114void AddParameterDefinitions(SDDS_DATASET *SDDSout, SDDS_DATASET *SDDSin, COLUMN_TO_SNAP *column, long nColumns);
115void StoreGridParameters(SDDS_DATASET *SDDSout, char *column, double min, double max, int64_t points);
116
117int main(int argc, char **argv) {
118 int iArg, iColumn, verbose;
119 char *input, *output;
120 unsigned long pipeFlags;
121 SCANNED_ARG *scanned;
122 SDDS_DATASET SDDSin, SDDSout;
123 COLUMN_TO_SNAP *column;
124 long nColumns;
125 double *data;
126 int64_t rows;
127 short makeParameters;
128 double min, max;
129 int64_t points;
130
132 argc = scanargs(&scanned, argc, argv);
133 if (argc < 3)
134 bomb(NULL, USAGE);
135
136 nColumns = 0;
137 column = NULL;
138 input = output = NULL;
139 pipeFlags = 0;
140 verbose = 0;
141 makeParameters = 0;
142
143 for (iArg = 1; iArg < argc; iArg++) {
144 if (scanned[iArg].arg_type == OPTION) {
145 /* process options here */
146 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
147 case CLO_COLUMN:
148 column = SDDS_Realloc(column, sizeof(*column) * (nColumns + 1));
149 column[nColumns].name = scanned[iArg].list[1];
150 column[nColumns].maximumBins = -1;
151 column[nColumns].binFactor = 10;
152 column[nColumns].deltaGuess = -1;
153 column[nColumns].flags = COLUMN_BIN_FACTOR;
154 column[nColumns].adjustFactor = 0.9;
155 scanned[iArg].n_items -= 2;
156 if (scanned[iArg].n_items) {
157 if (!scanItemList(&(column[nColumns].flags), scanned[iArg].list + 2, &scanned[iArg].n_items, 0,
158 "maximumbins", SDDS_LONG, &(column[nColumns].maximumBins), 1, COLUMN_MAXIMUM_BINS,
159 "binfactor", SDDS_LONG, &(column[nColumns].binFactor), 1, COLUMN_BIN_FACTOR,
160 "deltaguess", SDDS_DOUBLE, &(column[nColumns].deltaGuess), 1, COLUMN_DELTA_GUESS,
161 "adjustfactor", SDDS_DOUBLE, &(column[nColumns].adjustFactor), 1, COLUMN_ADJUST_FACTOR,
162 NULL)) {
163 SDDS_Bomb("invalid -column syntax");
164 }
165 if (column[nColumns].flags & COLUMN_ADJUST_FACTOR) {
166 if (column[nColumns].adjustFactor <= 0 || column[nColumns].adjustFactor >= 1)
167 SDDS_Bomb("invalid -column syntax. adjustFactor must be (0,1)");
168 }
169 if (column[nColumns].flags & COLUMN_DELTA_GUESS) {
170 if (column[nColumns].flags & ~(COLUMN_DELTA_GUESS | COLUMN_ADJUST_FACTOR))
171 SDDS_Bomb("invalid -column syntax. Can't combine deltaGuess with other options.");
172 if (column[nColumns].deltaGuess <= 0)
173 SDDS_Bomb("invalid -column syntax. deltaGuess<=0.");
174 } else {
175 if (column[nColumns].flags & COLUMN_BIN_FACTOR && column[nColumns].flags & COLUMN_MAXIMUM_BINS)
176 SDDS_Bomb("invalid -column syntax. Can't give minimumBins and maximumBins with binFactor");
177 if (!(column[nColumns].flags & COLUMN_BIN_FACTOR) && !(column[nColumns].flags & COLUMN_MAXIMUM_BINS))
178 SDDS_Bomb("invalid -column syntax. Give maximumBins or binFactor");
179 if (column[nColumns].flags & COLUMN_BIN_FACTOR && column[nColumns].binFactor < 1)
180 SDDS_Bomb("invalid -column syntax. binFactor<1");
181 }
182 }
183 nColumns++;
184 break;
185 case CLO_PIPE:
186 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
187 SDDS_Bomb("invalid -pipe syntax");
188 break;
189 case CLO_VERBOSE:
190 verbose = 1;
191 break;
192 case CLO_MAKE_PARAMETERS:
193 makeParameters = 1;
194 break;
195 default:
196 fprintf(stderr, "error: unknown/ambiguous option: %s\n", scanned[iArg].list[0]);
197 exit(EXIT_FAILURE);
198 break;
199 }
200 } else {
201 if (!input)
202 input = scanned[iArg].list[0];
203 else if (!output)
204 output = scanned[iArg].list[0];
205 else
206 SDDS_Bomb("too many filenames seen");
207 }
208 }
209
210 processFilenames("sddssnap2grid", &input, &output, pipeFlags, 0, NULL);
211
212 if (!SDDS_InitializeInput(&SDDSin, input))
213 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
214
215 if (!SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
216 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
217 if (makeParameters)
218 AddParameterDefinitions(&SDDSout, &SDDSin, column, nColumns);
219 if (!SDDS_WriteLayout(&SDDSout))
220 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
221
222 while (SDDS_ReadPage(&SDDSin) > 0) {
223 rows = SDDS_CountRowsOfInterest(&SDDSin);
224 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
225 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
226 for (iColumn = 0; iColumn < nColumns; iColumn++) {
227 if (!(data = SDDS_GetColumnInDoubles(&SDDSout, column[iColumn].name)))
228 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
229 SnapDataToGrid(data, rows, column + iColumn, verbose, &min, &max, &points);
230 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, data, rows, column[iColumn].name))
231 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
232 if (makeParameters)
233 StoreGridParameters(&SDDSout, column[iColumn].name, min, max, points);
234 free(data);
235 }
236 if (!SDDS_WritePage(&SDDSout))
237 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
238 }
239
240 if (!SDDS_Terminate(&SDDSin)) {
241 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
242 exit(EXIT_FAILURE);
243 }
244 if (!SDDS_Terminate(&SDDSout)) {
245 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
246 exit(EXIT_FAILURE);
247 }
248
249 return EXIT_SUCCESS;
250}
251
252void SnapDataToGrid(double *data, int64_t rows, COLUMN_TO_SNAP *column, int verbose,
253 double *minReturn, double *maxReturn, int64_t *pointsReturn) {
254 int64_t i, bins, centroids;
255 double min, max, hmin, hmax, span, middle;
256 char s[16384];
257 double *histogram, *whistogram, *centroid, delta;
258
259 find_min_max(&min, &max, data, rows);
260 if ((span = max - min) <= 0)
261 return;
262
263 /* Add some buffer space at the ends for histogramming */
264 span *= 1 + 2.0 / rows;
265 middle = (max + min) / 2;
266 hmin = middle - span / 2;
267 hmax = middle + span / 2;
268
269 bins = 0; /* prevent compiler warning */
270 if (column->flags & COLUMN_DELTA_GUESS)
271 bins = (hmax - hmin) / (column->deltaGuess / 10);
272 else if (column->flags & COLUMN_MAXIMUM_BINS)
273 bins = column->maximumBins;
274 else if (column->flags & COLUMN_BIN_FACTOR)
275 bins = rows * column->binFactor;
276 else
277 SDDS_Bomb("logic error. Missing flags for determination of maximum number of bins.");
278
279 if (verbose) {
280 printf("Working on %s with %" PRId64 " bins, span=%le, hmin=%le, hmax=%le\n",
281 column->name, bins, span, hmin, hmax);
282 fflush(stdout);
283 }
284
285 while (bins >= 2) {
286 histogram = calloc(bins, sizeof(*histogram));
287 whistogram = calloc(bins, sizeof(*whistogram));
288 if (verbose) {
289 printf("Histogramming %s with %" PRId64 " bins\n", column->name, bins);
290 fflush(stdout);
291 }
292 make_histogram(histogram, bins, hmin, hmax, data, rows, 1);
293 for (i = 1; i < bins; i++) {
294 if (histogram[i] && histogram[i - 1])
295 break;
296 }
297 if (i != bins) {
298 /* indicates that two adjacent bins are occupied, so we need more coarse grouping */
299 free(histogram);
300 bins *= column->adjustFactor;
301 continue;
302 }
303 if (bins <= 1) {
304 snprintf(s, 16384, "Unable to snap data for %s to grid\n", column->name);
305 SDDS_Bomb(s);
306 }
307
308 /* self-weighted histogram gives centroid*number in each bin */
309 make_histogram_weighted(whistogram, bins, hmin, hmax, data, rows, 1, data);
310 centroid = tmalloc(sizeof(*centroid) * bins);
311 centroids = 0;
312 for (i = 0; i < bins; i++) {
313 if (histogram[i]) {
314 /*
315 printf("centroid[%ld] = %le (%le/%le)\n", centroids, whistogram[i]/histogram[i],
316 whistogram[i], histogram[i]);
317 */
318 centroid[centroids++] = whistogram[i] / histogram[i];
319 }
320 }
321 free(histogram);
322 free(whistogram);
323 delta = (centroid[centroids - 1] - centroid[0]) / (centroids - 1);
324 /* assign new value using the computed delta and the first centroid as the first value */
325 for (i = 0; i < rows; i++)
326 data[i] = ((long)((data[i] - centroid[0]) / delta + 0.5)) * delta + centroid[0];
327 if (verbose) {
328 printf("Completed work for %s: delta = %le, start = %le, locations = %" PRId64 "\n",
329 column->name, delta, centroid[0], centroids);
330 fflush(stdout);
331 }
332 *minReturn = centroid[0];
333 *pointsReturn = centroids;
334 *maxReturn = centroid[0] + (centroids - 1) * delta;
335 free(centroid);
336 return;
337 }
338}
339
340#define BUFLEN 16834
341void AddParameterDefinitions(SDDS_DATASET *SDDSout, SDDS_DATASET *SDDSin, COLUMN_TO_SNAP *column, long nColumns) {
342 long icol;
343 char name1[BUFLEN], name2[BUFLEN], name3[BUFLEN], name4[BUFLEN];
344
345 for (icol = 0; icol < nColumns; icol++) {
346 snprintf(name1, BUFLEN, "%sMinimum", column[icol].name);
347 snprintf(name2, BUFLEN, "%sMaximum", column[icol].name);
348 snprintf(name3, BUFLEN, "%sInterval", column[icol].name);
349 snprintf(name4, BUFLEN, "%sDimension", column[icol].name);
350 if (!SDDS_DefineParameterLikeColumn(SDDSout, SDDSin, column[icol].name, name1) ||
351 !SDDS_DefineParameterLikeColumn(SDDSout, SDDSin, column[icol].name, name2) ||
352 !SDDS_DefineParameterLikeColumn(SDDSout, SDDSin, column[icol].name, name3) ||
353 !SDDS_DefineSimpleParameter(SDDSout, name4, NULL, SDDS_LONG64))
354 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
355 }
356}
357
358void StoreGridParameters(SDDS_DATASET *SDDSout, char *column, double min, double max, int64_t points) {
359 char name1[BUFLEN], name2[BUFLEN], name3[BUFLEN], name4[BUFLEN];
360
361 snprintf(name1, BUFLEN, "%sMinimum", column);
362 snprintf(name2, BUFLEN, "%sMaximum", column);
363 snprintf(name3, BUFLEN, "%sInterval", column);
364 snprintf(name4, BUFLEN, "%sDimension", column);
365 if (!SDDS_SetParameters(SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE,
366 name1, min,
367 name2, max,
368 name3, points > 1 ? (max - min) / (points - 1) : -1,
369 name4, points,
370 NULL))
371 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
372}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_DefineSimpleParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *unit, int32_t type)
Defines a simple data parameter within the SDDS dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameterLikeColumn(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Defines a parameter in the target dataset based on a column definition from the source dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
#define SDDS_LONG64
Identifier for the signed 64-bit integer data type.
Definition SDDStypes.h:49
Utility functions for SDDS dataset manipulation and string array operations.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long make_histogram_weighted(double *hist, long n_bins, double lo, double hi, double *data, long n_pts, long new_start, double *weight)
Compiles a weighted histogram from data points.
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.