SDDSlib
Loading...
Searching...
No Matches
sddssnap2grid.c
Go to the documentation of this file.
1/**
2 * @file sddssnap2grid.c
3 * @brief Snap data columns to a regular grid in SDDS files.
4 *
5 * This program processes SDDS files to adjust specified data columns
6 * so that their values align to a regular grid. The program supports
7 * options for controlling the bin size, adjustment factor, and initial grid
8 * spacing guess. It also optionally generates parameters describing the grid
9 * such as minimum, maximum, interval, and dimension.
10 *
11 * ## Key Features
12 * - Aligns data columns to a grid using adaptive binning.
13 * - Supports custom maximum bins, bin factors, and delta guess values.
14 * - Reports grid parameters (minimum, maximum, interval, and points).
15 * - Verbose mode for detailed processing output.
16 *
17 * ## Options
18 * - **-pipe**: Standard SDDS toolkit pipe option for input/output.
19 * - **-column**: Specify a column to process with options for bin settings.
20 * - **-makeParameters**: Store grid information as parameters in the output file.
21 * - **-verbose**: Display detailed information during processing.
22 *
23 * ## Example Usage
24 * ```bash
25 * sddssnap2grid input.sdds output.sdds -column=Position,binFactor=20
26 * sddssnap2grid -pipe=input,output -column=Time,deltaGuess=0.01
27 * ```
28 *
29 * @copyright
30 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
31 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
32 *
33 * @license
34 * This file is distributed under the terms of the Software License Agreement
35 * found in the file LICENSE included with this distribution.
36 *
37 * @author M. Borland, R. Soliday
38 */
39
40#include "mdb.h"
41#include "SDDS.h"
42#include "scan.h"
43#include "SDDSutils.h"
44#include <ctype.h>
45
46/* Enumeration for option types */
47enum option_type {
48 CLO_PIPE,
49 CLO_COLUMN,
50 CLO_VERBOSE,
51 CLO_MAKE_PARAMETERS,
52 N_OPTIONS
53};
54
55typedef struct {
56 unsigned long flags;
57 char *name;
58 int32_t maximumBins, binFactor;
59 double deltaGuess, adjustFactor;
61
62char *option[N_OPTIONS] = {
63 "pipe",
64 "column",
65 "verbose",
66 "makeparameters"
67};
68
69#define COLUMN_MAXIMUM_BINS 0x01UL
70#define COLUMN_BIN_FACTOR 0x02UL
71#define COLUMN_DELTA_GUESS 0x04UL
72#define COLUMN_ADJUST_FACTOR 0x08UL
73
74static char *USAGE =
75 "sddssnap2grid [<inputfile>] [<outputfile>] [-pipe=[input][,output]]\n"
76 " -column=<name>,[{maximumBins=<value> | binFactor=<value>} | deltaGuess=<value>][,adjustFactor=<value>]\n"
77 " [-column=...]\n"
78 " [-makeParameters] [-verbose]\n"
79 "\n"
80 "Options:\n"
81 " -pipe Standard SDDS Toolkit pipe option.\n"
82 " -column Specify the name of a column to modify for equispaced values.\n"
83 " The default mode uses binFactor = 10, meaning the maximum number\n"
84 " of bins is 10 times the number of data points. The algorithm works as follows:\n"
85 " 1. Bin the data with the maximum number of bins.\n"
86 " 2. If no two adjacent bins are populated, use this grouping to compute\n"
87 " centroids for each subset, providing delta values.\n"
88 " 3. If two adjacent bins are populated, multiply the number of bins by\n"
89 " adjustFactor (default: 0.9) and repeat the process.\n"
90 " Alternatively, you can provide a guess for the grid spacing;\n"
91 " the algorithm will use 1/10 of this as the initial bin size.\n"
92 " -makeParameters\n"
93 " Store grid parameters in the output file as parameters named\n"
94 " <name>Minimum, <name>Maximum, <name>Interval, and <name>Dimension.\n"
95 " -verbose Report the computed deltas and the number of grid points.\n"
96 "\n"
97 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
98
99void SnapDataToGrid(double *data, int64_t rows, COLUMN_TO_SNAP *column, int verbose, double *min, double *max, int64_t *points);
100void AddParameterDefinitions(SDDS_DATASET *SDDSout, SDDS_DATASET *SDDSin, COLUMN_TO_SNAP *column, long nColumns);
101void StoreGridParameters(SDDS_DATASET *SDDSout, char *column, double min, double max, int64_t points);
102
103int main(int argc, char **argv) {
104 int iArg, iColumn, verbose;
105 char *input, *output;
106 unsigned long pipeFlags;
107 SCANNED_ARG *scanned;
108 SDDS_DATASET SDDSin, SDDSout;
109 COLUMN_TO_SNAP *column;
110 long nColumns;
111 double *data;
112 int64_t rows;
113 short makeParameters;
114 double min, max;
115 int64_t points;
116
118 argc = scanargs(&scanned, argc, argv);
119 if (argc < 3)
120 bomb(NULL, USAGE);
121
122 nColumns = 0;
123 column = NULL;
124 input = output = NULL;
125 pipeFlags = 0;
126 verbose = 0;
127 makeParameters = 0;
128
129 for (iArg = 1; iArg < argc; iArg++) {
130 if (scanned[iArg].arg_type == OPTION) {
131 /* process options here */
132 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
133 case CLO_COLUMN:
134 column = SDDS_Realloc(column, sizeof(*column) * (nColumns + 1));
135 column[nColumns].name = scanned[iArg].list[1];
136 column[nColumns].maximumBins = -1;
137 column[nColumns].binFactor = 10;
138 column[nColumns].deltaGuess = -1;
139 column[nColumns].flags = COLUMN_BIN_FACTOR;
140 column[nColumns].adjustFactor = 0.9;
141 scanned[iArg].n_items -= 2;
142 if (scanned[iArg].n_items) {
143 if (!scanItemList(&(column[nColumns].flags), scanned[iArg].list + 2, &scanned[iArg].n_items, 0,
144 "maximumbins", SDDS_LONG, &(column[nColumns].maximumBins), 1, COLUMN_MAXIMUM_BINS,
145 "binfactor", SDDS_LONG, &(column[nColumns].binFactor), 1, COLUMN_BIN_FACTOR,
146 "deltaguess", SDDS_DOUBLE, &(column[nColumns].deltaGuess), 1, COLUMN_DELTA_GUESS,
147 "adjustfactor", SDDS_DOUBLE, &(column[nColumns].adjustFactor), 1, COLUMN_ADJUST_FACTOR,
148 NULL)) {
149 SDDS_Bomb("invalid -column syntax");
150 }
151 if (column[nColumns].flags & COLUMN_ADJUST_FACTOR) {
152 if (column[nColumns].adjustFactor <= 0 || column[nColumns].adjustFactor >= 1)
153 SDDS_Bomb("invalid -column syntax. adjustFactor must be (0,1)");
154 }
155 if (column[nColumns].flags & COLUMN_DELTA_GUESS) {
156 if (column[nColumns].flags & ~(COLUMN_DELTA_GUESS | COLUMN_ADJUST_FACTOR))
157 SDDS_Bomb("invalid -column syntax. Can't combine deltaGuess with other options.");
158 if (column[nColumns].deltaGuess <= 0)
159 SDDS_Bomb("invalid -column syntax. deltaGuess<=0.");
160 } else {
161 if (column[nColumns].flags & COLUMN_BIN_FACTOR && column[nColumns].flags & COLUMN_MAXIMUM_BINS)
162 SDDS_Bomb("invalid -column syntax. Can't give minimumBins and maximumBins with binFactor");
163 if (!(column[nColumns].flags & COLUMN_BIN_FACTOR) && !(column[nColumns].flags & COLUMN_MAXIMUM_BINS))
164 SDDS_Bomb("invalid -column syntax. Give maximumBins or binFactor");
165 if (column[nColumns].flags & COLUMN_BIN_FACTOR && column[nColumns].binFactor < 1)
166 SDDS_Bomb("invalid -column syntax. binFactor<1");
167 }
168 }
169 nColumns++;
170 break;
171 case CLO_PIPE:
172 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
173 SDDS_Bomb("invalid -pipe syntax");
174 break;
175 case CLO_VERBOSE:
176 verbose = 1;
177 break;
178 case CLO_MAKE_PARAMETERS:
179 makeParameters = 1;
180 break;
181 default:
182 fprintf(stderr, "error: unknown/ambiguous option: %s\n", scanned[iArg].list[0]);
183 exit(EXIT_FAILURE);
184 break;
185 }
186 } else {
187 if (!input)
188 input = scanned[iArg].list[0];
189 else if (!output)
190 output = scanned[iArg].list[0];
191 else
192 SDDS_Bomb("too many filenames seen");
193 }
194 }
195
196 processFilenames("sddssnap2grid", &input, &output, pipeFlags, 0, NULL);
197
198 if (!SDDS_InitializeInput(&SDDSin, input))
199 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
200
201 if (!SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
202 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
203 if (makeParameters)
204 AddParameterDefinitions(&SDDSout, &SDDSin, column, nColumns);
205 if (!SDDS_WriteLayout(&SDDSout))
206 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
207
208 while (SDDS_ReadPage(&SDDSin) > 0) {
209 rows = SDDS_CountRowsOfInterest(&SDDSin);
210 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
211 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
212 for (iColumn = 0; iColumn < nColumns; iColumn++) {
213 if (!(data = SDDS_GetColumnInDoubles(&SDDSout, column[iColumn].name)))
214 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
215 SnapDataToGrid(data, rows, column + iColumn, verbose, &min, &max, &points);
216 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, data, rows, column[iColumn].name))
217 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
218 if (makeParameters)
219 StoreGridParameters(&SDDSout, column[iColumn].name, min, max, points);
220 free(data);
221 }
222 if (!SDDS_WritePage(&SDDSout))
223 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
224 }
225
226 if (!SDDS_Terminate(&SDDSin)) {
227 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
228 exit(EXIT_FAILURE);
229 }
230 if (!SDDS_Terminate(&SDDSout)) {
231 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
232 exit(EXIT_FAILURE);
233 }
234
235 return EXIT_SUCCESS;
236}
237
238void SnapDataToGrid(double *data, int64_t rows, COLUMN_TO_SNAP *column, int verbose,
239 double *minReturn, double *maxReturn, int64_t *pointsReturn) {
240 int64_t i, bins, centroids;
241 double min, max, hmin, hmax, span, middle;
242 char s[16384];
243 double *histogram, *whistogram, *centroid, delta;
244
245 find_min_max(&min, &max, data, rows);
246 if ((span = max - min) <= 0)
247 return;
248
249 /* Add some buffer space at the ends for histogramming */
250 span *= 1 + 2.0 / rows;
251 middle = (max + min) / 2;
252 hmin = middle - span / 2;
253 hmax = middle + span / 2;
254
255 bins = 0; /* prevent compiler warning */
256 if (column->flags & COLUMN_DELTA_GUESS)
257 bins = (hmax - hmin) / (column->deltaGuess / 10);
258 else if (column->flags & COLUMN_MAXIMUM_BINS)
259 bins = column->maximumBins;
260 else if (column->flags & COLUMN_BIN_FACTOR)
261 bins = rows * column->binFactor;
262 else
263 SDDS_Bomb("logic error. Missing flags for determination of maximum number of bins.");
264
265 if (verbose) {
266 printf("Working on %s with %" PRId64 " bins, span=%le, hmin=%le, hmax=%le\n",
267 column->name, bins, span, hmin, hmax);
268 fflush(stdout);
269 }
270
271 while (bins >= 2) {
272 histogram = calloc(bins, sizeof(*histogram));
273 whistogram = calloc(bins, sizeof(*whistogram));
274 if (verbose) {
275 printf("Histogramming %s with %" PRId64 " bins\n", column->name, bins);
276 fflush(stdout);
277 }
278 make_histogram(histogram, bins, hmin, hmax, data, rows, 1);
279 for (i = 1; i < bins; i++) {
280 if (histogram[i] && histogram[i - 1])
281 break;
282 }
283 if (i != bins) {
284 /* indicates that two adjacent bins are occupied, so we need more coarse grouping */
285 free(histogram);
286 bins *= column->adjustFactor;
287 continue;
288 }
289 if (bins <= 1) {
290 snprintf(s, 16384, "Unable to snap data for %s to grid\n", column->name);
291 SDDS_Bomb(s);
292 }
293
294 /* self-weighted histogram gives centroid*number in each bin */
295 make_histogram_weighted(whistogram, bins, hmin, hmax, data, rows, 1, data);
296 centroid = tmalloc(sizeof(*centroid) * bins);
297 centroids = 0;
298 for (i = 0; i < bins; i++) {
299 if (histogram[i]) {
300 /*
301 printf("centroid[%ld] = %le (%le/%le)\n", centroids, whistogram[i]/histogram[i],
302 whistogram[i], histogram[i]);
303 */
304 centroid[centroids++] = whistogram[i] / histogram[i];
305 }
306 }
307 free(histogram);
308 free(whistogram);
309 delta = (centroid[centroids - 1] - centroid[0]) / (centroids - 1);
310 /* assign new value using the computed delta and the first centroid as the first value */
311 for (i = 0; i < rows; i++)
312 data[i] = ((long)((data[i] - centroid[0]) / delta + 0.5)) * delta + centroid[0];
313 if (verbose) {
314 printf("Completed work for %s: delta = %le, start = %le, locations = %" PRId64 "\n",
315 column->name, delta, centroid[0], centroids);
316 fflush(stdout);
317 }
318 *minReturn = centroid[0];
319 *pointsReturn = centroids;
320 *maxReturn = centroid[0] + (centroids - 1) * delta;
321 free(centroid);
322 return;
323 }
324}
325
326#define BUFLEN 16834
327void AddParameterDefinitions(SDDS_DATASET *SDDSout, SDDS_DATASET *SDDSin, COLUMN_TO_SNAP *column, long nColumns) {
328 long icol;
329 char name1[BUFLEN], name2[BUFLEN], name3[BUFLEN], name4[BUFLEN];
330
331 for (icol = 0; icol < nColumns; icol++) {
332 snprintf(name1, BUFLEN, "%sMinimum", column[icol].name);
333 snprintf(name2, BUFLEN, "%sMaximum", column[icol].name);
334 snprintf(name3, BUFLEN, "%sInterval", column[icol].name);
335 snprintf(name4, BUFLEN, "%sDimension", column[icol].name);
336 if (!SDDS_DefineParameterLikeColumn(SDDSout, SDDSin, column[icol].name, name1) ||
337 !SDDS_DefineParameterLikeColumn(SDDSout, SDDSin, column[icol].name, name2) ||
338 !SDDS_DefineParameterLikeColumn(SDDSout, SDDSin, column[icol].name, name3) ||
339 !SDDS_DefineSimpleParameter(SDDSout, name4, NULL, SDDS_LONG64))
340 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
341 }
342}
343
344void StoreGridParameters(SDDS_DATASET *SDDSout, char *column, double min, double max, int64_t points) {
345 char name1[BUFLEN], name2[BUFLEN], name3[BUFLEN], name4[BUFLEN];
346
347 snprintf(name1, BUFLEN, "%sMinimum", column);
348 snprintf(name2, BUFLEN, "%sMaximum", column);
349 snprintf(name3, BUFLEN, "%sInterval", column);
350 snprintf(name4, BUFLEN, "%sDimension", column);
351 if (!SDDS_SetParameters(SDDSout, SDDS_SET_BY_NAME | SDDS_PASS_BY_VALUE,
352 name1, min,
353 name2, max,
354 name3, points > 1 ? (max - min) / (points - 1) : -1,
355 name4, points,
356 NULL))
357 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
358}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_DefineSimpleParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *unit, int32_t type)
Defines a simple data parameter within the SDDS dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameterLikeColumn(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Defines a parameter in the target dataset based on a column definition from the source dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
#define SDDS_LONG64
Identifier for the signed 64-bit integer data type.
Definition SDDStypes.h:49
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long make_histogram_weighted(double *hist, long n_bins, double lo, double hi, double *data, long n_pts, long new_start, double *weight)
Compiles a weighted histogram from data points.
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.