SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsduplicate.c
Go to the documentation of this file.
1/**
2 * @file sddsduplicate.c
3 * @brief A program for duplicating rows in an SDDS file based on a weight column or fixed duplication factors.
4 *
5 * @details
6 * This program duplicates rows in an SDDS file and creates a new output file. The number of duplicates can be
7 * determined either by a weight column or by a fixed duplication factor. Users can specify minimum and maximum
8 * duplication factors, enable probabilistic duplication, and control verbosity settings. The program also supports
9 * input/output through pipes for integration into data processing pipelines.
10 *
11 * @section Usage
12 * ```
13 * sddsduplicate [<input>] [<output>]
14 * [-pipe=[input][,output]]
15 * [-weight=<columnName>]
16 * [-minFactor=<integer>]
17 * [-maxFactor=<integer>]
18 * [-factor=<integer>]
19 * [-probabilistic]
20 * [-seed=<integer>]
21 * [-verbosity[=<level>]]
22 * ```
23 *
24 * @section Options
25 * | Option | Description |
26 * |-------------------|----------------------------------------------------------------------------------|
27 * | `-pipe` | Use pipes for input and/or output. |
28 * | `-weight` | Specify the column to use for weighting the number of duplicates. |
29 * | `-minFactor` | Set the minimum number of rows to emit, scaling weights accordingly. |
30 * | `-maxFactor` | Set the maximum number of rows to emit, scaling weights accordingly. |
31 * | `-factor` | Specify a fixed number of duplicates to create. Mutually exclusive with `-weight`|
32 * | `-probabilistic` | Treat fractional duplication counts as probabilities. |
33 * | `-seed` | Set the random number generator seed (default: system clock). |
34 * | `-verbosity` | Set verbosity level for detailed output. |
35 *
36 * @subsection Incompatibilities
37 * - `-weight` is incompatible with:
38 * - `-factor`
39 * - `-minFactor` and `-maxFactor` only work with:
40 * - `-weight`
41 * - Only one of the following may be specified:
42 * - `-minFactor`
43 * - `-maxFactor`
44 *
45 * @copyright
46 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
47 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
48 *
49 * @license
50 * This file is distributed under the terms of the Software License Agreement
51 * found in the file LICENSE included with this distribution.
52 *
53 * @authors
54 * M. Borland, R. Soliday
55 */
56
57#include "mdb.h"
58#include "scan.h"
59#include "SDDS.h"
60
/*
 * Identifiers for the command-line switches understood by this program.
 * main() switches on the index returned by match_string() over the option[]
 * keyword table, so the enumerators must stay in the same order as that table.
 * N_OPTIONS is the number of switches, not a switch itself.
 */
typedef enum {
  SET_WEIGHT = 0,    /* -weight=<columnName> */
  SET_PIPE,          /* -pipe=[input][,output] */
  SET_MAXFACTOR,     /* -maxFactor=<integer> */
  SET_MINFACTOR,     /* -minFactor=<integer> */
  SET_FACTOR,        /* -factor=<integer> */
  SET_VERBOSITY,     /* -verbosity[=<level>] */
  SET_SEED,          /* -seed=<integer> */
  SET_PROBABILISTIC, /* -probabilistic */
  N_OPTIONS
} OptionType;
73
74// String representations of each option for command-line parsing
75static char *option[N_OPTIONS] = {
76 "weight",
77 "pipe",
78 "maxfactor",
79 "minfactor",
80 "factor",
81 "verbosity",
82 "seed",
83 "probabilistic",
84};
85
86// Usage message displayed when the program is invoked incorrectly
87static char *usage =
88 "sddsduplicate [<input>] [<output>]\n"
89 " [-pipe=[input][,output]]\n"
90 " [-weight=<columnName>]\n"
91 " [-minFactor=<integer>]\n"
92 " [-maxFactor=<integer>]\n"
93 " [-factor=<integer>]\n"
94 " [-probabilistic]\n"
95 " [-seed=<integer>]\n"
96 " [-verbosity[=<level>]]\n"
97 "Options:\n"
98 " -pipe=[input][,output]\n"
99 " Use pipes for input and/or output.\n\n"
100 " -weight=<columnName>\n"
101 " Name of a column to use for weighting the number of duplicates.\n\n"
102 " -minFactor=<integer>\n"
103 " Minimum number of rows to emit. Results in scaling of weights.\n\n"
104 " -maxFactor=<integer>\n"
105 " Maximum number of rows to emit. Results in scaling of weights.\n"
106 " In some cases, input rows will not appear in the output file because\n"
107 " the weight is less than 1.\n\n"
108 " -factor=<integer>\n"
109 " Number of duplicates to create. Incompatible with -weight.\n\n"
110 " -probabilistic\n"
111 " Treat fractional duplication counts as probabilities.\n\n"
112 " -seed=<integer>\n"
113 " Set the seed for random number generation. By default, the\n"
114 " system clock is used.\n\n"
115 " -verbosity[=<level>]\n"
116 " Set verbosity level.\n\n"
117 "This program duplicates rows in the input file and creates a new file.\n"
118 "The number of duplicates is determined either by a weight column or\n"
119 "by a fixed value.\n\n"
120 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
121
122int main(int argc, char **argv) {
123 SDDS_DATASET sdds_input, sdds_output;
124 char *inputfile = NULL, *outputfile = NULL;
125 long i_arg, verbosity = 0;
126 SCANNED_ARG *s_arg;
127 unsigned long pipe_flags = 0;
128 char *weight_column_name = NULL;
129 double *weight_data = NULL, min_weight, max_weight;
130 double *dup_value = NULL;
131 long max_factor = 0, min_factor = 0, dup_rows = 0;
132 long random_number_seed = 0;
133 int64_t i, j, input_rows, stored_rows;
134 short probabilistic = 0;
135
136 // Register the program name for error messages
138 argc = scanargs(&s_arg, argc, argv);
139 if (argc < 3)
140 bomb(NULL, usage); // Ensure sufficient arguments
141
142 // Parse command-line arguments
143 for (i_arg = 1; i_arg < argc; i_arg++) {
144 if (s_arg[i_arg].arg_type == OPTION) {
145 // Match the option and process accordingly
146 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
147 case SET_PIPE:
148 // Process pipe-related options
149 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
150 SDDS_Bomb("invalid -pipe syntax");
151 break;
152 case SET_WEIGHT:
153 // Specify the column name for weighting
154 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
155 bomb("invalid -weight syntax", usage);
156 break;
157 case SET_FACTOR:
158 // Specify a fixed number of duplicates
159 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &dup_rows) != 1 || dup_rows <= 0)
160 bomb("invalid -rows syntax", usage);
161 break;
162 case SET_MINFACTOR:
163 // Specify the minimum duplication factor
164 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &min_factor) != 1 || min_factor <= 0)
165 bomb("invalid -minFactor syntax", usage);
166 break;
167 case SET_MAXFACTOR:
168 // Specify the maximum duplication factor
169 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &max_factor) != 1 || max_factor <= 0)
170 bomb("invalid -maxFactor syntax", usage);
171 break;
172 case SET_VERBOSITY:
173 // Set the verbosity level
174 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &verbosity) != 1 || verbosity < 0)
175 bomb("invalid -verbosity syntax", usage);
176 break;
177 case SET_PROBABILISTIC:
178 // Enable probabilistic duplication
179 probabilistic = 1;
180 if (s_arg[i_arg].n_items != 1)
181 bomb("invalid -probabilistic syntax", usage);
182 break;
183 case SET_SEED:
184 // Set the random number generator seed
185 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &random_number_seed) != 1 || random_number_seed < 0)
186 bomb("invalid -seed syntax", usage);
187 break;
188 default:
189 // Unrecognized option
190 bomb("unrecognized option", usage);
191 break;
192 }
193 } else {
194 // Process input and output file arguments
195 if (!inputfile)
196 inputfile = s_arg[i_arg].list[0];
197 else if (!outputfile)
198 outputfile = s_arg[i_arg].list[0];
199 else
200 SDDS_Bomb("too many filenames");
201 }
202 }
203
204 // Ensure only one of minFactor or maxFactor is set
205 if (min_factor && max_factor)
206 SDDS_Bomb("give only one of -minFactor and -maxFactor");
207
208 // Process file names and pipe configurations
209 processFilenames("sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
210
211 // Initialize random number generator
212 if (random_number_seed == 0) {
213 random_number_seed = (long)time(NULL); // Use system clock if no seed provided
214 random_number_seed = 2 * (random_number_seed / 2) + 1; // Ensure odd seed
215#if defined(_WIN32) || defined(__APPLE__)
216 random_1(-labs(random_number_seed));
217#else
218 random_1(-FABS(random_number_seed));
219#endif
220 } else {
221 random_1(-random_number_seed); // Use specified seed
222 }
223
224 // Initialize SDDS datasets
225 if (!SDDS_InitializeInput(&sdds_input, inputfile) ||
226 !SDDS_InitializeCopy(&sdds_output, &sdds_input, outputfile, "w") ||
227 !SDDS_WriteLayout(&sdds_output)) {
228 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
229 }
230
231 // Main loop for reading and duplicating rows
232 while (SDDS_ReadPage(&sdds_input) > 0) {
233 input_rows = SDDS_RowCount(&sdds_input); // Get number of rows in current page
234 if (input_rows > 0) {
235 dup_value = tmalloc(sizeof(*dup_value) * input_rows); // Allocate duplication array
236
237 // Handle weighting logic if a weight column is specified
238 if (weight_column_name) {
239 if (!(weight_data = SDDS_GetColumnInDoubles(&sdds_input, weight_column_name))) {
240 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
241 }
242
243 // Scale weights based on minFactor or maxFactor
244 if (min_factor) {
245 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
246 if (min_weight <= 0)
247 SDDS_Bomb("Minimum weight value is nonpositive. Can't use -minFactor.");
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = weight_data[i] * min_factor / min_weight;
250 } else if (max_factor) {
251 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
252 if (max_weight <= 0)
253 SDDS_Bomb("Maximum weight value is nonpositive. Can't use -maxFactor.");
254 for (i = 0; i < input_rows; i++)
255 dup_value[i] = weight_data[i] * max_factor / max_weight;
256 } else {
257 for (i = 0; i < input_rows; i++)
258 dup_value[i] = weight_data[i];
259 }
260
261 // Apply probabilistic logic for fractional duplication counts
262 if (probabilistic) {
263 double fraction;
264 for (i = 0; i < input_rows; i++) {
265 fraction = dup_value[i] - ((long)dup_value[i]);
266 dup_value[i] = (long)dup_value[i];
267 if (fraction > random_1(0))
268 dup_value[i] += 1;
269 }
270 } else {
271 for (i = 0; i < input_rows; i++)
272 dup_value[i] = (long)dup_value[i];
273 }
274 } else {
275 // Use fixed duplication factor if no weight column is provided
276 for (i = 0; i < input_rows; i++)
277 dup_value[i] = dup_rows;
278 }
279
280 // Count total rows to be stored after duplication
281 stored_rows = 0;
282 for (i = 0; i < input_rows; i++)
283 stored_rows += (int64_t)dup_value[i];
284
285 // Print duplication summary if verbosity is enabled
286 if (verbosity) {
287 int64_t max_dup = 0, min_dup = INT64_MAX;
288 for (i = 0; i < input_rows; i++) {
289 if (max_dup < dup_value[i])
290 max_dup = dup_value[i];
291 if (min_dup > dup_value[i])
292 min_dup = dup_value[i];
293 }
294 fprintf(stderr, "%" PRId64 " output rows, minimum and maximum duplication factor: %" PRId64 ", %" PRId64 "\n",
295 stored_rows, min_dup, max_dup);
296 }
297
298 // Start a new SDDS page and copy data
299 if (!SDDS_StartPage(&sdds_output, stored_rows) ||
300 !SDDS_CopyParameters(&sdds_output, &sdds_input) ||
301 !SDDS_CopyArrays(&sdds_output, &sdds_input)) {
302 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
303 }
304
305 // Duplicate rows into the output dataset
306 stored_rows = 0;
307 for (i = 0; i < input_rows; i++) {
308 for (j = 0; j < dup_value[i]; j++) {
309 if (SDDS_CopyRowDirect(&sdds_output, stored_rows++, &sdds_input, i)) {
310 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
311 }
312 }
313 }
314
315 // Write the completed page to the output file
316 if (!SDDS_WritePage(&sdds_output)) {
317 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
318 }
319
320 free(dup_value); // Free allocated memory for duplication array
321 dup_value = NULL;
322 }
323 }
324
325 // Terminate SDDS datasets and close files
326 if (!SDDS_Terminate(&sdds_input) || !SDDS_Terminate(&sdds_output)) {
327 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
328 }
329
330 return 0; // Exit successfully
331}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:286
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
Definition SDDS_copy.c:834
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:334
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double random_1(long iseed)
Generate a uniform random double in [0,1] using a custom seed initialization.
Definition drand.c:175
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390