SDDSlib
Loading...
Searching...
No Matches
sddsduplicate.c
Go to the documentation of this file.
1/**
2 * @file sddsduplicate.c
3 * @brief A program for duplicating rows in a file based on a weight column.
4 *
5 * This program duplicates rows in an SDDS file and creates a new file. The number of
6 * duplicates is determined either by a weight column or by a fixed value. The program
7 * provides options for minimum and maximum factors, probabilistic duplication, and
8 * verbosity settings. The output can be directed through pipes, making it suitable for
9 * use in larger data processing pipelines.
10 *
11 * @details
12 * ### Usage:
13 * `sddsduplicate [<input>] [<output>] [options]`
14 *
15 * ### Options:
16 * - `-pipe=[input][,output]` Use pipes for input and/or output.
17 * - `-weight=<columnName>` Specify the column to use for weighting the number of duplicates.
18 * - `-minFactor=<integer>` Set the minimum number of rows to emit, scaling weights accordingly.
19 * - `-maxFactor=<integer>` Set the maximum number of rows to emit, scaling weights accordingly.
20 * - `-factor=<integer>` Specify the fixed number of duplicates to create (incompatible with `-weight`).
21 * - `-probabilistic` Treat fractional duplication counts as probabilities.
22 * - `-seed=<integer>` Set the random number generator seed (default: system clock).
23 * - `-verbosity[=<level>]` Set verbosity level for detailed output.
24 *
25 * ### Example:
26 * `sddsduplicate input.sdds output.sdds -weight=Population -maxFactor=10`
27 *
28 * This command creates `output.sdds` by duplicating rows from `input.sdds` based on
29 * the `Population` column, scaling values such that the maximum duplication factor is 10.
30 *
31 * @copyright
32 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
33 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
34 *
35 * @license
36 * This file is distributed under the terms of the Software License Agreement
37 * found in the file LICENSE included with this distribution.
38 *
39 * @author M. Borland, R. Soliday
40 */
41
42#include "mdb.h"
43#include "scan.h"
44#include "SDDS.h"
45
/**
 * @brief Indices of the recognized command-line options.
 *
 * The enumerators are used as indices into the option keyword table, so the
 * order here must match the order of the strings in that table. N_OPTIONS is
 * not an option itself; it is the number of options and sizes the table.
 */
typedef enum {
  SET_WEIGHT,
  SET_PIPE,
  SET_MAXFACTOR,
  SET_MINFACTOR,
  SET_FACTOR,
  SET_VERBOSITY,
  SET_SEED,
  SET_PROBABILISTIC,
  N_OPTIONS
} OptionType;
58
59// String representations of each option for command-line parsing
60static char *option[N_OPTIONS] = {
61 "weight",
62 "pipe",
63 "maxfactor",
64 "minfactor",
65 "factor",
66 "verbosity",
67 "seed",
68 "probabilistic",
69};
70
71// Usage message displayed when the program is invoked incorrectly
72static char *usage =
73 "Usage: sddsduplicate [<input>] [<output>] [options]\n\n"
74 "Options:\n"
75 " -pipe=[input][,output]\n"
76 " Use pipes for input and/or output.\n\n"
77 " -weight=<columnName>\n"
78 " Name of a column to use for weighting the number of duplicates.\n\n"
79 " -minFactor=<integer>\n"
80 " Minimum number of rows to emit. Results in scaling of weights.\n\n"
81 " -maxFactor=<integer>\n"
82 " Maximum number of rows to emit. Results in scaling of weights.\n"
83 " In some cases, input rows will not appear in the output file because\n"
84 " the weight is less than 1.\n\n"
85 " -factor=<integer>\n"
86 " Number of duplicates to create. Incompatible with -weight.\n\n"
87 " -probabilistic\n"
88 " Treat fractional duplication counts as probabilities.\n\n"
89 " -seed=<integer>\n"
90 " Set the seed for random number generation. By default, the\n"
91 " system clock is used.\n\n"
92 " -verbosity[=<level>]\n"
93 " Set verbosity level.\n\n"
94 "This program duplicates rows in the input file and creates a new file.\n"
95 "The number of duplicates is determined either by a weight column or\n"
96 "by a fixed value.\n\n"
97 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
98
99int main(int argc, char **argv) {
100 SDDS_DATASET sdds_input, sdds_output;
101 char *inputfile = NULL, *outputfile = NULL;
102 long i_arg, verbosity = 0;
103 SCANNED_ARG *s_arg;
104 unsigned long pipe_flags = 0;
105 char *weight_column_name = NULL;
106 double *weight_data = NULL, min_weight, max_weight;
107 double *dup_value = NULL;
108 long max_factor = 0, min_factor = 0, dup_rows = 0;
109 long random_number_seed = 0;
110 int64_t i, j, input_rows, stored_rows;
111 short probabilistic = 0;
112
113 // Register the program name for error messages
115 argc = scanargs(&s_arg, argc, argv);
116 if (argc < 3)
117 bomb(NULL, usage); // Ensure sufficient arguments
118
119 // Parse command-line arguments
120 for (i_arg = 1; i_arg < argc; i_arg++) {
121 if (s_arg[i_arg].arg_type == OPTION) {
122 // Match the option and process accordingly
123 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
124 case SET_PIPE:
125 // Process pipe-related options
126 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
127 SDDS_Bomb("invalid -pipe syntax");
128 break;
129 case SET_WEIGHT:
130 // Specify the column name for weighting
131 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
132 bomb("invalid -weight syntax", usage);
133 break;
134 case SET_FACTOR:
135 // Specify a fixed number of duplicates
136 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &dup_rows) != 1 || dup_rows <= 0)
137 bomb("invalid -rows syntax", usage);
138 break;
139 case SET_MINFACTOR:
140 // Specify the minimum duplication factor
141 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &min_factor) != 1 || min_factor <= 0)
142 bomb("invalid -minFactor syntax", usage);
143 break;
144 case SET_MAXFACTOR:
145 // Specify the maximum duplication factor
146 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &max_factor) != 1 || max_factor <= 0)
147 bomb("invalid -maxFactor syntax", usage);
148 break;
149 case SET_VERBOSITY:
150 // Set the verbosity level
151 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &verbosity) != 1 || verbosity < 0)
152 bomb("invalid -verbosity syntax", usage);
153 break;
154 case SET_PROBABILISTIC:
155 // Enable probabilistic duplication
156 probabilistic = 1;
157 if (s_arg[i_arg].n_items != 1)
158 bomb("invalid -probabilistic syntax", usage);
159 break;
160 case SET_SEED:
161 // Set the random number generator seed
162 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &random_number_seed) != 1 || random_number_seed < 0)
163 bomb("invalid -seed syntax", usage);
164 break;
165 default:
166 // Unrecognized option
167 bomb("unrecognized option", usage);
168 break;
169 }
170 } else {
171 // Process input and output file arguments
172 if (!inputfile)
173 inputfile = s_arg[i_arg].list[0];
174 else if (!outputfile)
175 outputfile = s_arg[i_arg].list[0];
176 else
177 SDDS_Bomb("too many filenames");
178 }
179 }
180
181 // Ensure only one of minFactor or maxFactor is set
182 if (min_factor && max_factor)
183 SDDS_Bomb("give only one of -minFactor and -maxFactor");
184
185 // Process file names and pipe configurations
186 processFilenames("sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
187
188 // Initialize random number generator
189 if (random_number_seed == 0) {
190 random_number_seed = (long)time(NULL); // Use system clock if no seed provided
191 random_number_seed = 2 * (random_number_seed / 2) + 1; // Ensure odd seed
192#if defined(_WIN32) || defined(darwin)
193 random_1(-labs(random_number_seed));
194#else
195 random_1(-FABS(random_number_seed));
196#endif
197 } else {
198 random_1(-random_number_seed); // Use specified seed
199 }
200
201 // Initialize SDDS datasets
202 if (!SDDS_InitializeInput(&sdds_input, inputfile) ||
203 !SDDS_InitializeCopy(&sdds_output, &sdds_input, outputfile, "w") ||
204 !SDDS_WriteLayout(&sdds_output)) {
205 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
206 }
207
208 // Main loop for reading and duplicating rows
209 while (SDDS_ReadPage(&sdds_input) > 0) {
210 input_rows = SDDS_RowCount(&sdds_input); // Get number of rows in current page
211 if (input_rows > 0) {
212 dup_value = tmalloc(sizeof(*dup_value) * input_rows); // Allocate duplication array
213
214 // Handle weighting logic if a weight column is specified
215 if (weight_column_name) {
216 if (!(weight_data = SDDS_GetColumnInDoubles(&sdds_input, weight_column_name))) {
217 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
218 }
219
220 // Scale weights based on minFactor or maxFactor
221 if (min_factor) {
222 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
223 if (min_weight <= 0)
224 SDDS_Bomb("Minimum weight value is nonpositive. Can't use -minFactor.");
225 for (i = 0; i < input_rows; i++)
226 dup_value[i] = weight_data[i] * min_factor / min_weight;
227 } else if (max_factor) {
228 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
229 if (max_weight <= 0)
230 SDDS_Bomb("Maximum weight value is nonpositive. Can't use -maxFactor.");
231 for (i = 0; i < input_rows; i++)
232 dup_value[i] = weight_data[i] * max_factor / max_weight;
233 } else {
234 for (i = 0; i < input_rows; i++)
235 dup_value[i] = weight_data[i];
236 }
237
238 // Apply probabilistic logic for fractional duplication counts
239 if (probabilistic) {
240 double fraction;
241 for (i = 0; i < input_rows; i++) {
242 fraction = dup_value[i] - ((long)dup_value[i]);
243 dup_value[i] = (long)dup_value[i];
244 if (fraction > random_1(0))
245 dup_value[i] += 1;
246 }
247 } else {
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = (long)dup_value[i];
250 }
251 } else {
252 // Use fixed duplication factor if no weight column is provided
253 for (i = 0; i < input_rows; i++)
254 dup_value[i] = dup_rows;
255 }
256
257 // Count total rows to be stored after duplication
258 stored_rows = 0;
259 for (i = 0; i < input_rows; i++)
260 stored_rows += (int64_t)dup_value[i];
261
262 // Print duplication summary if verbosity is enabled
263 if (verbosity) {
264 int64_t max_dup = 0, min_dup = INT64_MAX;
265 for (i = 0; i < input_rows; i++) {
266 if (max_dup < dup_value[i])
267 max_dup = dup_value[i];
268 if (min_dup > dup_value[i])
269 min_dup = dup_value[i];
270 }
271 fprintf(stderr, "%" PRId64 " output rows, minimum and maximum duplication factor: %" PRId64 ", %" PRId64 "\n",
272 stored_rows, min_dup, max_dup);
273 }
274
275 // Start a new SDDS page and copy data
276 if (!SDDS_StartPage(&sdds_output, stored_rows) ||
277 !SDDS_CopyParameters(&sdds_output, &sdds_input) ||
278 !SDDS_CopyArrays(&sdds_output, &sdds_input)) {
279 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
280 }
281
282 // Duplicate rows into the output dataset
283 stored_rows = 0;
284 for (i = 0; i < input_rows; i++) {
285 for (j = 0; j < dup_value[i]; j++) {
286 if (SDDS_CopyRowDirect(&sdds_output, stored_rows++, &sdds_input, i)) {
287 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
288 }
289 }
290 }
291
292 // Write the completed page to the output file
293 if (!SDDS_WritePage(&sdds_output)) {
294 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
295 }
296
297 free(dup_value); // Free allocated memory for duplication array
298 dup_value = NULL;
299 }
300 }
301
302 // Terminate SDDS datasets and close files
303 if (!SDDS_Terminate(&sdds_input) || !SDDS_Terminate(&sdds_output)) {
304 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
305 }
306
307 return 0; // Exit successfully
308}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:286
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
Definition SDDS_copy.c:834
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:334
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double random_1(long iseed)
Generate a uniform random double in [0,1] using a custom seed initialization.
Definition drand.c:175
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
OptionType
Enumeration for command-line options.