SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsduplicate.c File Reference

Detailed Description

A program for duplicating rows in an SDDS file based on a weight column or fixed duplication factors.

This program duplicates rows in an SDDS file and creates a new output file. The number of duplicates for each row is determined either by a weight column or by a fixed duplication factor. When a weight column is used, users can rescale the weights by giving either a minimum or a maximum duplication factor (not both) and can enable probabilistic handling of fractional duplication counts. Verbosity can also be controlled. The program supports input/output through pipes for integration into data processing pipelines.

Usage

sddsduplicate [<input>] [<output>]
[-pipe=[input][,output]]
[-weight=<columnName>]
[-minFactor=<integer>]
[-maxFactor=<integer>]
[-factor=<integer>]
[-probabilistic]
[-seed=<integer>]
[-verbosity[=<level>]]

Options

Option Description
-pipe Use pipes for input and/or output.
-weight Specify the column to use for weighting the number of duplicates.
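-minFactor Scale the weights so that the row with the smallest weight is duplicated this many times. Requires -weight and a strictly positive minimum weight.
-maxFactor Scale the weights so that the row with the largest weight is duplicated this many times. Requires -weight and a strictly positive maximum weight.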
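-factor Specify a fixed number of duplicates to create for every row. Mutually exclusive with -weight.
-probabilistic Treat the fractional part of each weighted duplication count as the probability of emitting one additional copy of the row. Without this option the fractional part is discarded.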
-seed Set the random number generator seed (default: system clock).
-verbosity Set verbosity level for detailed output.

Incompatibilities

  • -weight is incompatible with:
    • -factor
  • -minFactor and -maxFactor only work with:
    • -weight
  • Only one of the following may be specified:
    • -minFactor
    • -maxFactor
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Authors
M. Borland, R. Soliday

Definition in file sddsduplicate.c.

#include "mdb.h"
#include "scan.h"
#include "SDDS.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int argc,
char ** argv )

Definition at line 122 of file sddsduplicate.c.

122 {
123 SDDS_DATASET sdds_input, sdds_output;
124 char *inputfile = NULL, *outputfile = NULL;
125 long i_arg, verbosity = 0;
126 SCANNED_ARG *s_arg;
127 unsigned long pipe_flags = 0;
128 char *weight_column_name = NULL;
129 double *weight_data = NULL, min_weight, max_weight;
130 double *dup_value = NULL;
131 long max_factor = 0, min_factor = 0, dup_rows = 0;
132 long random_number_seed = 0;
133 int64_t i, j, input_rows, stored_rows;
134 short probabilistic = 0;
135
136 // Register the program name for error messages
138 argc = scanargs(&s_arg, argc, argv);
139 if (argc < 3)
140 bomb(NULL, usage); // Ensure sufficient arguments
141
142 // Parse command-line arguments
143 for (i_arg = 1; i_arg < argc; i_arg++) {
144 if (s_arg[i_arg].arg_type == OPTION) {
145 // Match the option and process accordingly
146 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
147 case SET_PIPE:
148 // Process pipe-related options
149 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
150 SDDS_Bomb("invalid -pipe syntax");
151 break;
152 case SET_WEIGHT:
153 // Specify the column name for weighting
154 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
155 bomb("invalid -weight syntax", usage);
156 break;
157 case SET_FACTOR:
158 // Specify a fixed number of duplicates
159 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &dup_rows) != 1 || dup_rows <= 0)
160 bomb("invalid -rows syntax", usage);
161 break;
162 case SET_MINFACTOR:
163 // Specify the minimum duplication factor
164 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &min_factor) != 1 || min_factor <= 0)
165 bomb("invalid -minFactor syntax", usage);
166 break;
167 case SET_MAXFACTOR:
168 // Specify the maximum duplication factor
169 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &max_factor) != 1 || max_factor <= 0)
170 bomb("invalid -maxFactor syntax", usage);
171 break;
172 case SET_VERBOSITY:
173 // Set the verbosity level
174 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &verbosity) != 1 || verbosity < 0)
175 bomb("invalid -verbosity syntax", usage);
176 break;
177 case SET_PROBABILISTIC:
178 // Enable probabilistic duplication
179 probabilistic = 1;
180 if (s_arg[i_arg].n_items != 1)
181 bomb("invalid -probabilistic syntax", usage);
182 break;
183 case SET_SEED:
184 // Set the random number generator seed
185 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &random_number_seed) != 1 || random_number_seed < 0)
186 bomb("invalid -seed syntax", usage);
187 break;
188 default:
189 // Unrecognized option
190 bomb("unrecognized option", usage);
191 break;
192 }
193 } else {
194 // Process input and output file arguments
195 if (!inputfile)
196 inputfile = s_arg[i_arg].list[0];
197 else if (!outputfile)
198 outputfile = s_arg[i_arg].list[0];
199 else
200 SDDS_Bomb("too many filenames");
201 }
202 }
203
204 // Ensure only one of minFactor or maxFactor is set
205 if (min_factor && max_factor)
206 SDDS_Bomb("give only one of -minFactor and -maxFactor");
207
208 // Process file names and pipe configurations
209 processFilenames("sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
210
211 // Initialize random number generator
212 if (random_number_seed == 0) {
213 random_number_seed = (long)time(NULL); // Use system clock if no seed provided
214 random_number_seed = 2 * (random_number_seed / 2) + 1; // Ensure odd seed
215#if defined(_WIN32) || defined(__APPLE__)
216 random_1(-labs(random_number_seed));
217#else
218 random_1(-FABS(random_number_seed));
219#endif
220 } else {
221 random_1(-random_number_seed); // Use specified seed
222 }
223
224 // Initialize SDDS datasets
225 if (!SDDS_InitializeInput(&sdds_input, inputfile) ||
226 !SDDS_InitializeCopy(&sdds_output, &sdds_input, outputfile, "w") ||
227 !SDDS_WriteLayout(&sdds_output)) {
228 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
229 }
230
231 // Main loop for reading and duplicating rows
232 while (SDDS_ReadPage(&sdds_input) > 0) {
233 input_rows = SDDS_RowCount(&sdds_input); // Get number of rows in current page
234 if (input_rows > 0) {
235 dup_value = tmalloc(sizeof(*dup_value) * input_rows); // Allocate duplication array
236
237 // Handle weighting logic if a weight column is specified
238 if (weight_column_name) {
239 if (!(weight_data = SDDS_GetColumnInDoubles(&sdds_input, weight_column_name))) {
240 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
241 }
242
243 // Scale weights based on minFactor or maxFactor
244 if (min_factor) {
245 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
246 if (min_weight <= 0)
247 SDDS_Bomb("Minimum weight value is nonpositive. Can't use -minFactor.");
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = weight_data[i] * min_factor / min_weight;
250 } else if (max_factor) {
251 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
252 if (max_weight <= 0)
253 SDDS_Bomb("Maximum weight value is nonpositive. Can't use -maxFactor.");
254 for (i = 0; i < input_rows; i++)
255 dup_value[i] = weight_data[i] * max_factor / max_weight;
256 } else {
257 for (i = 0; i < input_rows; i++)
258 dup_value[i] = weight_data[i];
259 }
260
261 // Apply probabilistic logic for fractional duplication counts
262 if (probabilistic) {
263 double fraction;
264 for (i = 0; i < input_rows; i++) {
265 fraction = dup_value[i] - ((long)dup_value[i]);
266 dup_value[i] = (long)dup_value[i];
267 if (fraction > random_1(0))
268 dup_value[i] += 1;
269 }
270 } else {
271 for (i = 0; i < input_rows; i++)
272 dup_value[i] = (long)dup_value[i];
273 }
274 } else {
275 // Use fixed duplication factor if no weight column is provided
276 for (i = 0; i < input_rows; i++)
277 dup_value[i] = dup_rows;
278 }
279
280 // Count total rows to be stored after duplication
281 stored_rows = 0;
282 for (i = 0; i < input_rows; i++)
283 stored_rows += (int64_t)dup_value[i];
284
285 // Print duplication summary if verbosity is enabled
286 if (verbosity) {
287 int64_t max_dup = 0, min_dup = INT64_MAX;
288 for (i = 0; i < input_rows; i++) {
289 if (max_dup < dup_value[i])
290 max_dup = dup_value[i];
291 if (min_dup > dup_value[i])
292 min_dup = dup_value[i];
293 }
294 fprintf(stderr, "%" PRId64 " output rows, minimum and maximum duplication factor: %" PRId64 ", %" PRId64 "\n",
295 stored_rows, min_dup, max_dup);
296 }
297
298 // Start a new SDDS page and copy data
299 if (!SDDS_StartPage(&sdds_output, stored_rows) ||
300 !SDDS_CopyParameters(&sdds_output, &sdds_input) ||
301 !SDDS_CopyArrays(&sdds_output, &sdds_input)) {
302 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
303 }
304
305 // Duplicate rows into the output dataset
306 stored_rows = 0;
307 for (i = 0; i < input_rows; i++) {
308 for (j = 0; j < dup_value[i]; j++) {
309 if (SDDS_CopyRowDirect(&sdds_output, stored_rows++, &sdds_input, i)) {
310 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
311 }
312 }
313 }
314
315 // Write the completed page to the output file
316 if (!SDDS_WritePage(&sdds_output)) {
317 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
318 }
319
320 free(dup_value); // Free allocated memory for duplication array
321 dup_value = NULL;
322 }
323 }
324
325 // Terminate SDDS datasets and close files
326 if (!SDDS_Terminate(&sdds_input) || !SDDS_Terminate(&sdds_output)) {
327 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
328 }
329
330 return 0; // Exit successfully
331}
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:286
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
Definition SDDS_copy.c:834
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:334
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows marked as "of interest".
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double random_1(long iseed)
Generate a uniform random double in [0,1] using a custom seed initialization.
Definition drand.c:175
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390