SDDSlib
Loading...
Searching...
No Matches
sddsduplicate.c File Reference

A program for duplicating rows in a file based on a weight column. More...

#include "mdb.h"
#include "scan.h"
#include "SDDS.h"

Go to the source code of this file.

Enumerations

enum  OptionType {
  SET_WEIGHT , SET_PIPE , SET_MAXFACTOR , SET_MINFACTOR ,
  SET_FACTOR , SET_VERBOSITY , SET_SEED , SET_PROBABILISTIC ,
  N_OPTIONS
}
 

Functions

int main (int argc, char **argv)
 

Variables

static char * option [N_OPTIONS]
 
static char * usage
 

Detailed Description

A program for duplicating rows in a file based on a weight column.

This program duplicates rows in an SDDS file and creates a new file. The number of duplicates is determined either by a weight column or by a fixed value. The program provides options for minimum and maximum factors, probabilistic duplication, and verbosity settings. The output can be directed through pipes, making it suitable for use in larger data processing pipelines.

Usage:

sddsduplicate [<input>] [<output>] [options]

Options:

  • -pipe=[input][,output] Use pipes for input and/or output.
  • -weight=<columnName> Specify the column to use for weighting the number of duplicates.
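  • -minFactor=<integer> Scale the weights so that the row with the smallest weight is emitted <integer> times. Requires -weight with strictly positive weights; cannot be combined with -maxFactor.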
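  • -maxFactor=<integer> Scale the weights so that the row with the largest weight is emitted <integer> times; rows whose scaled weight falls below 1 may not appear in the output. Requires -weight; cannot be combined with -minFactor.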
  • -factor=<integer> Emit every input row exactly <integer> times (must be a positive integer; incompatible with -weight).
  • -probabilistic Treat fractional duplication counts as probabilities.
  • -seed=<integer> Set the random number generator seed (default: system clock).
  • -verbosity[=<level>] Set verbosity level for detailed output.

Example:

sddsduplicate input.sdds output.sdds -weight=Population -maxFactor=10

This command creates output.sdds by duplicating rows from input.sdds based on the Population column, scaling values such that the maximum duplication factor is 10.

License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
M. Borland, R. Soliday

Definition in file sddsduplicate.c.

Enumeration Type Documentation

◆ OptionType

enum OptionType

Definition at line 47 of file sddsduplicate.c.

47 {
48 SET_WEIGHT,
49 SET_PIPE,
50 SET_MAXFACTOR,
51 SET_MINFACTOR,
52 SET_FACTOR,
53 SET_VERBOSITY,
54 SET_SEED,
55 SET_PROBABILISTIC,
56 N_OPTIONS
OptionType
Enumeration for command-line options.

Function Documentation

◆ main()

int main ( int argc,
char ** argv )

Definition at line 99 of file sddsduplicate.c.

99 {
100 SDDS_DATASET sdds_input, sdds_output;
101 char *inputfile = NULL, *outputfile = NULL;
102 long i_arg, verbosity = 0;
103 SCANNED_ARG *s_arg;
104 unsigned long pipe_flags = 0;
105 char *weight_column_name = NULL;
106 double *weight_data = NULL, min_weight, max_weight;
107 double *dup_value = NULL;
108 long max_factor = 0, min_factor = 0, dup_rows = 0;
109 long random_number_seed = 0;
110 int64_t i, j, input_rows, stored_rows;
111 short probabilistic = 0;
112
113 // Register the program name for error messages
115 argc = scanargs(&s_arg, argc, argv);
116 if (argc < 3)
117 bomb(NULL, usage); // Ensure sufficient arguments
118
119 // Parse command-line arguments
120 for (i_arg = 1; i_arg < argc; i_arg++) {
121 if (s_arg[i_arg].arg_type == OPTION) {
122 // Match the option and process accordingly
123 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
124 case SET_PIPE:
125 // Process pipe-related options
126 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
127 SDDS_Bomb("invalid -pipe syntax");
128 break;
129 case SET_WEIGHT:
130 // Specify the column name for weighting
131 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
132 bomb("invalid -weight syntax", usage);
133 break;
134 case SET_FACTOR:
135 // Specify a fixed number of duplicates
136 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &dup_rows) != 1 || dup_rows <= 0)
137 bomb("invalid -rows syntax", usage);
138 break;
139 case SET_MINFACTOR:
140 // Specify the minimum duplication factor
141 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &min_factor) != 1 || min_factor <= 0)
142 bomb("invalid -minFactor syntax", usage);
143 break;
144 case SET_MAXFACTOR:
145 // Specify the maximum duplication factor
146 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &max_factor) != 1 || max_factor <= 0)
147 bomb("invalid -maxFactor syntax", usage);
148 break;
149 case SET_VERBOSITY:
150 // Set the verbosity level
151 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &verbosity) != 1 || verbosity < 0)
152 bomb("invalid -verbosity syntax", usage);
153 break;
154 case SET_PROBABILISTIC:
155 // Enable probabilistic duplication
156 probabilistic = 1;
157 if (s_arg[i_arg].n_items != 1)
158 bomb("invalid -probabilistic syntax", usage);
159 break;
160 case SET_SEED:
161 // Set the random number generator seed
162 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &random_number_seed) != 1 || random_number_seed < 0)
163 bomb("invalid -seed syntax", usage);
164 break;
165 default:
166 // Unrecognized option
167 bomb("unrecognized option", usage);
168 break;
169 }
170 } else {
171 // Process input and output file arguments
172 if (!inputfile)
173 inputfile = s_arg[i_arg].list[0];
174 else if (!outputfile)
175 outputfile = s_arg[i_arg].list[0];
176 else
177 SDDS_Bomb("too many filenames");
178 }
179 }
180
181 // Ensure only one of minFactor or maxFactor is set
182 if (min_factor && max_factor)
183 SDDS_Bomb("give only one of -minFactor and -maxFactor");
184
185 // Process file names and pipe configurations
186 processFilenames("sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
187
188 // Initialize random number generator
189 if (random_number_seed == 0) {
190 random_number_seed = (long)time(NULL); // Use system clock if no seed provided
191 random_number_seed = 2 * (random_number_seed / 2) + 1; // Ensure odd seed
192#if defined(_WIN32) || defined(darwin)
193 random_1(-labs(random_number_seed));
194#else
195 random_1(-FABS(random_number_seed));
196#endif
197 } else {
198 random_1(-random_number_seed); // Use specified seed
199 }
200
201 // Initialize SDDS datasets
202 if (!SDDS_InitializeInput(&sdds_input, inputfile) ||
203 !SDDS_InitializeCopy(&sdds_output, &sdds_input, outputfile, "w") ||
204 !SDDS_WriteLayout(&sdds_output)) {
205 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
206 }
207
208 // Main loop for reading and duplicating rows
209 while (SDDS_ReadPage(&sdds_input) > 0) {
210 input_rows = SDDS_RowCount(&sdds_input); // Get number of rows in current page
211 if (input_rows > 0) {
212 dup_value = tmalloc(sizeof(*dup_value) * input_rows); // Allocate duplication array
213
214 // Handle weighting logic if a weight column is specified
215 if (weight_column_name) {
216 if (!(weight_data = SDDS_GetColumnInDoubles(&sdds_input, weight_column_name))) {
217 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
218 }
219
220 // Scale weights based on minFactor or maxFactor
221 if (min_factor) {
222 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
223 if (min_weight <= 0)
224 SDDS_Bomb("Minimum weight value is nonpositive. Can't use -minFactor.");
225 for (i = 0; i < input_rows; i++)
226 dup_value[i] = weight_data[i] * min_factor / min_weight;
227 } else if (max_factor) {
228 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
229 if (max_weight <= 0)
230 SDDS_Bomb("Maximum weight value is nonpositive. Can't use -maxFactor.");
231 for (i = 0; i < input_rows; i++)
232 dup_value[i] = weight_data[i] * max_factor / max_weight;
233 } else {
234 for (i = 0; i < input_rows; i++)
235 dup_value[i] = weight_data[i];
236 }
237
238 // Apply probabilistic logic for fractional duplication counts
239 if (probabilistic) {
240 double fraction;
241 for (i = 0; i < input_rows; i++) {
242 fraction = dup_value[i] - ((long)dup_value[i]);
243 dup_value[i] = (long)dup_value[i];
244 if (fraction > random_1(0))
245 dup_value[i] += 1;
246 }
247 } else {
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = (long)dup_value[i];
250 }
251 } else {
252 // Use fixed duplication factor if no weight column is provided
253 for (i = 0; i < input_rows; i++)
254 dup_value[i] = dup_rows;
255 }
256
257 // Count total rows to be stored after duplication
258 stored_rows = 0;
259 for (i = 0; i < input_rows; i++)
260 stored_rows += (int64_t)dup_value[i];
261
262 // Print duplication summary if verbosity is enabled
263 if (verbosity) {
264 int64_t max_dup = 0, min_dup = INT64_MAX;
265 for (i = 0; i < input_rows; i++) {
266 if (max_dup < dup_value[i])
267 max_dup = dup_value[i];
268 if (min_dup > dup_value[i])
269 min_dup = dup_value[i];
270 }
271 fprintf(stderr, "%" PRId64 " output rows, minimum and maximum duplication factor: %" PRId64 ", %" PRId64 "\n",
272 stored_rows, min_dup, max_dup);
273 }
274
275 // Start a new SDDS page and copy data
276 if (!SDDS_StartPage(&sdds_output, stored_rows) ||
277 !SDDS_CopyParameters(&sdds_output, &sdds_input) ||
278 !SDDS_CopyArrays(&sdds_output, &sdds_input)) {
279 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
280 }
281
282 // Duplicate rows into the output dataset
283 stored_rows = 0;
284 for (i = 0; i < input_rows; i++) {
285 for (j = 0; j < dup_value[i]; j++) {
286 if (SDDS_CopyRowDirect(&sdds_output, stored_rows++, &sdds_input, i)) {
287 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
288 }
289 }
290 }
291
292 // Write the completed page to the output file
293 if (!SDDS_WritePage(&sdds_output)) {
294 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
295 }
296
297 free(dup_value); // Free allocated memory for duplication array
298 dup_value = NULL;
299 }
300 }
301
302 // Terminate SDDS datasets and close files
303 if (!SDDS_Terminate(&sdds_input) || !SDDS_Terminate(&sdds_output)) {
304 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
305 }
306
307 return 0; // Exit successfully
308}
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:286
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
Definition SDDS_copy.c:834
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:334
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double random_1(long iseed)
Generate a uniform random double in [0,1] using a custom seed initialization.
Definition drand.c:175
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390

Variable Documentation

◆ option

char* option[N_OPTIONS]
static
Initial value:
= {
"weight",
"pipe",
"maxfactor",
"minfactor",
"factor",
"verbosity",
"seed",
"probabilistic",
}

Definition at line 60 of file sddsduplicate.c.

60 {
61 "weight",
62 "pipe",
63 "maxfactor",
64 "minfactor",
65 "factor",
66 "verbosity",
67 "seed",
68 "probabilistic",
69};

◆ usage

char* usage
static
Initial value:
=
"Usage: sddsduplicate [<input>] [<output>] [options]\n\n"
"Options:\n"
" -pipe=[input][,output]\n"
" Use pipes for input and/or output.\n\n"
" -weight=<columnName>\n"
" Name of a column to use for weighting the number of duplicates.\n\n"
" -minFactor=<integer>\n"
" Minimum number of rows to emit. Results in scaling of weights.\n\n"
" -maxFactor=<integer>\n"
" Maximum number of rows to emit. Results in scaling of weights.\n"
" In some cases, input rows will not appear in the output file because\n"
" the weight is less than 1.\n\n"
" -factor=<integer>\n"
" Number of duplicates to create. Incompatible with -weight.\n\n"
" -probabilistic\n"
" Treat fractional duplication counts as probabilities.\n\n"
" -seed=<integer>\n"
" Set the seed for random number generation. By default, the\n"
" system clock is used.\n\n"
" -verbosity[=<level>]\n"
" Set verbosity level.\n\n"
"This program duplicates rows in the input file and creates a new file.\n"
"The number of duplicates is determined either by a weight column or\n"
"by a fixed value.\n\n"
"Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"

Definition at line 72 of file sddsduplicate.c.