75static char *option[N_OPTIONS] = {
88 "sddsduplicate [<input>] [<output>]\n"
89 " [-pipe=[input][,output]]\n"
90 " [-weight=<columnName>]\n"
91 " [-minFactor=<integer>]\n"
92 " [-maxFactor=<integer>]\n"
93 " [-factor=<integer>]\n"
95 " [-seed=<integer>]\n"
96 " [-verbosity[=<level>]]\n"
98 " -pipe=[input][,output]\n"
99 " Use pipes for input and/or output.\n\n"
100 " -weight=<columnName>\n"
101 " Name of a column to use for weighting the number of duplicates.\n\n"
102 " -minFactor=<integer>\n"
103 " Minimum number of rows to emit. Results in scaling of weights.\n\n"
104 " -maxFactor=<integer>\n"
105 " Maximum number of rows to emit. Results in scaling of weights.\n"
106 " In some cases, input rows will not appear in the output file because\n"
107 " the weight is less than 1.\n\n"
108 " -factor=<integer>\n"
109 " Number of duplicates to create. Incompatible with -weight.\n\n"
111 " Treat fractional duplication counts as probabilities.\n\n"
113 " Set the seed for random number generation. By default, the\n"
114 " system clock is used.\n\n"
115 " -verbosity[=<level>]\n"
116 " Set verbosity level.\n\n"
117 "This program duplicates rows in the input file and creates a new file.\n"
118 "The number of duplicates is determined either by a weight column or\n"
119 "by a fixed value.\n\n"
120 "Program by Michael Borland. (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
")\n";
122int main(
int argc,
char **argv) {
124 char *inputfile = NULL, *outputfile = NULL;
125 long i_arg, verbosity = 0;
127 unsigned long pipe_flags = 0;
128 char *weight_column_name = NULL;
129 double *weight_data = NULL, min_weight, max_weight;
130 double *dup_value = NULL;
131 long max_factor = 0, min_factor = 0, dup_rows = 0;
132 long random_number_seed = 0;
133 int64_t i, j, input_rows, stored_rows;
134 short probabilistic = 0;
138 argc =
scanargs(&s_arg, argc, argv);
143 for (i_arg = 1; i_arg < argc; i_arg++) {
144 if (s_arg[i_arg].arg_type == OPTION) {
146 switch (
match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
149 if (!
processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
154 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
155 bomb(
"invalid -weight syntax", usage);
159 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &dup_rows) != 1 || dup_rows <= 0)
160 bomb(
"invalid -rows syntax", usage);
164 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &min_factor) != 1 || min_factor <= 0)
165 bomb(
"invalid -minFactor syntax", usage);
169 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &max_factor) != 1 || max_factor <= 0)
170 bomb(
"invalid -maxFactor syntax", usage);
174 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &verbosity) != 1 || verbosity < 0)
175 bomb(
"invalid -verbosity syntax", usage);
177 case SET_PROBABILISTIC:
180 if (s_arg[i_arg].n_items != 1)
181 bomb(
"invalid -probabilistic syntax", usage);
185 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &random_number_seed) != 1 || random_number_seed < 0)
186 bomb(
"invalid -seed syntax", usage);
190 bomb(
"unrecognized option", usage);
196 inputfile = s_arg[i_arg].list[0];
197 else if (!outputfile)
198 outputfile = s_arg[i_arg].list[0];
205 if (min_factor && max_factor)
206 SDDS_Bomb(
"give only one of -minFactor and -maxFactor");
209 processFilenames(
"sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
212 if (random_number_seed == 0) {
213 random_number_seed = (long)time(NULL);
214 random_number_seed = 2 * (random_number_seed / 2) + 1;
215#if defined(_WIN32) || defined(__APPLE__)
216 random_1(-labs(random_number_seed));
218 random_1(-FABS(random_number_seed));
233 input_rows = SDDS_RowCount(&sdds_input);
234 if (input_rows > 0) {
235 dup_value =
tmalloc(
sizeof(*dup_value) * input_rows);
238 if (weight_column_name) {
245 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
247 SDDS_Bomb(
"Minimum weight value is nonpositive. Can't use -minFactor.");
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = weight_data[i] * min_factor / min_weight;
250 }
else if (max_factor) {
251 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
253 SDDS_Bomb(
"Maximum weight value is nonpositive. Can't use -maxFactor.");
254 for (i = 0; i < input_rows; i++)
255 dup_value[i] = weight_data[i] * max_factor / max_weight;
257 for (i = 0; i < input_rows; i++)
258 dup_value[i] = weight_data[i];
264 for (i = 0; i < input_rows; i++) {
265 fraction = dup_value[i] - ((long)dup_value[i]);
266 dup_value[i] = (long)dup_value[i];
271 for (i = 0; i < input_rows; i++)
272 dup_value[i] = (
long)dup_value[i];
276 for (i = 0; i < input_rows; i++)
277 dup_value[i] = dup_rows;
282 for (i = 0; i < input_rows; i++)
283 stored_rows += (int64_t)dup_value[i];
287 int64_t max_dup = 0, min_dup = INT64_MAX;
288 for (i = 0; i < input_rows; i++) {
289 if (max_dup < dup_value[i])
290 max_dup = dup_value[i];
291 if (min_dup > dup_value[i])
292 min_dup = dup_value[i];
294 fprintf(stderr,
"%" PRId64
" output rows, minimum and maximum duplication factor: %" PRId64
", %" PRId64
"\n",
295 stored_rows, min_dup, max_dup);
307 for (i = 0; i < input_rows; i++) {
308 for (j = 0; j < dup_value[i]; j++) {
SDDS (Self Describing Data Set) Data Type Definitions and Function Prototypes.
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double random_1(long iseed)
Generates a uniform random double in [0,1] using a custom seed initialization.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)