60static char *option[N_OPTIONS] = {
73 "Usage: sddsduplicate [<input>] [<output>] [options]\n\n"
75 " -pipe=[input][,output]\n"
76 " Use pipes for input and/or output.\n\n"
77 " -weight=<columnName>\n"
78 " Name of a column to use for weighting the number of duplicates.\n\n"
79 " -minFactor=<integer>\n"
80 " Minimum number of rows to emit. Results in scaling of weights.\n\n"
81 " -maxFactor=<integer>\n"
82 " Maximum number of rows to emit. Results in scaling of weights.\n"
83 " In some cases, input rows will not appear in the output file because\n"
84 " the weight is less than 1.\n\n"
85 " -factor=<integer>\n"
86 " Number of duplicates to create. Incompatible with -weight.\n\n"
88 " Treat fractional duplication counts as probabilities.\n\n"
90 " Set the seed for random number generation. By default, the\n"
91 " system clock is used.\n\n"
92 " -verbosity[=<level>]\n"
93 " Set verbosity level.\n\n"
94 "This program duplicates rows in the input file and creates a new file.\n"
95 "The number of duplicates is determined either by a weight column or\n"
96 "by a fixed value.\n\n"
97 "Program by Michael Borland. (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
")\n";
99int main(
int argc,
char **argv) {
101 char *inputfile = NULL, *outputfile = NULL;
102 long i_arg, verbosity = 0;
104 unsigned long pipe_flags = 0;
105 char *weight_column_name = NULL;
106 double *weight_data = NULL, min_weight, max_weight;
107 double *dup_value = NULL;
108 long max_factor = 0, min_factor = 0, dup_rows = 0;
109 long random_number_seed = 0;
110 int64_t i, j, input_rows, stored_rows;
111 short probabilistic = 0;
115 argc =
scanargs(&s_arg, argc, argv);
120 for (i_arg = 1; i_arg < argc; i_arg++) {
121 if (s_arg[i_arg].arg_type == OPTION) {
123 switch (
match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
126 if (!
processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
131 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
132 bomb(
"invalid -weight syntax", usage);
136 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &dup_rows) != 1 || dup_rows <= 0)
137 bomb(
"invalid -rows syntax", usage);
141 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &min_factor) != 1 || min_factor <= 0)
142 bomb(
"invalid -minFactor syntax", usage);
146 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &max_factor) != 1 || max_factor <= 0)
147 bomb(
"invalid -maxFactor syntax", usage);
151 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &verbosity) != 1 || verbosity < 0)
152 bomb(
"invalid -verbosity syntax", usage);
154 case SET_PROBABILISTIC:
157 if (s_arg[i_arg].n_items != 1)
158 bomb(
"invalid -probabilistic syntax", usage);
162 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1],
"%ld", &random_number_seed) != 1 || random_number_seed < 0)
163 bomb(
"invalid -seed syntax", usage);
167 bomb(
"unrecognized option", usage);
173 inputfile = s_arg[i_arg].list[0];
174 else if (!outputfile)
175 outputfile = s_arg[i_arg].list[0];
182 if (min_factor && max_factor)
183 SDDS_Bomb(
"give only one of -minFactor and -maxFactor");
186 processFilenames(
"sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
189 if (random_number_seed == 0) {
190 random_number_seed = (long)time(NULL);
191 random_number_seed = 2 * (random_number_seed / 2) + 1;
192#if defined(_WIN32) || defined(darwin)
193 random_1(-labs(random_number_seed));
195 random_1(-FABS(random_number_seed));
210 input_rows = SDDS_RowCount(&sdds_input);
211 if (input_rows > 0) {
212 dup_value =
tmalloc(
sizeof(*dup_value) * input_rows);
215 if (weight_column_name) {
222 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
224 SDDS_Bomb(
"Minimum weight value is nonpositive. Can't use -minFactor.");
225 for (i = 0; i < input_rows; i++)
226 dup_value[i] = weight_data[i] * min_factor / min_weight;
227 }
else if (max_factor) {
228 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
230 SDDS_Bomb(
"Maximum weight value is nonpositive. Can't use -maxFactor.");
231 for (i = 0; i < input_rows; i++)
232 dup_value[i] = weight_data[i] * max_factor / max_weight;
234 for (i = 0; i < input_rows; i++)
235 dup_value[i] = weight_data[i];
241 for (i = 0; i < input_rows; i++) {
242 fraction = dup_value[i] - ((long)dup_value[i]);
243 dup_value[i] = (long)dup_value[i];
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = (
long)dup_value[i];
253 for (i = 0; i < input_rows; i++)
254 dup_value[i] = dup_rows;
259 for (i = 0; i < input_rows; i++)
260 stored_rows += (int64_t)dup_value[i];
264 int64_t max_dup = 0, min_dup = INT64_MAX;
265 for (i = 0; i < input_rows; i++) {
266 if (max_dup < dup_value[i])
267 max_dup = dup_value[i];
268 if (min_dup > dup_value[i])
269 min_dup = dup_value[i];
271 fprintf(stderr,
"%" PRId64
" output rows, minimum and maximum duplication factor: %" PRId64
", %" PRId64
"\n",
272 stored_rows, min_dup, max_dup);
284 for (i = 0; i < input_rows; i++) {
285 for (j = 0; j < dup_value[i]; j++) {
SDDS (Self Describing Data Set) Data Type Definitions and Function Prototypes.
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double random_1(long iseed)
Generates a uniform random double in [0,1] using a custom seed initialization.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
OptionType
Enumeration for command-line options.