122 {
124 char *inputfile = NULL, *outputfile = NULL;
125 long i_arg, verbosity = 0;
126 SCANNED_ARG *s_arg;
127 unsigned long pipe_flags = 0;
128 char *weight_column_name = NULL;
129 double *weight_data = NULL, min_weight, max_weight;
130 double *dup_value = NULL;
131 long max_factor = 0, min_factor = 0, dup_rows = 0;
132 long random_number_seed = 0;
133 int64_t i, j, input_rows, stored_rows;
134 short probabilistic = 0;
135
136
138 argc =
scanargs(&s_arg, argc, argv);
139 if (argc < 3)
141
142
143 for (i_arg = 1; i_arg < argc; i_arg++) {
144 if (s_arg[i_arg].arg_type == OPTION) {
145
146 switch (
match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
147 case SET_PIPE:
148
149 if (!
processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
151 break;
152 case SET_WEIGHT:
153
154 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
155 bomb(
"invalid -weight syntax", usage);
156 break;
157 case SET_FACTOR:
158
159 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &dup_rows) != 1 || dup_rows <= 0)
160 bomb(
"invalid -rows syntax", usage);
161 break;
162 case SET_MINFACTOR:
163
164 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &min_factor) != 1 || min_factor <= 0)
165 bomb(
"invalid -minFactor syntax", usage);
166 break;
167 case SET_MAXFACTOR:
168
169 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &max_factor) != 1 || max_factor <= 0)
170 bomb(
"invalid -maxFactor syntax", usage);
171 break;
172 case SET_VERBOSITY:
173
174 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &verbosity) != 1 || verbosity < 0)
175 bomb(
"invalid -verbosity syntax", usage);
176 break;
177 case SET_PROBABILISTIC:
178
179 probabilistic = 1;
180 if (s_arg[i_arg].n_items != 1)
181 bomb(
"invalid -probabilistic syntax", usage);
182 break;
183 case SET_SEED:
184
185 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &random_number_seed) != 1 || random_number_seed < 0)
186 bomb(
"invalid -seed syntax", usage);
187 break;
188 default:
189
190 bomb(
"unrecognized option", usage);
191 break;
192 }
193 } else {
194
195 if (!inputfile)
196 inputfile = s_arg[i_arg].list[0];
197 else if (!outputfile)
198 outputfile = s_arg[i_arg].list[0];
199 else
201 }
202 }
203
204
205 if (min_factor && max_factor)
206 SDDS_Bomb(
"give only one of -minFactor and -maxFactor");
207
208
209 processFilenames(
"sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
210
211
212 if (random_number_seed == 0) {
213 random_number_seed = (long)time(NULL);
214 random_number_seed = 2 * (random_number_seed / 2) + 1;
215#if defined(_WIN32) || defined(__APPLE__)
216 random_1(-labs(random_number_seed));
217#else
218 random_1(-FABS(random_number_seed));
219#endif
220 } else {
222 }
223
224
229 }
230
231
233 input_rows = SDDS_RowCount(&sdds_input);
234 if (input_rows > 0) {
235 dup_value =
tmalloc(
sizeof(*dup_value) * input_rows);
236
237
238 if (weight_column_name) {
241 }
242
243
244 if (min_factor) {
245 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
246 if (min_weight <= 0)
247 SDDS_Bomb(
"Minimum weight value is nonpositive. Can't use -minFactor.");
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = weight_data[i] * min_factor / min_weight;
250 } else if (max_factor) {
251 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
252 if (max_weight <= 0)
253 SDDS_Bomb(
"Maximum weight value is nonpositive. Can't use -maxFactor.");
254 for (i = 0; i < input_rows; i++)
255 dup_value[i] = weight_data[i] * max_factor / max_weight;
256 } else {
257 for (i = 0; i < input_rows; i++)
258 dup_value[i] = weight_data[i];
259 }
260
261
262 if (probabilistic) {
263 double fraction;
264 for (i = 0; i < input_rows; i++) {
265 fraction = dup_value[i] - ((long)dup_value[i]);
266 dup_value[i] = (long)dup_value[i];
268 dup_value[i] += 1;
269 }
270 } else {
271 for (i = 0; i < input_rows; i++)
272 dup_value[i] = (long)dup_value[i];
273 }
274 } else {
275
276 for (i = 0; i < input_rows; i++)
277 dup_value[i] = dup_rows;
278 }
279
280
281 stored_rows = 0;
282 for (i = 0; i < input_rows; i++)
283 stored_rows += (int64_t)dup_value[i];
284
285
286 if (verbosity) {
287 int64_t max_dup = 0, min_dup = INT64_MAX;
288 for (i = 0; i < input_rows; i++) {
289 if (max_dup < dup_value[i])
290 max_dup = dup_value[i];
291 if (min_dup > dup_value[i])
292 min_dup = dup_value[i];
293 }
294 fprintf(stderr, "%" PRId64 " output rows, minimum and maximum duplication factor: %" PRId64 ", %" PRId64 "\n",
295 stored_rows, min_dup, max_dup);
296 }
297
298
303 }
304
305
306 stored_rows = 0;
307 for (i = 0; i < input_rows; i++) {
308 for (j = 0; j < dup_value[i]; j++) {
311 }
312 }
313 }
314
315
318 }
319
320 free(dup_value);
321 dup_value = NULL;
322 }
323 }
324
325
328 }
329
330 return 0;
331}
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double random_1(long iseed)
Generates a uniform random double in [0,1] using a custom seed initialization.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)