99 {
101 char *inputfile = NULL, *outputfile = NULL;
102 long i_arg, verbosity = 0;
103 SCANNED_ARG *s_arg;
104 unsigned long pipe_flags = 0;
105 char *weight_column_name = NULL;
106 double *weight_data = NULL, min_weight, max_weight;
107 double *dup_value = NULL;
108 long max_factor = 0, min_factor = 0, dup_rows = 0;
109 long random_number_seed = 0;
110 int64_t i, j, input_rows, stored_rows;
111 short probabilistic = 0;
112
113
115 argc =
scanargs(&s_arg, argc, argv);
116 if (argc < 3)
118
119
120 for (i_arg = 1; i_arg < argc; i_arg++) {
121 if (s_arg[i_arg].arg_type == OPTION) {
122
123 switch (
match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
124 case SET_PIPE:
125
126 if (!
processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags))
128 break;
129 case SET_WEIGHT:
130
131 if (s_arg[i_arg].n_items != 2 || !(weight_column_name = s_arg[i_arg].list[1]))
132 bomb(
"invalid -weight syntax", usage);
133 break;
134 case SET_FACTOR:
135
136 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &dup_rows) != 1 || dup_rows <= 0)
137 bomb(
"invalid -rows syntax", usage);
138 break;
139 case SET_MINFACTOR:
140
141 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &min_factor) != 1 || min_factor <= 0)
142 bomb(
"invalid -minFactor syntax", usage);
143 break;
144 case SET_MAXFACTOR:
145
146 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &max_factor) != 1 || max_factor <= 0)
147 bomb(
"invalid -maxFactor syntax", usage);
148 break;
149 case SET_VERBOSITY:
150
151 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &verbosity) != 1 || verbosity < 0)
152 bomb(
"invalid -verbosity syntax", usage);
153 break;
154 case SET_PROBABILISTIC:
155
156 probabilistic = 1;
157 if (s_arg[i_arg].n_items != 1)
158 bomb(
"invalid -probabilistic syntax", usage);
159 break;
160 case SET_SEED:
161
162 if (s_arg[i_arg].n_items != 2 || sscanf(s_arg[i_arg].list[1], "%ld", &random_number_seed) != 1 || random_number_seed < 0)
163 bomb(
"invalid -seed syntax", usage);
164 break;
165 default:
166
167 bomb(
"unrecognized option", usage);
168 break;
169 }
170 } else {
171
172 if (!inputfile)
173 inputfile = s_arg[i_arg].list[0];
174 else if (!outputfile)
175 outputfile = s_arg[i_arg].list[0];
176 else
178 }
179 }
180
181
182 if (min_factor && max_factor)
183 SDDS_Bomb(
"give only one of -minFactor and -maxFactor");
184
185
186 processFilenames(
"sddsduplicate", &inputfile, &outputfile, pipe_flags, 0, NULL);
187
188
189 if (random_number_seed == 0) {
190 random_number_seed = (long)time(NULL);
191 random_number_seed = 2 * (random_number_seed / 2) + 1;
192#if defined(_WIN32) || defined(darwin)
193 random_1(-labs(random_number_seed));
194#else
195 random_1(-FABS(random_number_seed));
196#endif
197 } else {
199 }
200
201
206 }
207
208
210 input_rows = SDDS_RowCount(&sdds_input);
211 if (input_rows > 0) {
212 dup_value =
tmalloc(
sizeof(*dup_value) * input_rows);
213
214
215 if (weight_column_name) {
218 }
219
220
221 if (min_factor) {
222 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
223 if (min_weight <= 0)
224 SDDS_Bomb(
"Minimum weight value is nonpositive. Can't use -minFactor.");
225 for (i = 0; i < input_rows; i++)
226 dup_value[i] = weight_data[i] * min_factor / min_weight;
227 } else if (max_factor) {
228 find_min_max(&min_weight, &max_weight, weight_data, input_rows);
229 if (max_weight <= 0)
230 SDDS_Bomb(
"Maximum weight value is nonpositive. Can't use -maxFactor.");
231 for (i = 0; i < input_rows; i++)
232 dup_value[i] = weight_data[i] * max_factor / max_weight;
233 } else {
234 for (i = 0; i < input_rows; i++)
235 dup_value[i] = weight_data[i];
236 }
237
238
239 if (probabilistic) {
240 double fraction;
241 for (i = 0; i < input_rows; i++) {
242 fraction = dup_value[i] - ((long)dup_value[i]);
243 dup_value[i] = (long)dup_value[i];
245 dup_value[i] += 1;
246 }
247 } else {
248 for (i = 0; i < input_rows; i++)
249 dup_value[i] = (long)dup_value[i];
250 }
251 } else {
252
253 for (i = 0; i < input_rows; i++)
254 dup_value[i] = dup_rows;
255 }
256
257
258 stored_rows = 0;
259 for (i = 0; i < input_rows; i++)
260 stored_rows += (int64_t)dup_value[i];
261
262
263 if (verbosity) {
264 int64_t max_dup = 0, min_dup = INT64_MAX;
265 for (i = 0; i < input_rows; i++) {
266 if (max_dup < dup_value[i])
267 max_dup = dup_value[i];
268 if (min_dup > dup_value[i])
269 min_dup = dup_value[i];
270 }
271 fprintf(stderr, "%" PRId64 " output rows, minimum and maximum duplication factor: %" PRId64 ", %" PRId64 "\n",
272 stored_rows, min_dup, max_dup);
273 }
274
275
280 }
281
282
283 stored_rows = 0;
284 for (i = 0; i < input_rows; i++) {
285 for (j = 0; j < dup_value[i]; j++) {
288 }
289 }
290 }
291
292
295 }
296
297 free(dup_value);
298 dup_value = NULL;
299 }
300 }
301
302
305 }
306
307 return 0;
308}
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double random_1(long iseed)
Generates a uniform random double in [0,1] using a custom seed initialization.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)