SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsselect.c File Reference

Detailed Description

Creates an SDDS data set from another data set based on matching data in a third data set.

This program selects rows from <input1> that have (or, with -invert, do not have) a matching entry in <input2>, comparing the columns named by -match (string data) or -equate (numeric data). The result is written to <output>; if <output> is not given and output is not sent to a pipe, <input1> is replaced. Additional options customize the matching behavior (row and page reuse), the output data ordering, and warning messages.

Usage

sddsselect [<input1>] <input2> [<output>]
[-pipe[=input][,output]]
[-match=<column-name>[=<column-name>]]
[-equate=<column-name>[=<column-name>]]
[-invert]
[-reuse[=rows][,page]]
[-majorOrder=row|column]
[-nowarnings]

Options

Optional Description
-pipe Use standard input in place of <input1> and/or standard output in place of <output>.
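-match Specify the string columns to match between <input1> and <input2>: the column name in <input1>, optionally followed by =<column-name> for <input2> (defaults to the same name).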
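-equate Specify the numeric columns to equate between <input1> and <input2>: the column name in <input1>, optionally followed by =<column-name> for <input2> (defaults to the same name).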
-invert Invert the selection to keep non-matching rows.
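-reuse Allow rows of <input2> to be matched more than once (rows; also the behavior when no keyword is given) and/or reuse the first page of <input2> for every page of <input1> (page).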
-majorOrder Write the output file in row- or column-major order; by default the ordering of <input1> is kept.
-nowarnings Suppress warning messages.

Incompatibilities

  • Exactly one of -match or -equate must be specified; the two cannot be combined.
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
M. Borland, C. Saunders, R. Soliday, H. Shang

Definition in file sddsselect.c.

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int argc,
char ** argv )

Definition at line 98 of file sddsselect.c.

98 {
99 SDDS_DATASET SDDS_1, SDDS_2, SDDS_output;
100 long i, j, i_arg, reuse, reusePage;
101 int64_t rows1, rows2, i1, i2;
102 SCANNED_ARG *s_arg;
103 char s[200], *ptr;
104 char **match_column, **equate_column;
105 long match_columns, equate_columns;
106 char *input1, *input2, *output;
107 long tmpfile_used, retval1, retval2;
108 long warnings, invert;
109 unsigned long pipeFlags, majorOrderFlag;
110 KEYED_EQUIVALENT **keyGroup = NULL;
111 long keyGroups = 0;
112 short columnMajorOrder = -1;
113
114 SDDS_RegisterProgramName(argv[0]);
115 argc = scanargs(&s_arg, argc, argv);
116 if (argc < 3)
117 bomb(NULL, USAGE);
118
119 input1 = input2 = output = NULL;
120 match_column = equate_column = NULL;
121 match_columns = equate_columns = reuse = reusePage = 0;
122 tmpfile_used = invert = 0;
123 warnings = 1;
124 pipeFlags = 0;
125
126 for (i_arg = 1; i_arg < argc; i_arg++) {
127 if (s_arg[i_arg].arg_type == OPTION) {
128 delete_chars(s_arg[i_arg].list[0], "_");
129 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
130 case SET_MAJOR_ORDER:
131 majorOrderFlag = 0;
132 s_arg[i_arg].n_items--;
133 if (s_arg[i_arg].n_items > 0 &&
134 (!scanItemList(&majorOrderFlag, s_arg[i_arg].list + 1, &s_arg[i_arg].n_items, 0,
135 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
136 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
137 SDDS_Bomb("invalid -majorOrder syntax/values");
138 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
139 columnMajorOrder = 1;
140 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
141 columnMajorOrder = 0;
142 break;
143 case SET_MATCH_COLUMN:
144 if (s_arg[i_arg].n_items != 2)
145 SDDS_Bomb("invalid -match syntax");
146 if (match_columns != 0)
147 SDDS_Bomb("only one -match option may be given");
148 match_column = tmalloc(sizeof(*match_column) * 2);
149 if ((ptr = strchr(s_arg[i_arg].list[1], '=')))
150 *ptr++ = 0;
151 else
152 ptr = s_arg[i_arg].list[1];
153 match_column[0] = s_arg[i_arg].list[1];
154 match_column[1] = ptr;
155 match_columns = 1;
156 break;
157 case SET_EQUATE_COLUMN:
158 if (s_arg[i_arg].n_items != 2)
159 SDDS_Bomb("invalid -equate syntax");
160 if (equate_columns != 0)
161 SDDS_Bomb("only one -equate option may be given");
162 equate_column = tmalloc(sizeof(*equate_column) * 2);
163 if ((ptr = strchr(s_arg[i_arg].list[1], '=')))
164 *ptr++ = 0;
165 else
166 ptr = s_arg[i_arg].list[1];
167 equate_column[0] = s_arg[i_arg].list[1];
168 equate_column[1] = ptr;
169 equate_columns = 1;
170 break;
171 case SET_REUSE:
172 if (s_arg[i_arg].n_items == 1)
173 reuse = 1;
174 else {
175 char *reuseOptions[2] = {"rows", "page"};
176 for (i = 1; i < s_arg[i_arg].n_items; i++) {
177 switch (match_string(s_arg[i_arg].list[i], reuseOptions, 2, 0)) {
178 case 0:
179 reuse = 1;
180 break;
181 case 1:
182 reusePage = 1;
183 break;
184 default:
185 SDDS_Bomb("unknown reuse keyword");
186 break;
187 }
188 }
189 }
190 break;
191 case SET_INVERT:
192 invert = 1;
193 break;
194 case SET_NOWARNINGS:
195 warnings = 0;
196 break;
197 case SET_PIPE:
198 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipeFlags))
199 SDDS_Bomb("invalid -pipe syntax");
200 break;
201 default:
202 fprintf(stderr, "error: unknown switch: %s\n", s_arg[i_arg].list[0]);
203 SDDS_Bomb(NULL);
204 break;
205 }
206 } else {
207 if (input1 == NULL)
208 input1 = s_arg[i_arg].list[0];
209 else if (input2 == NULL)
210 input2 = s_arg[i_arg].list[0];
211 else if (output == NULL)
212 output = s_arg[i_arg].list[0];
213 else
214 SDDS_Bomb("too many filenames");
215 }
216 }
217
218 if (pipeFlags & USE_STDIN && input1) {
219 if (output)
220 SDDS_Bomb("too many filenames (sddsxref)");
221 output = input2;
222 input2 = input1;
223 input1 = NULL;
224 }
225 processFilenames("sddsselect", &input1, &output, pipeFlags, !warnings, &tmpfile_used);
226 if (!input2)
227 SDDS_Bomb("second input file not specified (sddsxref)");
228
229 if (equate_columns && match_columns)
230 SDDS_Bomb("only one of -equate or -match may be given");
231 if (!equate_columns && !match_columns)
232 SDDS_Bomb("one of -equate or -match must be given");
233
234 if (!SDDS_InitializeInput(&SDDS_1, input1)) {
235 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
236 exit(EXIT_FAILURE);
237 }
238 if (!SDDS_InitializeInput(&SDDS_2, input2)) {
239 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
240 exit(EXIT_FAILURE);
241 }
242
243 if (match_columns) {
244 if ((j = SDDS_GetColumnIndex(&SDDS_1, match_column[0])) < 0 ||
245 SDDS_GetColumnType(&SDDS_1, j) != SDDS_STRING) {
246 sprintf(s, "error: column %s not found or not string type in file %s", match_column[0], input1 ? input1 : "stdin");
247 SDDS_SetError(s);
248 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
249 }
250 if ((j = SDDS_GetColumnIndex(&SDDS_2, match_column[1])) < 0 ||
251 SDDS_GetColumnType(&SDDS_2, j) != SDDS_STRING) {
252 sprintf(s, "error: column %s not found or not string type in file %s", match_column[1], input2);
253 SDDS_SetError(s);
254 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
255 }
256 }
257 if (equate_columns) {
258 if ((j = SDDS_GetColumnIndex(&SDDS_1, equate_column[0])) < 0 ||
259 !SDDS_NUMERIC_TYPE(SDDS_GetColumnType(&SDDS_1, j))) {
260 sprintf(s, "error: column %s not found or not numeric type in file %s", equate_column[0], input1 ? input1 : "stdin");
261 SDDS_SetError(s);
262 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
263 }
264 if ((j = SDDS_GetColumnIndex(&SDDS_2, equate_column[1])) < 0 ||
265 !SDDS_NUMERIC_TYPE(SDDS_GetColumnType(&SDDS_2, j))) {
266 sprintf(s, "error: column %s not found or not numeric type in file %s", equate_column[1], input2);
267 SDDS_SetError(s);
268 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
269 }
270 }
271
272 if (output && pipeFlags & USE_STDOUT)
273 SDDS_Bomb("too many filenames with -pipe option");
274 if (!output && !(pipeFlags & USE_STDOUT)) {
275 if (warnings)
276 fprintf(stderr, "warning: existing file %s will be replaced (sddsselect)\n", input1 ? input1 : "stdin");
277 tmpfile_used = 1;
278 cp_str(&output, tmpname(NULL));
279 }
280 if (!SDDS_InitializeCopy(&SDDS_output, &SDDS_1, output, "w")) {
281 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
282 exit(EXIT_FAILURE);
283 }
284 if (columnMajorOrder != -1)
285 SDDS_output.layout.data_mode.column_major = columnMajorOrder;
286 else
287 SDDS_output.layout.data_mode.column_major = SDDS_1.layout.data_mode.column_major;
288 if (!SDDS_WriteLayout(&SDDS_output))
289 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
290
291 while ((retval1 = SDDS_ReadPage(&SDDS_1)) > 0) {
292 if (!reusePage) {
293 if ((retval2 = SDDS_ReadPage(&SDDS_2)) <= 0) {
294 if (warnings)
295 fprintf(stderr, "warning: <input2> ends before <input1>\n");
296 if (invert) {
297 /* nothing to match, so everything would normally be thrown out */
298 if (!SDDS_CopyPage(&SDDS_output, &SDDS_1) ||
299 !SDDS_WritePage(&SDDS_output))
300 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
301 continue;
302 } else
303 /* nothing to match, so everything thrown out */
304 break;
305 }
306 } else {
307 if (retval1 == 1 && (retval2 = SDDS_ReadPage(&SDDS_2)) <= 0)
308 SDDS_Bomb("<input2> has no data");
309 SDDS_SetRowFlags(&SDDS_2, 1);
310 }
311 rows1 = SDDS_CountRowsOfInterest(&SDDS_1);
312 rows2 = SDDS_CountRowsOfInterest(&SDDS_2);
313
314 if (!SDDS_StartPage(&SDDS_output, rows1)) {
315 SDDS_SetError("Problem starting output page");
316 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
317 }
318 if (!SDDS_CopyParameters(&SDDS_output, &SDDS_2) ||
319 !SDDS_CopyArrays(&SDDS_output, &SDDS_2)) {
320 SDDS_SetError("Problem copying parameter or array data from second input file");
321 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
322 }
323 if (!SDDS_CopyParameters(&SDDS_output, &SDDS_1) ||
324 !SDDS_CopyArrays(&SDDS_output, &SDDS_1)) {
325 SDDS_SetError("Problem copying parameter or array data from first input file");
326 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
327 }
328 if (rows1) {
329 if (match_columns) {
330 char **string1, **string2;
331 long matched;
332 string2 = NULL;
333 if (!(string1 = SDDS_GetColumn(&SDDS_1, match_column[0]))) {
334 fprintf(stderr, "Error: problem getting column %s from file %s\n", match_column[0], input1 ? input1 : "stdin");
335 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
336 }
337 if (rows2 && !(string2 = SDDS_GetColumn(&SDDS_2, match_column[1]))) {
338 fprintf(stderr, "Error: problem getting column %s from file %s\n", match_column[1], input2);
339 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
340 }
341 if (rows2)
342 keyGroup = MakeSortedKeyGroups(&keyGroups, SDDS_STRING, string2, rows2);
343 for (i1 = 0; i1 < rows1; i1++) {
344 if (!SDDS_CopyRowDirect(&SDDS_output, i1, &SDDS_1, i1)) {
345 sprintf(s, "Problem copying row %" PRId64 " of first data set", i1);
346 SDDS_SetError(s);
347 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
348 }
349 matched = 0;
350 if (rows2 && (i2 = FindMatchingKeyGroup(keyGroup, keyGroups, SDDS_STRING, string1 + i1, reuse)) >= 0) {
351 matched = 1;
352 }
353 if ((!matched && !invert) || (matched && invert)) {
354 if (!SDDS_AssertRowFlags(&SDDS_output, SDDS_INDEX_LIMITS, i1, i1, 0))
355 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
356 }
357 }
358 if (string1) {
359 for (i = 0; i < rows1; i++)
360 free(string1[i]);
361 free(string1);
362 string1 = NULL;
363 }
364 if (string2) {
365 for (i = 0; i < rows2; i++)
366 free(string2[i]);
367 free(string2);
368 string2 = NULL;
369 }
370 for (i = 0; i < keyGroups; i++) {
371 if (keyGroup[i]) {
372 if (keyGroup[i]->equivalent)
373 free(keyGroup[i]->equivalent);
374 free(keyGroup[i]);
375 keyGroup[i] = NULL;
376 }
377 }
378 if (keyGroups) {
379 free(keyGroup);
380 keyGroup = NULL;
381 keyGroups = 0;
382 }
383 } else if (equate_columns) {
384 double *value1, *value2;
385 long equated;
386 value2 = NULL;
387 if (!(value1 = SDDS_GetColumnInDoubles(&SDDS_1, equate_column[0]))) {
388 fprintf(stderr, "Error: problem getting column %s from file %s\n", equate_column[0], input1 ? input1 : "stdin");
389 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
390 }
391 if (rows2 && !(value2 = SDDS_GetColumnInDoubles(&SDDS_2, equate_column[1]))) {
392 fprintf(stderr, "Error: problem getting column %s from file %s\n", equate_column[1], input2);
393 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
394 }
395 if (rows2)
396 keyGroup = MakeSortedKeyGroups(&keyGroups, SDDS_DOUBLE, value2, rows2);
397 for (i1 = 0; i1 < rows1; i1++) {
398 if (!SDDS_CopyRowDirect(&SDDS_output, i1, &SDDS_1, i1)) {
399 sprintf(s, "Problem copying row %" PRId64 " of first data set", i1);
400 SDDS_SetError(s);
401 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
402 }
403 equated = 0;
404 if (rows2 &&
405 (i2 = FindMatchingKeyGroup(keyGroup, keyGroups, SDDS_DOUBLE, value1 + i1, reuse)) >= 0) {
406 equated = 1;
407 }
408 if ((!equated && !invert) || (equated && invert)) {
409 if (!SDDS_AssertRowFlags(&SDDS_output, SDDS_INDEX_LIMITS, i1, i1, 0))
410 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
411 }
412 }
413 if (value1)
414 free(value1);
415 value1 = NULL;
416 if (rows2 && value2)
417 free(value2);
418 value2 = NULL;
419 for (i = 0; i < keyGroups; i++) {
420 if (keyGroup[i]) {
421 if (keyGroup[i]->equivalent)
422 free(keyGroup[i]->equivalent);
423 free(keyGroup[i]);
424 keyGroup[i] = NULL;
425 }
426 }
427 if (keyGroups) {
428 free(keyGroup);
429 keyGroup = NULL;
430 keyGroups = 0;
431 }
432 }
433 }
434 if (!SDDS_WritePage(&SDDS_output)) {
435 SDDS_SetError("Problem writing data to output file");
436 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
437 }
438 }
439
440 if (!SDDS_Terminate(&SDDS_1) || !SDDS_Terminate(&SDDS_2) || !SDDS_Terminate(&SDDS_output)) {
441 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
442 exit(EXIT_FAILURE);
443 }
444 if (tmpfile_used && !replaceFileAndBackUp(input1, output))
445 exit(EXIT_FAILURE);
446 free_scanargs(&s_arg, argc);
447 if (match_columns)
448 free(match_column);
449 return EXIT_SUCCESS;
450}
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:286
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
Definition SDDS_copy.c:834
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:334
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_AssertRowFlags(SDDS_DATASET *SDDS_dataset, uint32_t mode,...)
Sets acceptance flags for rows based on specified criteria.
void * SDDS_GetColumn(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves a copy of the data for a specified column, including only rows marked as "of interest".
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
int32_t SDDS_SetRowFlags(SDDS_DATASET *SDDS_dataset, int32_t row_flag_value)
Sets the acceptance flags for all rows in the current data table of a data set.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_SetError(char *error_text)
Records an error message in the SDDS error stack.
Definition SDDS_utils.c:379
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
int32_t SDDS_GetColumnType(SDDS_DATASET *SDDS_dataset, int32_t index)
Retrieves the data type of a column in the SDDS dataset by its index.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
#define SDDS_NUMERIC_TYPE(type)
Checks if the given type identifier corresponds to any numeric type.
Definition SDDStypes.h:138
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
char * cp_str(char **s, char *t)
Copies a string, allocating memory for storage.
Definition cp_str.c:28
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
void free_scanargs(SCANNED_ARG **scanned, int argc)
Definition scanargs.c:584
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
KEYED_EQUIVALENT ** MakeSortedKeyGroups(long *keyGroups, long keyType, void *data, long points)
Create sorted key groups from data.
long FindMatchingKeyGroup(KEYED_EQUIVALENT **keyGroup, long keyGroups, long keyType, void *searchKeyData, long reuse)
Find a matching key group for a search key.
char * tmpname(char *s)
Supplies a unique temporary filename.
Definition tmpname.c:34