SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsmselect.c File Reference

Detailed Description

Utility for creating SDDS datasets by matching or equating data between two datasets.

This utility processes SDDS (Self Describing Data Set) files to match or equate rows of data between two input datasets. It provides several options to control the selection logic, output format, and behavior of the program.

Key features include:

  • Matching rows based on specific column values.
  • Equating rows based on numeric column values.
  • Reusing rows of the second dataset for multiple matches.
  • Inverting the selection to choose unmatched rows.
  • Selecting output data order (row-major or column-major).
  • Suppressing warnings for cleaner output.

The program integrates with the SDDS library for robust data processing and supports piping for flexible input/output handling.

Usage

sddsmselect [<input1>] <input2> [<output>]
[-pipe[=input][,output]]
[-match=<column-name>[=<column-name>][,...]]
[-equate=<column-name>[=<column-name>][,...]]
[-invert]
[-reuse[=rows][,page]]
[-majorOrder=row|column]
[-nowarnings]

Options

Option Description
-pipe Use pipe for input and/or output.
-match Specify columns to match between datasets.
-equate Specify numeric columns to equate between datasets.
-invert Select rows of the first dataset that have no matching row in the second dataset.
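-reuse Allow rows of the second dataset to be matched more than once (rows, the default keyword) and/or keep using one page of the second dataset for every page of the first (page).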
-majorOrder Set the output file order to row-major or column-major.
-nowarnings Suppress warning messages.

Incompatibilities

  • The -reuse keywords rows and page may be given separately or together; -reuse with no keyword is equivalent to -reuse=rows.
  • At least one of -match or -equate must be specified; both may be given together, in which case a row must satisfy both.
  • -invert alters the selection logic and may conflict with specific -reuse settings.
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Authors
M. Borland, C. Saunders, R. Soliday, H. Shang

Definition in file sddsmselect.c.

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"

Go to the source code of this file.

Typedefs

typedef char * STRING_PAIR[2]
 

Functions

long rows_equate (SDDS_DATASET *SDDS1, int64_t row1, SDDS_DATASET *SDDS2, int64_t row2, long equate_columns, STRING_PAIR *equate_column)
 
int main (int argc, char **argv)
 

Typedef Documentation

◆ STRING_PAIR

typedef char* STRING_PAIR[2]

Definition at line 77 of file sddsmselect.c.

Function Documentation

◆ main()

int main ( int argc,
char ** argv )

Definition at line 116 of file sddsmselect.c.

/* main: parse the command line, open both input datasets, validate the
 * -match / -equate column pairs, then for each page copy to the output every
 * row of input 1 that has (or, with -invert, lacks) a partner row in input 2.
 * Returns EXIT_SUCCESS; failures terminate through bomb(), SDDS_Bomb(),
 * exit(EXIT_FAILURE), or SDDS_PrintErrors() with SDDS_EXIT_PrintErrors
 * (presumably exits -- confirm against the SDDS library).
 */
116 {
117 SDDS_DATASET SDDS_1, SDDS_2, SDDS_output;
118 long i, i_arg, reuse, reusePage;
119 int64_t j, k, rows1, rows2, n, outputRow;
120 SCANNED_ARG *s_arg;
121 char s[200], *ptr;
122 STRING_PAIR *match_column, *equate_column;
123 long match_columns, equate_columns;
124 char *input1, *input2, *output, *match_value;
125 long tmpfile_used, retval1, retval2;
126 long *row_used, warnings, invert;
127 unsigned long pipeFlags, majorOrderFlag;
/* -1 means "-majorOrder not given"; the output then inherits input 1's order. */
128 short columnMajorOrder = -1;
129
/* NOTE(review): the listing jumps from 129 to 131; source line 130 is not
 * shown in this rendering.
 */
131 argc = scanargs(&s_arg, argc, argv);
132 if (argc < 3)
133 bomb(NULL, USAGE);
134
135 input1 = input2 = output = NULL;
136 match_column = equate_column = NULL;
137 match_columns = equate_columns = reuse = reusePage = invert = 0;
138 tmpfile_used = 0;
139 warnings = 1;
140 pipeFlags = 0;
141
/* Walk the scanned arguments: options are dispatched by name, anything else
 * is taken as input1, input2, output in that order.
 */
142 for (i_arg = 1; i_arg < argc; i_arg++) {
143 if (s_arg[i_arg].arg_type == OPTION) {
/* Underscores are stripped from the option name before it is looked up. */
144 delete_chars(s_arg[i_arg].list[0], "_");
145 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
146 case SET_MAJOR_ORDER:
147 majorOrderFlag = 0;
148 s_arg[i_arg].n_items--;
149 if (s_arg[i_arg].n_items > 0 &&
150 (!scanItemList(&majorOrderFlag, s_arg[i_arg].list + 1, &s_arg[i_arg].n_items,
151 0, "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
152 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
153 SDDS_Bomb("Invalid -majorOrder syntax or values.");
/* Column-major wins if both flags end up set. */
154 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
155 columnMajorOrder = 1;
156 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
157 columnMajorOrder = 0;
158 break;
159 case SET_MATCH_COLUMNS:
160 if (s_arg[i_arg].n_items < 2)
161 SDDS_Bomb("Invalid -match syntax.");
/* Grow the pair table so repeated -match options accumulate. */
162 match_column = trealloc(match_column, sizeof(*match_column) * (match_columns + s_arg[i_arg].n_items - 1));
163 for (i = 1; i < s_arg[i_arg].n_items; i++) {
/* Each item is "name1[=name2]". The '=' is overwritten with NUL so the item
 * itself becomes the input-1 name and ptr the input-2 name; with no '=' the
 * same string is used for both files.
 */
164 if ((ptr = strchr(s_arg[i_arg].list[i], '=')))
165 *ptr++ = 0;
166 else
167 ptr = s_arg[i_arg].list[i];
168 match_column[i - 1 + match_columns][0] = s_arg[i_arg].list[i];
169 match_column[i - 1 + match_columns][1] = ptr;
170 }
171 match_columns += s_arg[i_arg].n_items - 1;
172 break;
173 case SET_EQUATE_COLUMNS:
174 if (s_arg[i_arg].n_items < 2)
175 SDDS_Bomb("Invalid -equate syntax.");
/* Same "name1[=name2]" parsing and accumulation as -match above. */
176 equate_column = trealloc(equate_column, sizeof(*equate_column) * (equate_columns + s_arg[i_arg].n_items - 1));
177 for (i = 1; i < s_arg[i_arg].n_items; i++) {
178 if ((ptr = strchr(s_arg[i_arg].list[i], '=')))
179 *ptr++ = 0;
180 else
181 ptr = s_arg[i_arg].list[i];
182 equate_column[i - 1 + equate_columns][0] = s_arg[i_arg].list[i];
183 equate_column[i - 1 + equate_columns][1] = ptr;
184 }
185 equate_columns += s_arg[i_arg].n_items - 1;
186 break;
187 case SET_REUSE:
/* Bare -reuse means row reuse. With keywords, "rows" sets reuse and "page"
 * sets reusePage; nothing here rejects giving both.
 */
188 if (s_arg[i_arg].n_items == 1)
189 reuse = 1;
190 else {
191 char *reuseOptions[2] = {"rows", "page"};
192 for (i = 1; i < s_arg[i_arg].n_items; i++) {
193 switch (match_string(s_arg[i_arg].list[i], reuseOptions, 2, 0)) {
194 case 0:
195 reuse = 1;
196 break;
197 case 1:
198 reusePage = 1;
199 break;
200 default:
201 SDDS_Bomb("Unknown reuse keyword.");
202 break;
203 }
204 }
205 }
206 break;
207 case SET_NOWARNINGS:
208 warnings = 0;
209 break;
210 case SET_INVERT:
211 invert = 1;
212 break;
213 case SET_PIPE:
214 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipeFlags))
215 SDDS_Bomb("Invalid -pipe syntax.");
216 break;
217 default:
218 fprintf(stderr, "Error: Unknown option: %s\n", s_arg[i_arg].list[0]);
219 bomb(NULL, USAGE);
220 break;
221 }
222 } else {
223 if (input1 == NULL)
224 input1 = s_arg[i_arg].list[0];
225 else if (input2 == NULL)
226 input2 = s_arg[i_arg].list[0];
227 else if (output == NULL)
228 output = s_arg[i_arg].list[0];
229 else
230 SDDS_Bomb("Too many filenames provided.");
231 }
232 }
233
/* When input 1 comes from stdin, the filenames collected above are shifted:
 * the first becomes input2 and the second becomes output.
 */
234 if (pipeFlags & USE_STDIN && input1) {
235 if (output)
236 SDDS_Bomb("Too many filenames with -pipe option.");
237 output = input2;
238 input2 = input1;
239 input1 = NULL;
240 }
241 processFilenames("sddsmselect", &input1, &output, pipeFlags, !warnings, &tmpfile_used);
242 if (!input2)
243 SDDS_Bomb("Second input file not specified.");
244
/* At least one of -match / -equate is required; using both is accepted. */
245 if (!match_columns && !equate_columns)
246 SDDS_Bomb("Either -match or -equate must be specified.");
247
248 if (!SDDS_InitializeInput(&SDDS_1, input1)) {
249 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
250 exit(EXIT_FAILURE);
251 }
252 if (!SDDS_InitializeInput(&SDDS_2, input2)) {
253 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
254 exit(EXIT_FAILURE);
255 }
256
/* Every -match column must exist in its file and be of type SDDS_STRING.
 * NOTE(review): s is char[200] and these sprintf calls embed column and file
 * names of unchecked length; a bounded snprintf(s, sizeof(s), ...) would be
 * safer.
 */
257 for (i = 0; i < match_columns; i++) {
258 if ((j = SDDS_GetColumnIndex(&SDDS_1, match_column[i][0])) < 0 ||
259 SDDS_GetColumnType(&SDDS_1, j) != SDDS_STRING) {
260 sprintf(s, "Error: Column '%s' not found or not of string type in file '%s'.",
261 match_column[i][0], input1 ? input1 : "stdin");
262 SDDS_SetError(s);
263 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
264 }
265 if ((j = SDDS_GetColumnIndex(&SDDS_2, match_column[i][1])) < 0 ||
266 SDDS_GetColumnType(&SDDS_2, j) != SDDS_STRING) {
267 sprintf(s, "Error: Column '%s' not found or not of string type in file '%s'.",
268 match_column[i][1], input2);
269 SDDS_SetError(s);
270 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
271 }
272 }
/* Same validation for the -equate columns.
 * NOTE(review): listing lines 275 and 282 are missing from this rendering, so
 * each condition below is shown incomplete. Judging by the error text, the
 * missing half presumably tests that the column type is numeric -- confirm
 * against the real source.
 */
273 for (i = 0; i < equate_columns; i++) {
274 if ((j = SDDS_GetColumnIndex(&SDDS_1, equate_column[i][0])) < 0 ||
276 sprintf(s, "Error: Column '%s' not found or not of numeric type in file '%s'.",
277 equate_column[i][0], input1 ? input1 : "stdin");
278 SDDS_SetError(s);
279 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
280 }
281 if ((j = SDDS_GetColumnIndex(&SDDS_2, equate_column[i][1])) < 0 ||
283 sprintf(s, "Error: Column '%s' not found or not of numeric type in file '%s'.",
284 equate_column[i][1], input2);
285 SDDS_SetError(s);
286 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
287 }
288 }
289
290 if (output && (pipeFlags & USE_STDOUT))
291 SDDS_Bomb("Too many filenames with -pipe option.");
/* No output file and no stdout pipe: write to a temporary file, which
 * replaces input1 at the very end (see replaceFileAndBackUp below).
 */
292 if (!output && !(pipeFlags & USE_STDOUT)) {
293 if (warnings)
294 fprintf(stderr, "Warning: Existing file '%s' will be replaced.\n", input1 ? input1 : "stdin");
295 tmpfile_used = 1;
296 cp_str(&output, tmpname(NULL));
297 }
/* The output layout is a copy of input 1's layout; input 2 contributes no
 * columns, it only decides which rows of input 1 are kept.
 */
298 if (!SDDS_InitializeCopy(&SDDS_output, &SDDS_1, output, "w")) {
299 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
300 exit(EXIT_FAILURE);
301 }
302 if (columnMajorOrder != -1)
303 SDDS_output.layout.data_mode.column_major = columnMajorOrder;
304 else
305 SDDS_output.layout.data_mode.column_major = SDDS_1.layout.data_mode.column_major;
306 if (!SDDS_WriteLayout(&SDDS_output))
307 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
308
/* row_used[k] is nonzero once row k of the current input-2 page has been
 * consumed by a match; it is resized and zeroed for every page of input 1.
 */
309 row_used = NULL;
310 while ((retval1 = SDDS_ReadPage(&SDDS_1)) > 0) {
311 if (!reusePage) {
/* Default pairing: one page of input 2 is read per page of input 1. */
312 if ((retval2 = SDDS_ReadPage(&SDDS_2)) <= 0) {
313 if (warnings)
314 fprintf(stderr, "Warning: <input2> ends before <input1>.\n");
315 if (invert) {
316 /* Nothing to match, so everything would normally be thrown out */
317 if (!SDDS_CopyPage(&SDDS_output, &SDDS_1) ||
318 !SDDS_WritePage(&SDDS_output))
319 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
320 continue;
321 } else
322 /* Nothing to match, so everything thrown out */
323 break;
324 }
325 } else {
/* -reuse=page: input 2 is read only when retval1 == 1 (presumably the first
 * page of input 1 -- confirm SDDS_ReadPage's return convention) and that page
 * is then kept for all later pages of input 1.
 */
326 if (retval1 == 1 && (retval2 = SDDS_ReadPage(&SDDS_2)) <= 0)
327 SDDS_Bomb("<input2> has no data.");
328 SDDS_SetRowFlags(&SDDS_2, 1);
329 }
330 SDDS_SetRowFlags(&SDDS_1, 1);
331 rows1 = SDDS_CountRowsOfInterest(&SDDS_1);
332 if ((rows2 = SDDS_CountRowsOfInterest(&SDDS_2))) {
333 row_used = SDDS_Realloc(row_used, sizeof(*row_used) * rows2);
334 SDDS_ZeroMemory(row_used, rows2 * sizeof(*row_used));
335 }
/* The output page is started with room for rows1 rows, the most that can be
 * kept; outputRow counts how many are actually copied.
 */
336 if (!SDDS_StartPage(&SDDS_output, rows1)) {
337 SDDS_SetError("Problem starting output page.");
338 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
339 }
340 if (!SDDS_CopyParameters(&SDDS_output, &SDDS_1) ||
341 !SDDS_CopyArrays(&SDDS_output, &SDDS_1)) {
342 SDDS_SetError("Problem copying parameter or array data from first input file.");
343 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
344 }
345 outputRow = 0;
346 for (j = 0; j < rows1; j++) {
347 /* Set up to match all rows of file 2 to row j of file 1 */
348 SDDS_SetRowFlags(&SDDS_2, 1);
/* Narrow input 2's rows of interest by AND-ing in one string match per
 * -match pair, using row j's value from input 1 as the label to match.
 */
349 for (i = 0; i < match_columns; i++) {
350 if (!SDDS_GetValue(&SDDS_1, match_column[i][0], j, &match_value)) {
351 sprintf(s, "Problem getting column '%s' from file '%s'.",
352 match_column[i][0], input1 ? input1 : "stdin");
353 SDDS_SetError(s);
354 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
355 }
356 if (SDDS_MatchRowsOfInterest(&SDDS_2, match_column[i][1], match_value, SDDS_AND) < 0) {
357 sprintf(s, "Problem setting rows of interest for column '%s'.",
358 match_column[i][1]);
359 SDDS_SetError(s);
360 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
361 }
362 }
/* n is the number of input-2 rows still flagged; with no -match options the
 * flags were all just set to 1, so n equals rows2.
 */
363 n = SDDS_CountRowsOfInterest(&SDDS_2);
364 if ((!n && !invert) || (n && invert))
365 /* No match in file2 for row j of file1, or unwanted match found--so don't copy it */
366 continue;
367 for (k = 0; k < rows2; k++) {
/* NOTE(review): this skips row k only when the flag is negative. Whether a
 * row that failed the string match gets a negative flag (rather than 0)
 * cannot be confirmed from this file -- verify against SDDS_GetRowFlag and
 * SDDS_MatchRowsOfInterest.
 */
368 if (SDDS_GetRowFlag(&SDDS_2, k) < 0)
369 /* Test if row k of file2 passed string-matches. If not, go to next row */
370 continue;
371 /* If row k of file2 is not already used, then test it for a match to row j of file1.
372 If no -equate options were given, this test is always true.
373 */
374 if (!row_used[k]) {
375 long equal;
376 equal = rows_equate(&SDDS_1, j, &SDDS_2, k, equate_columns, equate_column);
/* NOTE(review): with -invert the break fires on the first candidate that is
 * NOT equal, and the test after the loop then drops row j. So under -invert
 * with -equate, row j survives only if no candidate row differs -- confirm
 * this double inversion is intended.
 */
377 if ((equal && !invert) || (!equal && invert)) {
/* With -reuse (rows) the partner row is never marked consumed. */
378 row_used[k] = reuse ? 0 : 1;
379 break;
380 }
381 }
382 }
/* k == rows2 means the loop above finished without hitting the break. */
383 if ((k == rows2 && !invert) || (k != rows2 && invert))
384 /* No match in file2 for row j of file1, or unwanted match found--so don't copy it */
385 continue;
386 if (!SDDS_CopyRowDirect(&SDDS_output, outputRow, &SDDS_1, j)) {
387 sprintf(s, "Problem copying to row %" PRId64 " of output from row %" PRId64 " of data set 1.",
388 outputRow, j);
389 SDDS_SetError(s);
390 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
391 }
392 outputRow++;
393 }
394 if (!SDDS_WritePage(&SDDS_output)) {
395 SDDS_SetError("Problem writing data to output file.");
396 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
397 }
398 }
399
400 if (!SDDS_Terminate(&SDDS_1) ||
401 !SDDS_Terminate(&SDDS_2) ||
402 !SDDS_Terminate(&SDDS_output)) {
403 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
404 exit(EXIT_FAILURE);
405 }
406
/* If a temporary output file was used, it now replaces input1. */
407 if (tmpfile_used && !replaceFileAndBackUp(input1, output))
408 exit(EXIT_FAILURE);
409
410 return EXIT_SUCCESS;
411}
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:286
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyRowDirect(SDDS_DATASET *SDDS_target, int64_t target_row, SDDS_DATASET *SDDS_source, int64_t source_row)
Definition SDDS_copy.c:834
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_CopyArrays(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:334
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
int32_t SDDS_GetRowFlag(SDDS_DATASET *SDDS_dataset, int64_t row)
Retrieves the acceptance flag of a specific row in the current data table.
int32_t SDDS_SetRowFlags(SDDS_DATASET *SDDS_dataset, int32_t row_flag_value)
Sets the acceptance flags for all rows in the current data table of a data set.
int64_t SDDS_MatchRowsOfInterest(SDDS_DATASET *SDDS_dataset, char *selection_column, char *label_to_match, int32_t logic)
Matches and marks rows of interest in an SDDS dataset based on label matching.
void * SDDS_GetValue(SDDS_DATASET *SDDS_dataset, char *column_name, int64_t srow_index, void *memory)
Retrieves the value from a specified column and selected row, optionally storing it in provided memor...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
void SDDS_SetError(char *error_text)
Records an error message in the SDDS error stack.
Definition SDDS_utils.c:379
int32_t SDDS_ZeroMemory(void *mem, int64_t n_bytes)
Sets a block of memory to zero.
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
int32_t SDDS_GetColumnType(SDDS_DATASET *SDDS_dataset, int32_t index)
Retrieves the data type of a column in the SDDS dataset by its index.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_NUMERIC_TYPE(type)
Checks if the given type identifier corresponds to any numeric type.
Definition SDDStypes.h:138
void * trealloc(void *old_ptr, uint64_t size_of_block)
Reallocates a memory block to a new size.
Definition array.c:181
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
char * cp_str(char **s, char *t)
Copies a string, allocating memory for storage.
Definition cp_str.c:28
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
char * tmpname(char *s)
Supplies a unique temporary filename.
Definition tmpname.c:34

◆ rows_equate()

long rows_equate ( SDDS_DATASET * SDDS1,
int64_t row1,
SDDS_DATASET * SDDS2,
int64_t row2,
long equate_columns,
STRING_PAIR * equate_column )

Definition at line 413 of file sddsmselect.c.

/* rows_equate: test whether row1 of SDDS1 and row2 of SDDS2 hold identical
 * values in every column pair listed in equate_column.
 *
 * equate_column[i][0] names the column in SDDS1 and equate_column[i][1] the
 * column in SDDS2. Returns 1 when all equate_columns pairs compare equal
 * (trivially 1 when equate_columns is 0) and 0 at the first pair that
 * differs. A missing column or a type mismatch between the two columns is
 * reported through SDDS_SetError / SDDS_PrintErrors with
 * SDDS_EXIT_PrintErrors (presumably terminating the program -- confirm).
 */
413 {
414 char *data1, *data2;
415 long index1, index2, size, type, i;
416 char s[SDDS_MAXLINE];
/* Gives index2 a defined value if the first lookup below fails and the ||
 * short-circuits before index2 is assigned.
 */
417 index2 = 0;
418 for (i = 0; i < equate_columns; i++) {
419 if ((index1 = SDDS_GetColumnIndex(SDDS1, equate_column[i][0])) < 0 ||
420 (index2 = SDDS_GetColumnIndex(SDDS2, equate_column[i][1])) < 0) {
421 SDDS_SetError("Problem equating rows.");
422 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
423 }
/* The two columns must have exactly the same SDDS type; no numeric
 * conversion is attempted between, say, a long and a double column.
 */
424 if ((type = SDDS_GetColumnType(SDDS1, index1)) != SDDS_GetColumnType(SDDS2, index2)) {
425 sprintf(s, "Problem equating rows--types don't match for columns '%s' and '%s'.",
426 equate_column[i][0], equate_column[i][1]);
427 SDDS_SetError(s);
428 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
429 }
/* Each column is addressed as an array of `size`-byte elements indexed by
 * the raw row number, and the two elements are compared bytewise.
 * NOTE(review): for floating-point columns a bytewise test differs from ==
 * for NaN and for +0.0 versus -0.0 -- confirm that is acceptable.
 */
430 size = SDDS_GetTypeSize(type);
431 data1 = (char *)SDDS1->data[index1] + size * row1;
432 data2 = (char *)SDDS2->data[index2] + size * row2;
433 if (memcmp(data1, data2, size) != 0)
434 return 0;
435 }
436 return 1;
437}
int32_t SDDS_GetTypeSize(int32_t type)
Retrieves the size in bytes of a specified SDDS data type.