SDDSlib
Loading...
Searching...
No Matches
sddsoutlier.c File Reference

Eliminates statistical outliers from SDDS data files. More...

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"
#include "SDDSutils.h"
#include <ctype.h>

Go to the source code of this file.

Classes

struct  OUTLIER_CONTROL
 

Macros

#define USAGE
 
#define OUTLIER_CONTROL_INVOKED   0x00001U
 
#define OUTLIER_STDEV_GIVEN   0x00002U
 
#define OUTLIER_FRACTION_GIVEN   0x00004U
 
#define OUTLIER_STDEVLIMIT_GIVEN   0x00008U
 
#define OUTLIER_UNPOPULAR_BINS   0x00010U
 
#define OUTLIER_VERBOSE_GIVEN   0x00020U
 
#define OUTLIER_ABSLIMIT_GIVEN   0x00040U
 
#define OUTLIER_ABSDEVLIMIT_GIVEN   0x00080U
 
#define OUTLIER_INVERT_GIVEN   0x00100U
 
#define OUTLIER_MARKONLY   0x00200U
 
#define OUTLIER_CHANCELIMIT_GIVEN   0x00400U
 
#define OUTLIER_MAXLIMIT_GIVEN   0x00800U
 
#define OUTLIER_MINLIMIT_GIVEN   0x01000U
 
#define OUTLIER_REPLACELAST   0x02000U
 
#define OUTLIER_REPLACENEXT   0x04000U
 
#define OUTLIER_REPLACEINTERP   0x08000U
 
#define OUTLIER_REPLACEVALUE   0x10000U
 
#define OUTLIER_REPLACEFLAGS   (OUTLIER_REPLACELAST | OUTLIER_REPLACENEXT | OUTLIER_REPLACEINTERP | OUTLIER_REPLACEVALUE)
 
#define OUTLIER_PERCENTILE_LOWER   0x20000U
 
#define OUTLIER_PERCENTILE_UPPER   0x40000U
 
#define OUTLIER_PERCENTILE_FLAGS   (OUTLIER_PERCENTILE_LOWER | OUTLIER_PERCENTILE_UPPER)
 

Enumerations

enum  option_type {
  SET_COLUMNS , SET_EXCLUDE , SET_STDDEV_LIMIT , SET_ABS_LIMIT ,
  SET_ABSDEV_LIMIT , SET_VERBOSE , SET_PIPE , SET_NOWARNINGS ,
  SET_INVERT , SET_MARKONLY , SET_CHANCELIMIT , SET_PASSES ,
  SET_REPLACE , SET_MAXLIMIT , SET_MINLIMIT , SET_MAJOR_ORDER ,
  SET_PERCENTILE_LIMIT , SET_UNPOPULAR , N_OPTIONS
}
 

Functions

int64_t removeOutliers (SDDS_DATASET *SDDSin, int64_t rows, char **column, long columns, OUTLIER_CONTROL *outlierControl, int32_t *isOutlier)
 
long meanStDevForFlaggedData (double *mean, double *stDev, double *data, int32_t *keep, int64_t rows)
 
int main (int argc, char **argv)
 

Variables

char * option [N_OPTIONS]
 

Detailed Description

Eliminates statistical outliers from SDDS data files.

This program processes an input SDDS (Self Describing Data Set) file to identify and eliminate statistical outliers based on various criteria such as standard deviation limits, absolute limits, percentile limits, and more. Outliers can be removed, marked, or replaced with specified values. The processed data is then written to an output SDDS file.

Usage

sddsoutlier [<inputfile>] [<outputfile>] [OPTIONS]

Options

-pipe=[input][,output] -verbose -noWarnings -columns=<list-of-names> -excludeColumns=<list-of-names> -stDevLimit=

-absLimit=

-absDeviationLimit=

[,neighbor=<number>] -minimumLimit=

-maximumLimit=

-chanceLimit=<minimumChance> -percentileLimit=lower=<lowerPercent>,upper=<upperPercent> -unpopular=bins=<number> -invert -majorOrder -markOnly -replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>} -passes=<number>

License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
M. Borland, C. Saunders, R. Soliday, H. Shang

Definition in file sddsoutlier.c.

Macro Definition Documentation

◆ OUTLIER_ABSDEVLIMIT_GIVEN

#define OUTLIER_ABSDEVLIMIT_GIVEN   0x00080U

Definition at line 146 of file sddsoutlier.c.

◆ OUTLIER_ABSLIMIT_GIVEN

#define OUTLIER_ABSLIMIT_GIVEN   0x00040U

Definition at line 145 of file sddsoutlier.c.

◆ OUTLIER_CHANCELIMIT_GIVEN

#define OUTLIER_CHANCELIMIT_GIVEN   0x00400U

Definition at line 149 of file sddsoutlier.c.

◆ OUTLIER_CONTROL_INVOKED

#define OUTLIER_CONTROL_INVOKED   0x00001U

Definition at line 139 of file sddsoutlier.c.

◆ OUTLIER_FRACTION_GIVEN

#define OUTLIER_FRACTION_GIVEN   0x00004U

Definition at line 141 of file sddsoutlier.c.

◆ OUTLIER_INVERT_GIVEN

#define OUTLIER_INVERT_GIVEN   0x00100U

Definition at line 147 of file sddsoutlier.c.

◆ OUTLIER_MARKONLY

#define OUTLIER_MARKONLY   0x00200U

Definition at line 148 of file sddsoutlier.c.

◆ OUTLIER_MAXLIMIT_GIVEN

#define OUTLIER_MAXLIMIT_GIVEN   0x00800U

Definition at line 150 of file sddsoutlier.c.

◆ OUTLIER_MINLIMIT_GIVEN

#define OUTLIER_MINLIMIT_GIVEN   0x01000U

Definition at line 151 of file sddsoutlier.c.

◆ OUTLIER_PERCENTILE_FLAGS

#define OUTLIER_PERCENTILE_FLAGS   (OUTLIER_PERCENTILE_LOWER | OUTLIER_PERCENTILE_UPPER)

Definition at line 159 of file sddsoutlier.c.

◆ OUTLIER_PERCENTILE_LOWER

#define OUTLIER_PERCENTILE_LOWER   0x20000U

Definition at line 157 of file sddsoutlier.c.

◆ OUTLIER_PERCENTILE_UPPER

#define OUTLIER_PERCENTILE_UPPER   0x40000U

Definition at line 158 of file sddsoutlier.c.

◆ OUTLIER_REPLACEFLAGS

#define OUTLIER_REPLACEFLAGS   (OUTLIER_REPLACELAST | OUTLIER_REPLACENEXT | OUTLIER_REPLACEINTERP | OUTLIER_REPLACEVALUE)

Definition at line 156 of file sddsoutlier.c.

◆ OUTLIER_REPLACEINTERP

#define OUTLIER_REPLACEINTERP   0x08000U

Definition at line 154 of file sddsoutlier.c.

◆ OUTLIER_REPLACELAST

#define OUTLIER_REPLACELAST   0x02000U

Definition at line 152 of file sddsoutlier.c.

◆ OUTLIER_REPLACENEXT

#define OUTLIER_REPLACENEXT   0x04000U

Definition at line 153 of file sddsoutlier.c.

◆ OUTLIER_REPLACEVALUE

#define OUTLIER_REPLACEVALUE   0x10000U

Definition at line 155 of file sddsoutlier.c.

◆ OUTLIER_STDEV_GIVEN

#define OUTLIER_STDEV_GIVEN   0x00002U

Definition at line 140 of file sddsoutlier.c.

◆ OUTLIER_STDEVLIMIT_GIVEN

#define OUTLIER_STDEVLIMIT_GIVEN   0x00008U

Definition at line 142 of file sddsoutlier.c.

◆ OUTLIER_UNPOPULAR_BINS

#define OUTLIER_UNPOPULAR_BINS   0x00010U

Definition at line 143 of file sddsoutlier.c.

◆ OUTLIER_VERBOSE_GIVEN

#define OUTLIER_VERBOSE_GIVEN   0x00020U

Definition at line 144 of file sddsoutlier.c.

◆ USAGE

#define USAGE

Definition at line 97 of file sddsoutlier.c.

97#define USAGE "Usage:\n" \
98 "sddsoutlier [<inputfile>] [<outputfile>] [OPTIONS]\n\n" \
99 "Options:\n" \
100 " -pipe=[input][,output]\n" \
101 " Use standard input and/or output as data streams.\n" \
102 " -verbose\n" \
103 " Enable verbose output, displaying processing information.\n" \
104 " -noWarnings\n" \
105 " Suppress warning messages.\n" \
106 " -columns=<list-of-names>\n" \
107 " Specify a comma-separated list of column names to process.\n" \
108 " -excludeColumns=<list-of-names>\n" \
109 " Specify a comma-separated list of column names to exclude from processing.\n" \
110 " -stDevLimit=<value>\n" \
111 " Point is an outlier if it is more than <value> standard deviations from the mean.\n" \
112 " -absLimit=<value>\n" \
113 " Point is an outlier if it has an absolute value greater than <value>.\n" \
114 " -absDeviationLimit=<value>[,neighbor=<number>]\n" \
115 " Point is an outlier if its absolute deviation from the mean exceeds <value>.\n" \
116 " If neighbor is provided, the mean is computed with the neighbors instead of the whole data.\n" \
117 " -minimumLimit=<value>\n" \
118 " Point is an outlier if it is less than <value>.\n" \
119 " -maximumLimit=<value>\n" \
120 " Point is an outlier if it is greater than <value>.\n" \
121 " -chanceLimit=<minimumChance>\n" \
122 " Point is an outlier if it has a probability less than <minimumChance> of occurring (Gaussian statistics).\n" \
123 " -percentileLimit=lower=<lowerPercent>,upper=<upperPercent>\n" \
124 " Point is an outlier if it is below the <lowerPercent> percentile or above the <upperPercent> percentile.\n" \
125 " -unpopular=bins=<number>\n" \
126 " Remove points that are not in the most populated bin based on a histogram with <number> bins.\n" \
127 " -invert\n" \
128 " Invert the outlier selection criteria.\n" \
129 " -majorOrder=row|column\n" \
130 " Specify output file data ordering as row or column major.\n" \
131 " -markOnly\n" \
132 " Mark identified outliers without removing them.\n" \
133 " -replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>}\n" \
134 " Replace outliers with specified values or strategies.\n" \
135 " -passes=<number>\n" \
136 " Define the number of passes for outlier detection.\n\n" \
137 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"

Enumeration Type Documentation

◆ option_type

enum option_type

Definition at line 54 of file sddsoutlier.c.

54 {
55 SET_COLUMNS,
56 SET_EXCLUDE,
57 SET_STDDEV_LIMIT,
58 SET_ABS_LIMIT,
59 SET_ABSDEV_LIMIT,
60 SET_VERBOSE,
61 SET_PIPE,
62 SET_NOWARNINGS,
63 SET_INVERT,
64 SET_MARKONLY,
65 SET_CHANCELIMIT,
66 SET_PASSES,
67 SET_REPLACE,
68 SET_MAXLIMIT,
69 SET_MINLIMIT,
70 SET_MAJOR_ORDER,
71 SET_PERCENTILE_LIMIT,
72 SET_UNPOPULAR,
73 N_OPTIONS
74};

Function Documentation

◆ main()

int main ( int argc,
char ** argv )

Definition at line 175 of file sddsoutlier.c.

175 {
176 int iArg;
177 char **column, **excludeColumn;
178 long columns, excludeColumns;
179 char *input, *output;
180 SCANNED_ARG *scanned;
181 SDDS_DATASET SDDSin, SDDSout;
182 long readCode, dataLimitGiven, tmpfileUsed;
183 int64_t i, rows;
184 long noWarnings, isOutlierIndex;
185 int32_t *isOutlier;
186 OUTLIER_CONTROL outlierControl;
187 unsigned long pipeFlags, tmpFlags, majorOrderFlag, dummyFlags;
188 short columnMajorOrder = -1;
189
191 argc = scanargs(&scanned, argc, argv);
192 if (argc < 3) {
193 bomb(NULL, USAGE);
194 }
195
196 output = input = NULL;
197 columns = excludeColumns = dataLimitGiven = 0;
198 column = excludeColumn = NULL;
199
200 outlierControl.flags = 0;
201 outlierControl.passes = 1;
202 outlierControl.neighbors = 0;
203 pipeFlags = tmpfileUsed = noWarnings = isOutlierIndex = 0;
204
205 for (iArg = 1; iArg < argc; iArg++) {
206 if (scanned[iArg].arg_type == OPTION) {
207 /* process options here */
208 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
209 case SET_MAJOR_ORDER:
210 majorOrderFlag = 0;
211 scanned[iArg].n_items--;
212 if (scanned[iArg].n_items > 0 &&
213 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
214 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
215 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
216 SDDS_Bomb("invalid -majorOrder syntax/values");
217 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
218 columnMajorOrder = 1;
219 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
220 columnMajorOrder = 0;
221 break;
222 case SET_COLUMNS:
223 if (columns)
224 SDDS_Bomb("only one -columns option may be given");
225 if (scanned[iArg].n_items < 2)
226 SDDS_Bomb("invalid -columns syntax");
227 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
228 for (i = 0; i < columns; i++)
229 column[i] = scanned[iArg].list[i + 1];
230 break;
231 case SET_EXCLUDE:
232 if (excludeColumns)
233 SDDS_Bomb("only one -excludecolumns option may be given");
234 if (scanned[iArg].n_items < 2)
235 SDDS_Bomb("invalid -excludecolumns syntax");
236 excludeColumn = tmalloc(sizeof(*excludeColumn) * (excludeColumns = scanned[iArg].n_items - 1));
237 for (i = 0; i < excludeColumns; i++)
238 excludeColumn[i] = scanned[iArg].list[i + 1];
239 break;
240 case SET_STDDEV_LIMIT:
241 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1], "%lf", &outlierControl.stDevLimit) != 1 ||
242 outlierControl.stDevLimit <= 0)
243 SDDS_Bomb("invalid -stDevLimit syntax");
244 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
245 break;
246 case SET_ABS_LIMIT:
247 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absoluteLimit) != 1 ||
248 outlierControl.absoluteLimit <= 0)
249 SDDS_Bomb("invalid -absLimit syntax");
250 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSLIMIT_GIVEN;
251 break;
252 case SET_ABSDEV_LIMIT:
253 if (scanned[iArg].n_items < 2)
254 SDDS_Bomb("invalid -absDeviationLimit syntax");
255 if (scanned[iArg].n_items == 2) {
256 if (sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
257 SDDS_Bomb("invalid -absDeviationLimit syntax");
258 } else {
259 if (sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
260 SDDS_Bomb("invalid -absDeviationLimit syntax");
261 scanned[iArg].list += 2;
262 scanned[iArg].n_items -= 2;
263 if (scanned[iArg].n_items > 0 &&
264 (!scanItemList(&dummyFlags, scanned[iArg].list, &scanned[iArg].n_items, 0, "neighbors", SDDS_LONG, &(outlierControl.neighbors), 1, 0, NULL)))
265 SDDS_Bomb("invalid -absDeviationLimit syntax/value");
266 if (outlierControl.neighbors % 2 == 0)
267 outlierControl.neighbors += 1;
268 /* always make it an odd number */
269 scanned[iArg].list -= 2;
270 scanned[iArg].n_items += 2;
271 }
272 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSDEVLIMIT_GIVEN;
273 break;
274 case SET_VERBOSE:
275 outlierControl.flags |= OUTLIER_VERBOSE_GIVEN;
276 break;
277 case SET_PIPE:
278 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
279 SDDS_Bomb("invalid -pipe syntax");
280 break;
281 case SET_NOWARNINGS:
282 noWarnings = 1;
283 break;
284 case SET_INVERT:
285 outlierControl.flags |= OUTLIER_INVERT_GIVEN;
286 break;
287 case SET_MARKONLY:
288 outlierControl.flags |= OUTLIER_MARKONLY;
289 break;
290 case SET_CHANCELIMIT:
291 if (scanned[iArg].n_items != 2 ||
292 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.chanceLimit) != 1 ||
293 outlierControl.chanceLimit <= 0)
294 SDDS_Bomb("invalid -chanceLimit syntax");
295 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_CHANCELIMIT_GIVEN;
296 break;
297 case SET_PASSES:
298 if (scanned[iArg].n_items != 2 ||
299 sscanf(scanned[iArg].list[1], "%ld", &outlierControl.passes) != 1 ||
300 outlierControl.passes < 1)
301 SDDS_Bomb("invalid -passes syntax");
302 break;
303 case SET_MAXLIMIT:
304 outlierControl.flags |= OUTLIER_MAXLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
305 if (scanned[iArg].n_items != 2 ||
306 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.maximumLimit) != 1)
307 SDDS_Bomb("invalid -maximumLimit syntax");
308 break;
309 case SET_MINLIMIT:
310 outlierControl.flags |= OUTLIER_MINLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
311 if (scanned[iArg].n_items != 2 ||
312 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.minimumLimit) != 1)
313 SDDS_Bomb("invalid -minimumLimit syntax");
314 break;
315 case SET_REPLACE:
316 if (scanned[iArg].n_items != 2)
317 SDDS_Bomb("invalid -replace syntax");
318 scanned[iArg].n_items -= 1;
319 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
320 "lastvalue", -1, NULL, 0, OUTLIER_REPLACELAST,
321 "nextvalue", -1, NULL, 0, OUTLIER_REPLACENEXT,
322 "interpolatedvalue", -1, NULL, 0, OUTLIER_REPLACEINTERP,
323 "value", SDDS_DOUBLE, &outlierControl.replacementValue, 1, OUTLIER_REPLACEVALUE, NULL))
324 SDDS_Bomb("invalid -replace syntax/values");
325 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
326 break;
327 case SET_PERCENTILE_LIMIT:
328 if (scanned[iArg].n_items < 3)
329 SDDS_Bomb("invalid -percentileLimit syntax");
330 scanned[iArg].n_items -= 1;
331 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
332 "lower", SDDS_DOUBLE, &outlierControl.percentilePoint[0], 1, OUTLIER_PERCENTILE_LOWER,
333 "upper", SDDS_DOUBLE, &outlierControl.percentilePoint[1], 1, OUTLIER_PERCENTILE_UPPER, NULL) ||
334 !(tmpFlags & OUTLIER_PERCENTILE_LOWER) || !(tmpFlags & OUTLIER_PERCENTILE_UPPER) ||
335 outlierControl.percentilePoint[0] >= outlierControl.percentilePoint[1])
336 SDDS_Bomb("invalid -percentileLimit syntax");
337 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
338 break;
339 case SET_UNPOPULAR:
340 if (scanned[iArg].n_items < 2)
341 SDDS_Bomb("invalid -unpopular syntax");
342 scanned[iArg].n_items -= 1;
343 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
344 "bins", SDDS_LONG, &(outlierControl.unpopularBins), 1, OUTLIER_UNPOPULAR_BINS, NULL) ||
345 !(tmpFlags & OUTLIER_UNPOPULAR_BINS) || outlierControl.unpopularBins < 2)
346 SDDS_Bomb("invalid -unpopular syntax");
347 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
348 break;
349 default:
350 fprintf(stderr, "Error: Unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
351 exit(EXIT_FAILURE);
352 break;
353 }
354 } else {
355 if (!input)
356 input = scanned[iArg].list[0];
357 else if (!output)
358 output = scanned[iArg].list[0];
359 else
360 SDDS_Bomb("too many filenames seen");
361 }
362 }
363 if (outlierControl.flags & OUTLIER_REPLACEFLAGS && outlierControl.flags & OUTLIER_MARKONLY)
364 SDDS_Bomb("Cannot use -replaceOnly and -markOnly simultaneously.");
365
366 processFilenames("sddsoutlier", &input, &output, pipeFlags, noWarnings, &tmpfileUsed);
367
368 if (!(outlierControl.flags & OUTLIER_CONTROL_INVOKED)) {
369 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
370 outlierControl.stDevLimit = 2;
371 }
372
373 if (!SDDS_InitializeInput(&SDDSin, input) ||
374 !SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
375 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
376
377 if (columnMajorOrder != -1)
378 SDDSout.layout.data_mode.column_major = columnMajorOrder;
379 else
380 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
381
382 if (outlierControl.flags & OUTLIER_MARKONLY &&
383 (isOutlierIndex = SDDS_GetColumnIndex(&SDDSout, "IsOutlier")) < 0) {
384 if (!SDDS_DefineColumn(&SDDSout, "IsOutlier", NULL, NULL, NULL, NULL, SDDS_SHORT, 0))
385 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
386 }
387
388 if (!SDDS_WriteLayout(&SDDSout))
389 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
390
391 if ((columns = expandColumnPairNames(&SDDSout, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
392 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
393 SDDS_Bomb("No columns selected for outlier control.");
394 }
395
396 isOutlier = NULL;
397 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
398 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
399 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
400
401 if ((rows = SDDS_CountRowsOfInterest(&SDDSout)) < 3) {
402 if (!SDDS_WritePage(&SDDSout))
403 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
404 continue;
405 }
406 if (outlierControl.flags & OUTLIER_MARKONLY) {
407 if (isOutlierIndex >= 0) {
408 if (!(isOutlier = SDDS_GetNumericColumn(&SDDSout, "IsOutlier", SDDS_LONG)))
409 SDDS_Bomb("Unable to retrieve 'IsOutlier' column from input file despite its existence.");
410 } else {
411 long i;
412 isOutlier = SDDS_Realloc(isOutlier, sizeof(*isOutlier) * rows);
413 if (!isOutlier)
414 SDDS_Bomb("Memory allocation failure.");
415 for (i = 0; i < rows; i++)
416 isOutlier[i] = 0;
417 }
418 }
419 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
420 fprintf(stderr, "%" PRId64 " rows in page %ld\n", rows, readCode);
421 if ((rows = removeOutliers(&SDDSout, rows, column, columns, &outlierControl, isOutlier)) == 0) {
422 if (!noWarnings)
423 fprintf(stderr, " No rows left after outlier control--skipping page.\n");
424 continue;
425 }
426 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
427 fprintf(stderr, "%" PRId64 " rows left after outlier control\n", rows);
428 if (rows != SDDS_CountRowsOfInterest(&SDDSout)) {
429 fprintf(stderr, "Problem with row selection:\n %" PRId64 " expected, %" PRId64 " counted\n", rows, SDDS_CountRowsOfInterest(&SDDSout));
430 exit(EXIT_FAILURE);
431 }
432 if ((isOutlier && !SDDS_SetColumnFromLongs(&SDDSout, SDDS_SET_BY_NAME, isOutlier, rows, "IsOutlier")) ||
433 !SDDS_WritePage(&SDDSout))
434 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
435 }
436 if (!SDDS_Terminate(&SDDSin) || !SDDS_Terminate(&SDDSout))
437 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
438 if (tmpfileUsed && !replaceFileAndBackUp(input, output))
439 exit(EXIT_FAILURE);
440 return EXIT_SUCCESS;
441}
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetColumnFromLongs(SDDS_DATASET *SDDS_dataset, int32_t mode, int32_t *data, int64_t rows,...)
Sets the values for a single data column using long integer numbers.
void * SDDS_GetNumericColumn(SDDS_DATASET *SDDS_dataset, char *column_name, int32_t desiredType)
Retrieves the data of a specified numerical column as an array of a desired numerical type,...
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_SHORT
Identifier for the signed short integer data type.
Definition SDDStypes.h:73
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.

◆ meanStDevForFlaggedData()

long meanStDevForFlaggedData ( double * mean,
double * stDev,
double * data,
int32_t * keep,
int64_t rows )

Definition at line 750 of file sddsoutlier.c.

750 {
751 int64_t irow, summed;
752 double sum1, sum2, value;
753
754 *mean = *stDev = 0;
755 for (irow = sum1 = summed = 0; irow < rows; irow++) {
756 if (!keep[irow])
757 continue;
758 sum1 += data[irow];
759 summed++;
760 }
761 if (summed < 2)
762 return 0;
763 *mean = sum1 / summed;
764 for (irow = sum2 = 0; irow < rows; irow++) {
765 if (keep[irow]) {
766 value = data[irow] - *mean;
767 sum2 += value * value;
768 }
769 }
770 *stDev = sqrt(sum2 / (summed - 1));
771 return 1;
772}

◆ removeOutliers()

int64_t removeOutliers ( SDDS_DATASET * SDDSin,
int64_t rows,
char ** column,
long columns,
OUTLIER_CONTROL * outlierControl,
int32_t * isOutlier )

Definition at line 443 of file sddsoutlier.c.

443 {
444 long icol, ipass;
445 int64_t irow, kept, killed, j, k, summed;
446 double *data, sum1, stDev, mean;
447 static int32_t *keep = NULL;
448 double lastGoodValue = 0;
449 int64_t irow0, irow1;
450
451 if (!SDDS_SetRowFlags(dataset, 1))
452 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
453
454 /* Allocate or reallocate the keep array */
455 keep = SDDS_Realloc(keep, sizeof(*keep) * rows);
456 if (!keep)
457 SDDS_Bomb("Memory allocation failure.");
458
459 if (!isOutlier) {
460 for (irow = 0; irow < rows; irow++)
461 keep[irow] = 1;
462 kept = rows;
463 } else {
464 for (irow = kept = 0; irow < rows; irow++)
465 if ((keep[irow] = !isOutlier[irow]))
466 kept++;
467 }
468
469 for (icol = 0; icol < columns; icol++) {
470 /* Loop over columns for which outlier control is to be done */
471 data = SDDS_GetColumnInDoubles(dataset, column[icol]);
472 if (!data)
473 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
474
475 for (ipass = 0; ipass < outlierControl->passes; ipass++) {
476 if (outlierControl->flags & OUTLIER_UNPOPULAR_BINS && rows > 1) {
477 double *hist, lo, hi, delta;
478 long imin, imax, ih;
479 hist = tmalloc(sizeof(*hist) * outlierControl->unpopularBins);
480 find_min_max(&lo, &hi, data, rows);
481 make_histogram(hist, outlierControl->unpopularBins, lo, hi, data, rows, 1);
482 delta = (hi - lo) / outlierControl->unpopularBins; /* yes, not bins-1 */
483 index_min_max(&imin, &imax, hist, outlierControl->unpopularBins);
484 for (irow = killed = 0; irow < rows; irow++) {
485 ih = (data[irow] - lo) / delta;
486 if (ih != imax) {
487 killed++;
488 kept--;
489 keep[irow] = 0;
490 }
491 }
492 free(hist);
493 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
494 fprintf(stderr, "%" PRId64 " additional rows killed by column %s unpopular control\n", killed, column[icol]);
495 }
496
497 if (outlierControl->flags & OUTLIER_PERCENTILE_FLAGS) {
498 double percentileResult[2];
499 killed = 0;
500 if (compute_percentiles(percentileResult, outlierControl->percentilePoint, 2, data, rows)) {
501 for (irow = killed = 0; irow < rows; irow++) {
502 if ((data[irow] < percentileResult[0] || data[irow] > percentileResult[1]) && keep[irow]) {
503 killed++;
504 kept--;
505 keep[irow] = 0;
506 }
507 }
508 }
509 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
510 fprintf(stderr, "%" PRId64 " additional rows killed by column %s percentile outlier control\n", killed, column[icol]);
511 }
512
513 if (outlierControl->flags & OUTLIER_MINLIMIT_GIVEN) {
514 /* Limit values to exceed a minimum */
515 for (irow = killed = 0; irow < rows; irow++) {
516 if (keep[irow] && data[irow] < outlierControl->minimumLimit) {
517 kept--;
518 keep[irow] = 0;
519 killed++;
520 }
521 }
522 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
523 fprintf(stderr, "%" PRId64 " additional rows killed by column %s minimum value outlier control\n", killed, column[icol]);
524 }
525
526 if (outlierControl->flags & OUTLIER_MAXLIMIT_GIVEN) {
527 /* Limit values to exceed a maximum */
528 for (irow = killed = 0; irow < rows; irow++) {
529 if (keep[irow] && data[irow] > outlierControl->maximumLimit) {
530 kept--;
531 keep[irow] = 0;
532 killed++;
533 }
534 }
535 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
536 fprintf(stderr, "%" PRId64 " additional rows killed by column %s maximum value outlier control\n", killed, column[icol]);
537 }
538
539 if (outlierControl->flags & OUTLIER_ABSLIMIT_GIVEN) {
540 /* Limit absolute values */
541 for (irow = killed = 0; irow < rows; irow++) {
542 if (keep[irow] && fabs(data[irow]) > outlierControl->absoluteLimit) {
543 kept--;
544 keep[irow] = 0;
545 killed++;
546 }
547 }
548 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
549 fprintf(stderr, "%" PRId64 " additional rows killed by column %s absolute value outlier control\n", killed, column[icol]);
550 }
551
552 if (outlierControl->flags & OUTLIER_ABSDEVLIMIT_GIVEN) {
553 /* Limit absolute deviation from mean */
554 if (outlierControl->neighbors > 0) {
555 for (irow = killed = 0; irow < rows; irow++) {
556 if (!keep[irow])
557 continue;
558 mean = 0;
559 for (j = irow - outlierControl->neighbors / 2; j <= irow + outlierControl->neighbors / 2; j++) {
560 if (j < 0)
561 k = irow + outlierControl->neighbors / 2 - j;
562 else if (j > rows - 1)
563 k = irow - outlierControl->neighbors / 2 - (j - rows + 1);
564 else
565 k = j;
566 mean += fabs(data[k]);
567 }
568 mean = mean / outlierControl->neighbors;
569 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
570 keep[irow] = 0;
571 kept--;
572 killed++;
573 }
574 }
575 } else {
576 for (irow = sum1 = summed = 0; irow < rows; irow++) {
577 if (!keep[irow])
578 continue;
579 sum1 += data[irow];
580 summed++;
581 }
582 if (summed < 1)
583 continue;
584 mean = sum1 / summed;
585 for (irow = killed = 0; irow < rows; irow++)
586 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
587 keep[irow] = 0;
588 kept--;
589 killed++;
590 }
591 }
592 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
593 fprintf(stderr, "%" PRId64 " additional rows killed by column %s absolute deviation outlier control\n", killed, column[icol]);
594 }
595
596 if (outlierControl->flags & OUTLIER_STDEV_GIVEN && kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
597 /* Limit deviation from mean in units of standard deviations */
598 for (irow = killed = 0; irow < rows; irow++)
599 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->stDevLimit * stDev) {
600 keep[irow] = 0;
601 kept--;
602 killed++;
603 }
604 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
605 fprintf(stderr, "%" PRId64 " additional rows killed by column %s standard deviation outlier control\n", killed, column[icol]);
606 }
607
608 if (outlierControl->flags & OUTLIER_CHANCELIMIT_GIVEN) {
609 /* Limit improbability of a point occurring based on Gaussian statistics */
610 if (kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
611 int64_t lastKept;
612 double gProb, probOfSeeing;
613 lastKept = kept;
614 for (irow = killed = 0; irow < rows; irow++) {
615 if (!keep[irow])
616 continue;
617 /* gProb = (probability of value >= x) */
618 /* (1-gProb)^n = probability of not seeing x in n trials */
619 gProb = normSigLevel((data[irow] - mean) / stDev, 2);
620 probOfSeeing = 1 - ipow(1 - gProb, lastKept);
621 if (probOfSeeing < outlierControl->chanceLimit) {
622 keep[irow] = 0;
623 kept--;
624 killed++;
625 }
626 }
627 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
628 fprintf(stderr, "%" PRId64 " additional rows killed by column %s chance limit outlier control\n", killed, column[icol]);
629 }
630 }
631 }
632
633 if (outlierControl->flags & OUTLIER_REPLACEFLAGS && (outlierControl->flags & OUTLIER_INVERT_GIVEN)) {
634 for (irow = 0; irow < rows; irow++)
635 keep[irow] = !keep[irow];
636 kept = rows - kept;
637 }
638
639 if (outlierControl->flags & OUTLIER_REPLACELAST) {
640 for (irow = 0; irow < rows; irow++) {
641 if (keep[irow]) {
642 lastGoodValue = data[irow];
643 break;
644 }
645 }
646 for (irow = 0; irow < rows; irow++) {
647 if (!keep[irow]) {
648 keep[irow] = 1;
649 data[irow] = lastGoodValue;
650 } else
651 lastGoodValue = data[irow];
652 }
653 kept = rows;
654 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
655 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
656 } else if (outlierControl->flags & OUTLIER_REPLACENEXT) {
657 for (irow = rows - 1; irow >= 0; irow--) {
658 if (keep[irow]) {
659 lastGoodValue = data[irow];
660 break;
661 }
662 }
663 for (irow = rows - 1; irow >= 0; irow--) {
664 if (!keep[irow]) {
665 data[irow] = lastGoodValue;
666 keep[irow] = 1;
667 } else
668 lastGoodValue = data[irow];
669 }
670 kept = rows;
671 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
672 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
673 } else if (outlierControl->flags & OUTLIER_REPLACEINTERP) {
674 irow0 = -1;
675 irow1 = -1;
676 for (irow = 0; irow < rows; irow++) {
677 if (!keep[irow]) {
678 if ((irow0 = irow - 1) >= 0) {
679 if ((irow1 = irow + 1) < rows) {
680 while (irow1 < rows && !keep[irow1])
681 irow1++;
682 }
683 if (irow1 < rows && keep[irow1]) {
684 /* irow is bracketed by irow0 and irow1, both of which have good data */
685 if (!keep[irow0])
686 SDDS_Bomb("Bracketing problem.");
687 for (; irow < irow1; irow++)
688 data[irow] = data[irow0] + (data[irow1] - data[irow0]) / (1.0 * irow1 - irow0) * (irow - irow0);
689 continue;
690 } else {
691 /* Ran off the end with bad points---just duplicate the last good point */
692 for (; irow < rows; irow++)
693 data[irow] = data[irow0];
694 continue;
695 }
696 } else {
697 /* No good point precedes this point---look for a good point following it */
698 for (irow1 = irow + 1; irow1 < rows; irow1++) {
699 if (keep[irow1])
700 break;
701 }
702 if (irow1 != rows) {
703 for (; irow < irow1; irow++)
704 data[irow] = data[irow1];
705 continue;
706 }
707 }
708 }
709 }
710 for (irow = 0; irow < rows; irow++)
711 keep[irow] = 1;
712 kept = rows;
713 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
714 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
715 } else if (outlierControl->flags & OUTLIER_REPLACEVALUE) {
716 for (irow = 0; irow < rows; irow++) {
717 if (!keep[irow]) {
718 data[irow] = outlierControl->replacementValue;
719 keep[irow] = 1;
720 }
721 }
722 kept = rows;
723 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
724 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
725 }
726 free(data);
727 }
728
729 if (outlierControl->flags & OUTLIER_INVERT_GIVEN) {
730 for (irow = 0; irow < rows; irow++)
731 keep[irow] = !keep[irow];
732 kept = rows - kept;
733 if (outlierControl->flags & OUTLIER_VERBOSE_GIVEN)
734 fprintf(stderr, "%" PRId64 " rows left after inversion\n", kept);
735 }
736
737 if (isOutlier && (outlierControl->flags & OUTLIER_MARKONLY)) {
738 for (irow = 0; irow < rows; irow++)
739 isOutlier[irow] = !keep[irow];
740 if (!SDDS_SetRowFlags(dataset, 1))
741 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
742 return rows;
743 }
744
745 if (!SDDS_AssertRowFlags(dataset, SDDS_FLAG_ARRAY, keep, rows))
746 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
747 return kept;
748}
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_AssertRowFlags(SDDS_DATASET *SDDS_dataset, uint32_t mode,...)
Sets acceptance flags for rows based on specified criteria.
int32_t SDDS_SetRowFlags(SDDS_DATASET *SDDS_dataset, int32_t row_flag_value)
Sets the acceptance flags for all rows in the current data table of a data set.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int index_min_max(int64_t *imin, int64_t *imax, double *list, int64_t n)
Finds the indices of the minimum and maximum values in a list of doubles.
Definition findMinMax.c:116
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
double ipow(const double x, const int64_t p)
Compute x raised to the power p (x^p).
Definition ipow.c:33
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long compute_percentiles(double *position, double *percent, long positions, double *x, long n)
Computes multiple percentiles of an array of doubles.
Definition median.c:84
double normSigLevel(double z0, long tails)
Computes the probability that a standard normal variable exceeds a given value.
Definition sigLevel.c:40

Variable Documentation

◆ option

char* option[N_OPTIONS]
Initial value:
= {
"columns",
"excludecolumns",
"stdevlimit",
"abslimit",
"absdeviationlimit",
"verbose",
"pipe",
"nowarnings",
"invert",
"markonly",
"chancelimit",
"passes",
"replaceonly",
"maximumlimit",
"minimumlimit",
"majororder",
"percentilelimit",
"unpopular",
}

Definition at line 76 of file sddsoutlier.c.

76 {
77 "columns",
78 "excludecolumns",
79 "stdevlimit",
80 "abslimit",
81 "absdeviationlimit",
82 "verbose",
83 "pipe",
84 "nowarnings",
85 "invert",
86 "markonly",
87 "chancelimit",
88 "passes",
89 "replaceonly",
90 "maximumlimit",
91 "minimumlimit",
92 "majororder",
93 "percentilelimit",
94 "unpopular",
95};