SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsoutlier.c File Reference

Detailed Description

Eliminates statistical outliers from SDDS data files.

This program processes an input SDDS (Self Describing Data Set) file to identify and eliminate statistical outliers based on various criteria such as standard deviation limits, absolute limits, percentile limits, and more. Outliers can be removed, marked, or replaced with specified values. The processed data is then written to an output SDDS file.

Usage

sddsoutlier [<inputfile>] [<outputfile>]
[-pipe=[input][,output]]
[-verbose]
[-noWarnings]
-columns=<list-of-names>
[-excludeColumns=<list-of-names>]
-stDevLimit=<value>
[-absLimit=<value>]
[-absDeviationLimit=<value>[,neighbor=<number>]]
[-maximumLimit=<value>]
[-minimumLimit=<value>]
[-chanceLimit=<minimumChance>]
[-passes=<number>]
[-percentileLimit=lower=<lowerPercent>,upper=<upperPercent>]
[-unpopular=bins=<number>]
[-invert]
[-majorOrder]
[-markOnly]
[-replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>}]

Options

Required Description
-columns Specifies a list of column names to process.
-stDevLimit Sets the standard deviation limit for outlier detection.
Optional Description
-pipe Use standard input/output as data streams.
-verbose Enables verbose output, displaying processing information.
-noWarnings Suppresses warning messages.
-excludeColumns Specifies a list of column names to exclude from processing.
-absLimit Sets an absolute value limit for outlier detection.
-absDeviationLimit Sets the absolute deviation limit from the mean with optional neighbors.
-maximumLimit Sets a maximum value for outlier detection.
-minimumLimit Sets a minimum value for outlier detection.
-chanceLimit Sets a minimum probability threshold for outlier detection (Gaussian statistics).
-passes Sets the number of passes for outlier detection.
-percentileLimit Sets percentile limits for outlier detection.
-unpopular Removes points not in the most populated bin of a histogram.
-invert Inverts the outlier selection criteria.
-majorOrder Specifies output file data ordering (row or column major).
-markOnly Marks identified outliers without removing them.
-replaceOnly Replaces outliers with specified values.

Incompatibilities

  • -markOnly is incompatible with:
    • -replaceOnly
  • For -absDeviationLimit:
    • Requires a positive value for the limit.
    • Optional neighbor parameter must be odd.
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Authors
M. Borland, C. Saunders, R. Soliday, H. Shang

Definition in file sddsoutlier.c.

#include "mdb.h"
#include "SDDS.h"
#include "scan.h"
#include "SDDSutils.h"
#include <ctype.h>

Go to the source code of this file.

Functions

int64_t removeOutliers (SDDS_DATASET *SDDSin, int64_t rows, char **column, long columns, OUTLIER_CONTROL *outlierControl, int32_t *isOutlier)
 
long meanStDevForFlaggedData (double *mean, double *stDev, double *data, int32_t *keep, int64_t rows)
 
int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int argc,
char ** argv )

Definition at line 224 of file sddsoutlier.c.

224 {
225 int iArg;
226 char **column, **excludeColumn;
227 long columns, excludeColumns;
228 char *input, *output;
229 SCANNED_ARG *scanned;
230 SDDS_DATASET SDDSin, SDDSout;
231 long readCode, dataLimitGiven, tmpfileUsed;
232 int64_t i, rows;
233 long noWarnings, isOutlierIndex;
234 int32_t *isOutlier;
235 OUTLIER_CONTROL outlierControl;
236 unsigned long pipeFlags, tmpFlags, majorOrderFlag, dummyFlags;
237 short columnMajorOrder = -1;
238
240 argc = scanargs(&scanned, argc, argv);
241 if (argc < 3) {
242 bomb(NULL, USAGE);
243 }
244
245 output = input = NULL;
246 columns = excludeColumns = dataLimitGiven = 0;
247 column = excludeColumn = NULL;
248
249 outlierControl.flags = 0;
250 outlierControl.passes = 1;
251 outlierControl.neighbors = 0;
252 pipeFlags = tmpfileUsed = noWarnings = isOutlierIndex = 0;
253
254 for (iArg = 1; iArg < argc; iArg++) {
255 if (scanned[iArg].arg_type == OPTION) {
256 /* process options here */
257 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
258 case SET_MAJOR_ORDER:
259 majorOrderFlag = 0;
260 scanned[iArg].n_items--;
261 if (scanned[iArg].n_items > 0 &&
262 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
263 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
264 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
265 SDDS_Bomb("invalid -majorOrder syntax/values");
266 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
267 columnMajorOrder = 1;
268 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
269 columnMajorOrder = 0;
270 break;
271 case SET_COLUMNS:
272 if (columns)
273 SDDS_Bomb("only one -columns option may be given");
274 if (scanned[iArg].n_items < 2)
275 SDDS_Bomb("invalid -columns syntax");
276 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
277 for (i = 0; i < columns; i++)
278 column[i] = scanned[iArg].list[i + 1];
279 break;
280 case SET_EXCLUDE:
281 if (excludeColumns)
282 SDDS_Bomb("only one -excludecolumns option may be given");
283 if (scanned[iArg].n_items < 2)
284 SDDS_Bomb("invalid -excludecolumns syntax");
285 excludeColumn = tmalloc(sizeof(*excludeColumn) * (excludeColumns = scanned[iArg].n_items - 1));
286 for (i = 0; i < excludeColumns; i++)
287 excludeColumn[i] = scanned[iArg].list[i + 1];
288 break;
289 case SET_STDDEV_LIMIT:
290 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1], "%lf", &outlierControl.stDevLimit) != 1 ||
291 outlierControl.stDevLimit <= 0)
292 SDDS_Bomb("invalid -stDevLimit syntax");
293 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
294 break;
295 case SET_ABS_LIMIT:
296 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absoluteLimit) != 1 ||
297 outlierControl.absoluteLimit <= 0)
298 SDDS_Bomb("invalid -absLimit syntax");
299 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSLIMIT_GIVEN;
300 break;
301 case SET_ABSDEV_LIMIT:
302 if (scanned[iArg].n_items < 2)
303 SDDS_Bomb("invalid -absDeviationLimit syntax");
304 if (scanned[iArg].n_items == 2) {
305 if (sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
306 SDDS_Bomb("invalid -absDeviationLimit syntax");
307 } else {
308 if (sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
309 SDDS_Bomb("invalid -absDeviationLimit syntax");
310 scanned[iArg].list += 2;
311 scanned[iArg].n_items -= 2;
312 if (scanned[iArg].n_items > 0 &&
313 (!scanItemList(&dummyFlags, scanned[iArg].list, &scanned[iArg].n_items, 0, "neighbors", SDDS_LONG, &(outlierControl.neighbors), 1, 0, NULL)))
314 SDDS_Bomb("invalid -absDeviationLimit syntax/value");
315 if (outlierControl.neighbors % 2 == 0)
316 outlierControl.neighbors += 1;
317 /* always make it an odd number */
318 scanned[iArg].list -= 2;
319 scanned[iArg].n_items += 2;
320 }
321 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSDEVLIMIT_GIVEN;
322 break;
323 case SET_VERBOSE:
324 outlierControl.flags |= OUTLIER_VERBOSE_GIVEN;
325 break;
326 case SET_PIPE:
327 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
328 SDDS_Bomb("invalid -pipe syntax");
329 break;
330 case SET_NOWARNINGS:
331 noWarnings = 1;
332 break;
333 case SET_INVERT:
334 outlierControl.flags |= OUTLIER_INVERT_GIVEN;
335 break;
336 case SET_MARKONLY:
337 outlierControl.flags |= OUTLIER_MARKONLY;
338 break;
339 case SET_CHANCELIMIT:
340 if (scanned[iArg].n_items != 2 ||
341 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.chanceLimit) != 1 ||
342 outlierControl.chanceLimit <= 0)
343 SDDS_Bomb("invalid -chanceLimit syntax");
344 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_CHANCELIMIT_GIVEN;
345 break;
346 case SET_PASSES:
347 if (scanned[iArg].n_items != 2 ||
348 sscanf(scanned[iArg].list[1], "%ld", &outlierControl.passes) != 1 ||
349 outlierControl.passes < 1)
350 SDDS_Bomb("invalid -passes syntax");
351 break;
352 case SET_MAXLIMIT:
353 outlierControl.flags |= OUTLIER_MAXLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
354 if (scanned[iArg].n_items != 2 ||
355 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.maximumLimit) != 1)
356 SDDS_Bomb("invalid -maximumLimit syntax");
357 break;
358 case SET_MINLIMIT:
359 outlierControl.flags |= OUTLIER_MINLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
360 if (scanned[iArg].n_items != 2 ||
361 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.minimumLimit) != 1)
362 SDDS_Bomb("invalid -minimumLimit syntax");
363 break;
364 case SET_REPLACE:
365 if (scanned[iArg].n_items != 2)
366 SDDS_Bomb("invalid -replace syntax");
367 scanned[iArg].n_items -= 1;
368 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
369 "lastvalue", -1, NULL, 0, OUTLIER_REPLACELAST,
370 "nextvalue", -1, NULL, 0, OUTLIER_REPLACENEXT,
371 "interpolatedvalue", -1, NULL, 0, OUTLIER_REPLACEINTERP,
372 "value", SDDS_DOUBLE, &outlierControl.replacementValue, 1, OUTLIER_REPLACEVALUE, NULL))
373 SDDS_Bomb("invalid -replace syntax/values");
374 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
375 break;
376 case SET_PERCENTILE_LIMIT:
377 if (scanned[iArg].n_items < 3)
378 SDDS_Bomb("invalid -percentileLimit syntax");
379 scanned[iArg].n_items -= 1;
380 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
381 "lower", SDDS_DOUBLE, &outlierControl.percentilePoint[0], 1, OUTLIER_PERCENTILE_LOWER,
382 "upper", SDDS_DOUBLE, &outlierControl.percentilePoint[1], 1, OUTLIER_PERCENTILE_UPPER, NULL) ||
383 !(tmpFlags & OUTLIER_PERCENTILE_LOWER) || !(tmpFlags & OUTLIER_PERCENTILE_UPPER) ||
384 outlierControl.percentilePoint[0] >= outlierControl.percentilePoint[1])
385 SDDS_Bomb("invalid -percentileLimit syntax");
386 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
387 break;
388 case SET_UNPOPULAR:
389 if (scanned[iArg].n_items < 2)
390 SDDS_Bomb("invalid -unpopular syntax");
391 scanned[iArg].n_items -= 1;
392 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
393 "bins", SDDS_LONG, &(outlierControl.unpopularBins), 1, OUTLIER_UNPOPULAR_BINS, NULL) ||
394 !(tmpFlags & OUTLIER_UNPOPULAR_BINS) || outlierControl.unpopularBins < 2)
395 SDDS_Bomb("invalid -unpopular syntax");
396 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
397 break;
398 default:
399 fprintf(stderr, "Error: Unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
400 exit(EXIT_FAILURE);
401 break;
402 }
403 } else {
404 if (!input)
405 input = scanned[iArg].list[0];
406 else if (!output)
407 output = scanned[iArg].list[0];
408 else
409 SDDS_Bomb("too many filenames seen");
410 }
411 }
412 if (outlierControl.flags & OUTLIER_REPLACEFLAGS && outlierControl.flags & OUTLIER_MARKONLY)
413 SDDS_Bomb("Cannot use -replaceOnly and -markOnly simultaneously.");
414
415 processFilenames("sddsoutlier", &input, &output, pipeFlags, noWarnings, &tmpfileUsed);
416
417 if (!(outlierControl.flags & OUTLIER_CONTROL_INVOKED)) {
418 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
419 outlierControl.stDevLimit = 2;
420 }
421
422 if (!SDDS_InitializeInput(&SDDSin, input) ||
423 !SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
424 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
425
426 if (columnMajorOrder != -1)
427 SDDSout.layout.data_mode.column_major = columnMajorOrder;
428 else
429 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
430
431 if (outlierControl.flags & OUTLIER_MARKONLY &&
432 (isOutlierIndex = SDDS_GetColumnIndex(&SDDSout, "IsOutlier")) < 0) {
433 if (!SDDS_DefineColumn(&SDDSout, "IsOutlier", NULL, NULL, NULL, NULL, SDDS_SHORT, 0))
434 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
435 }
436
437 if (!SDDS_WriteLayout(&SDDSout))
438 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
439
440 if ((columns = expandColumnPairNames(&SDDSout, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
441 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
442 SDDS_Bomb("No columns selected for outlier control.");
443 }
444
445 isOutlier = NULL;
446 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
447 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
448 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
449
450 if ((rows = SDDS_CountRowsOfInterest(&SDDSout)) < 3) {
451 if (!SDDS_WritePage(&SDDSout))
452 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
453 continue;
454 }
455 if (outlierControl.flags & OUTLIER_MARKONLY) {
456 if (isOutlierIndex >= 0) {
457 if (!(isOutlier = SDDS_GetNumericColumn(&SDDSout, "IsOutlier", SDDS_LONG)))
458 SDDS_Bomb("Unable to retrieve 'IsOutlier' column from input file despite its existence.");
459 } else {
460 long i;
461 isOutlier = SDDS_Realloc(isOutlier, sizeof(*isOutlier) * rows);
462 if (!isOutlier)
463 SDDS_Bomb("Memory allocation failure.");
464 for (i = 0; i < rows; i++)
465 isOutlier[i] = 0;
466 }
467 }
468 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
469 fprintf(stderr, "%" PRId64 " rows in page %ld\n", rows, readCode);
470 if ((rows = removeOutliers(&SDDSout, rows, column, columns, &outlierControl, isOutlier)) == 0) {
471 if (!noWarnings)
472 fprintf(stderr, " No rows left after outlier control--skipping page.\n");
473 continue;
474 }
475 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
476 fprintf(stderr, "%" PRId64 " rows left after outlier control\n", rows);
477 if (rows != SDDS_CountRowsOfInterest(&SDDSout)) {
478 fprintf(stderr, "Problem with row selection:\n %" PRId64 " expected, %" PRId64 " counted\n", rows, SDDS_CountRowsOfInterest(&SDDSout));
479 exit(EXIT_FAILURE);
480 }
481 if ((isOutlier && !SDDS_SetColumnFromLongs(&SDDSout, SDDS_SET_BY_NAME, isOutlier, rows, "IsOutlier")) ||
482 !SDDS_WritePage(&SDDSout))
483 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
484 }
485 if (!SDDS_Terminate(&SDDSin) || !SDDS_Terminate(&SDDSout))
486 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
487 if (tmpfileUsed && !replaceFileAndBackUp(input, output))
488 exit(EXIT_FAILURE);
489 return EXIT_SUCCESS;
490}
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetColumnFromLongs(SDDS_DATASET *SDDS_dataset, int32_t mode, int32_t *data, int64_t rows,...)
Sets the values for a single data column using long integer numbers.
void * SDDS_GetNumericColumn(SDDS_DATASET *SDDS_dataset, char *column_name, int32_t desiredType)
Retrieves the data of a specified numerical column as an array of a desired numerical type,...
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_SHORT
Identifier for the signed short integer data type.
Definition SDDStypes.h:73
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.

◆ meanStDevForFlaggedData()

long meanStDevForFlaggedData ( double * mean,
double * stDev,
double * data,
int32_t * keep,
int64_t rows )

Definition at line 799 of file sddsoutlier.c.

799 {
800 int64_t irow, summed;
801 double sum1, sum2, value;
802
803 *mean = *stDev = 0;
804 for (irow = sum1 = summed = 0; irow < rows; irow++) {
805 if (!keep[irow])
806 continue;
807 sum1 += data[irow];
808 summed++;
809 }
810 if (summed < 2)
811 return 0;
812 *mean = sum1 / summed;
813 for (irow = sum2 = 0; irow < rows; irow++) {
814 if (keep[irow]) {
815 value = data[irow] - *mean;
816 sum2 += value * value;
817 }
818 }
819 *stDev = sqrt(sum2 / (summed - 1));
820 return 1;
821}

◆ removeOutliers()

int64_t removeOutliers ( SDDS_DATASET * SDDSin,
int64_t rows,
char ** column,
long columns,
OUTLIER_CONTROL * outlierControl,
int32_t * isOutlier )

Definition at line 492 of file sddsoutlier.c.

492 {
493 long icol, ipass;
494 int64_t irow, kept, killed, j, k, summed;
495 double *data, sum1, stDev, mean;
496 static int32_t *keep = NULL;
497 double lastGoodValue = 0;
498 int64_t irow0, irow1;
499
500 if (!SDDS_SetRowFlags(dataset, 1))
501 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
502
503 /* Allocate or reallocate the keep array */
504 keep = SDDS_Realloc(keep, sizeof(*keep) * rows);
505 if (!keep)
506 SDDS_Bomb("Memory allocation failure.");
507
508 if (!isOutlier) {
509 for (irow = 0; irow < rows; irow++)
510 keep[irow] = 1;
511 kept = rows;
512 } else {
513 for (irow = kept = 0; irow < rows; irow++)
514 if ((keep[irow] = !isOutlier[irow]))
515 kept++;
516 }
517
518 for (icol = 0; icol < columns; icol++) {
519 /* Loop over columns for which outlier control is to be done */
520 data = SDDS_GetColumnInDoubles(dataset, column[icol]);
521 if (!data)
522 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
523
524 for (ipass = 0; ipass < outlierControl->passes; ipass++) {
525 if (outlierControl->flags & OUTLIER_UNPOPULAR_BINS && rows > 1) {
526 double *hist, lo, hi, delta;
527 int64_t imin, imax, ih;
528 hist = tmalloc(sizeof(*hist) * outlierControl->unpopularBins);
529 find_min_max(&lo, &hi, data, rows);
530 make_histogram(hist, outlierControl->unpopularBins, lo, hi, data, rows, 1);
531 delta = (hi - lo) / outlierControl->unpopularBins; /* yes, not bins-1 */
532 index_min_max(&imin, &imax, hist, outlierControl->unpopularBins);
533 for (irow = killed = 0; irow < rows; irow++) {
534 ih = (data[irow] - lo) / delta;
535 if (ih != imax) {
536 killed++;
537 kept--;
538 keep[irow] = 0;
539 }
540 }
541 free(hist);
542 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
543 fprintf(stderr, "%" PRId64 " additional rows killed by column %s unpopular control\n", killed, column[icol]);
544 }
545
546 if (outlierControl->flags & OUTLIER_PERCENTILE_FLAGS) {
547 double percentileResult[2];
548 killed = 0;
549 if (compute_percentiles(percentileResult, outlierControl->percentilePoint, 2, data, rows)) {
550 for (irow = killed = 0; irow < rows; irow++) {
551 if ((data[irow] < percentileResult[0] || data[irow] > percentileResult[1]) && keep[irow]) {
552 killed++;
553 kept--;
554 keep[irow] = 0;
555 }
556 }
557 }
558 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
559 fprintf(stderr, "%" PRId64 " additional rows killed by column %s percentile outlier control\n", killed, column[icol]);
560 }
561
562 if (outlierControl->flags & OUTLIER_MINLIMIT_GIVEN) {
563 /* Limit values to exceed a minimum */
564 for (irow = killed = 0; irow < rows; irow++) {
565 if (keep[irow] && data[irow] < outlierControl->minimumLimit) {
566 kept--;
567 keep[irow] = 0;
568 killed++;
569 }
570 }
571 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
572 fprintf(stderr, "%" PRId64 " additional rows killed by column %s minimum value outlier control\n", killed, column[icol]);
573 }
574
575 if (outlierControl->flags & OUTLIER_MAXLIMIT_GIVEN) {
576 /* Limit values to exceed a maximum */
577 for (irow = killed = 0; irow < rows; irow++) {
578 if (keep[irow] && data[irow] > outlierControl->maximumLimit) {
579 kept--;
580 keep[irow] = 0;
581 killed++;
582 }
583 }
584 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
585 fprintf(stderr, "%" PRId64 " additional rows killed by column %s maximum value outlier control\n", killed, column[icol]);
586 }
587
588 if (outlierControl->flags & OUTLIER_ABSLIMIT_GIVEN) {
589 /* Limit absolute values */
590 for (irow = killed = 0; irow < rows; irow++) {
591 if (keep[irow] && fabs(data[irow]) > outlierControl->absoluteLimit) {
592 kept--;
593 keep[irow] = 0;
594 killed++;
595 }
596 }
597 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
598 fprintf(stderr, "%" PRId64 " additional rows killed by column %s absolute value outlier control\n", killed, column[icol]);
599 }
600
601 if (outlierControl->flags & OUTLIER_ABSDEVLIMIT_GIVEN) {
602 /* Limit absolute deviation from mean */
603 if (outlierControl->neighbors > 0) {
604 for (irow = killed = 0; irow < rows; irow++) {
605 if (!keep[irow])
606 continue;
607 mean = 0;
608 for (j = irow - outlierControl->neighbors / 2; j <= irow + outlierControl->neighbors / 2; j++) {
609 if (j < 0)
610 k = irow + outlierControl->neighbors / 2 - j;
611 else if (j > rows - 1)
612 k = irow - outlierControl->neighbors / 2 - (j - rows + 1);
613 else
614 k = j;
615 mean += fabs(data[k]);
616 }
617 mean = mean / outlierControl->neighbors;
618 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
619 keep[irow] = 0;
620 kept--;
621 killed++;
622 }
623 }
624 } else {
625 for (irow = sum1 = summed = 0; irow < rows; irow++) {
626 if (!keep[irow])
627 continue;
628 sum1 += data[irow];
629 summed++;
630 }
631 if (summed < 1)
632 continue;
633 mean = sum1 / summed;
634 for (irow = killed = 0; irow < rows; irow++)
635 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
636 keep[irow] = 0;
637 kept--;
638 killed++;
639 }
640 }
641 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
642 fprintf(stderr, "%" PRId64 " additional rows killed by column %s absolute deviation outlier control\n", killed, column[icol]);
643 }
644
645 if (outlierControl->flags & OUTLIER_STDEV_GIVEN && kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
646 /* Limit deviation from mean in units of standard deviations */
647 for (irow = killed = 0; irow < rows; irow++)
648 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->stDevLimit * stDev) {
649 keep[irow] = 0;
650 kept--;
651 killed++;
652 }
653 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
654 fprintf(stderr, "%" PRId64 " additional rows killed by column %s standard deviation outlier control\n", killed, column[icol]);
655 }
656
657 if (outlierControl->flags & OUTLIER_CHANCELIMIT_GIVEN) {
658 /* Limit improbability of a point occurring based on Gaussian statistics */
659 if (kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
660 int64_t lastKept;
661 double gProb, probOfSeeing;
662 lastKept = kept;
663 for (irow = killed = 0; irow < rows; irow++) {
664 if (!keep[irow])
665 continue;
666 /* gProb = (probability of value >= x) */
667 /* (1-gProb)^n = probability of not seeing x in n trials */
668 gProb = normSigLevel((data[irow] - mean) / stDev, 2);
669 probOfSeeing = 1 - ipow(1 - gProb, lastKept);
670 if (probOfSeeing < outlierControl->chanceLimit) {
671 keep[irow] = 0;
672 kept--;
673 killed++;
674 }
675 }
676 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
677 fprintf(stderr, "%" PRId64 " additional rows killed by column %s chance limit outlier control\n", killed, column[icol]);
678 }
679 }
680 }
681
682 if (outlierControl->flags & OUTLIER_REPLACEFLAGS && (outlierControl->flags & OUTLIER_INVERT_GIVEN)) {
683 for (irow = 0; irow < rows; irow++)
684 keep[irow] = !keep[irow];
685 kept = rows - kept;
686 }
687
688 if (outlierControl->flags & OUTLIER_REPLACELAST) {
689 for (irow = 0; irow < rows; irow++) {
690 if (keep[irow]) {
691 lastGoodValue = data[irow];
692 break;
693 }
694 }
695 for (irow = 0; irow < rows; irow++) {
696 if (!keep[irow]) {
697 keep[irow] = 1;
698 data[irow] = lastGoodValue;
699 } else
700 lastGoodValue = data[irow];
701 }
702 kept = rows;
703 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
704 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
705 } else if (outlierControl->flags & OUTLIER_REPLACENEXT) {
706 for (irow = rows - 1; irow >= 0; irow--) {
707 if (keep[irow]) {
708 lastGoodValue = data[irow];
709 break;
710 }
711 }
712 for (irow = rows - 1; irow >= 0; irow--) {
713 if (!keep[irow]) {
714 data[irow] = lastGoodValue;
715 keep[irow] = 1;
716 } else
717 lastGoodValue = data[irow];
718 }
719 kept = rows;
720 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
721 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
722 } else if (outlierControl->flags & OUTLIER_REPLACEINTERP) {
723 irow0 = -1;
724 irow1 = -1;
725 for (irow = 0; irow < rows; irow++) {
726 if (!keep[irow]) {
727 if ((irow0 = irow - 1) >= 0) {
728 if ((irow1 = irow + 1) < rows) {
729 while (irow1 < rows && !keep[irow1])
730 irow1++;
731 }
732 if (irow1 < rows && keep[irow1]) {
733 /* irow is bracketed by irow0 and irow1, both of which have good data */
734 if (!keep[irow0])
735 SDDS_Bomb("Bracketing problem.");
736 for (; irow < irow1; irow++)
737 data[irow] = data[irow0] + (data[irow1] - data[irow0]) / (1.0 * irow1 - irow0) * (irow - irow0);
738 continue;
739 } else {
740 /* Ran off the end with bad points---just duplicate the last good point */
741 for (; irow < rows; irow++)
742 data[irow] = data[irow0];
743 continue;
744 }
745 } else {
746 /* No good point precedes this point---look for a good point following it */
747 for (irow1 = irow + 1; irow1 < rows; irow1++) {
748 if (keep[irow1])
749 break;
750 }
751 if (irow1 != rows) {
752 for (; irow < irow1; irow++)
753 data[irow] = data[irow1];
754 continue;
755 }
756 }
757 }
758 }
759 for (irow = 0; irow < rows; irow++)
760 keep[irow] = 1;
761 kept = rows;
762 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
763 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
764 } else if (outlierControl->flags & OUTLIER_REPLACEVALUE) {
765 for (irow = 0; irow < rows; irow++) {
766 if (!keep[irow]) {
767 data[irow] = outlierControl->replacementValue;
768 keep[irow] = 1;
769 }
770 }
771 kept = rows;
772 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
773 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
774 }
775 free(data);
776 }
777
778 if (outlierControl->flags & OUTLIER_INVERT_GIVEN) {
779 for (irow = 0; irow < rows; irow++)
780 keep[irow] = !keep[irow];
781 kept = rows - kept;
782 if (outlierControl->flags & OUTLIER_VERBOSE_GIVEN)
783 fprintf(stderr, "%" PRId64 " rows left after inversion\n", kept);
784 }
785
786 if (isOutlier && (outlierControl->flags & OUTLIER_MARKONLY)) {
787 for (irow = 0; irow < rows; irow++)
788 isOutlier[irow] = !keep[irow];
789 if (!SDDS_SetRowFlags(dataset, 1))
790 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
791 return rows;
792 }
793
794 if (!SDDS_AssertRowFlags(dataset, SDDS_FLAG_ARRAY, keep, rows))
795 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
796 return kept;
797}
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_AssertRowFlags(SDDS_DATASET *SDDS_dataset, uint32_t mode,...)
Sets acceptance flags for rows based on specified criteria.
int32_t SDDS_SetRowFlags(SDDS_DATASET *SDDS_dataset, int32_t row_flag_value)
Sets the acceptance flags for all rows in the current data table of a data set.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int index_min_max(int64_t *imin, int64_t *imax, double *list, int64_t n)
Finds the indices of the minimum and maximum values in a list of doubles.
Definition findMinMax.c:116
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
double ipow(const double x, const int64_t p)
Compute x raised to the power p (x^p).
Definition ipow.c:33
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long compute_percentiles(double *position, double *percent, long positions, double *x, long n)
Computes multiple percentiles of an array of doubles.
Definition median.c:84
double normSigLevel(double z0, long tails)
Computes the probability that a standard normal variable exceeds a given value.
Definition sigLevel.c:40