SDDSlib
Loading...
Searching...
No Matches
sddsoutlier.c
Go to the documentation of this file.
1/**
2 * @file sddsoutlier.c
3 * @brief Eliminates statistical outliers from SDDS data files.
4 *
5 * This program processes an input SDDS (Self Describing Data Set) file to identify and eliminate
6 * statistical outliers based on various criteria such as standard deviation limits, absolute
7 * limits, percentile limits, and more. Outliers can be removed, marked, or replaced with
8 * specified values. The processed data is then written to an output SDDS file.
9 *
10 * ## Usage
11 * ```
12 * sddsoutlier [<inputfile>] [<outputfile>] [OPTIONS]
13 * ```
14 *
15 * ## Options
16 *
17 * -pipe=[input][,output]
18 * -verbose
19 * -noWarnings
20 * -columns=<list-of-names>
21 * -excludeColumns=<list-of-names>
22 * -stDevLimit=<value>
23 * -absLimit=<value>
24 * -absDeviationLimit=<value>[,neighbor=<number>]
25 * -minimumLimit=<value>
26 * -maximumLimit=<value>
27 * -chanceLimit=<minimumChance>
28 * -percentileLimit=lower=<lowerPercent>,upper=<upperPercent>
29 * -unpopular=bins=<number>
30 * -invert
31 * -majorOrder
32 * -markOnly
33 * -replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>}
34 * -passes=<number>
35 *
36 * @copyright
37 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
38 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
39 *
40 * @license
41 * This file is distributed under the terms of the Software License Agreement
42 * found in the file LICENSE included with this distribution.
43 *
44 * @author M. Borland, C. Saunders, R. Soliday, H. Shang
45 */
46
47#include "mdb.h"
48#include "SDDS.h"
49#include "scan.h"
50#include "SDDSutils.h"
51#include <ctype.h>
52
53/* Enumeration for option types */
54enum option_type {
55 SET_COLUMNS,
56 SET_EXCLUDE,
57 SET_STDDEV_LIMIT,
58 SET_ABS_LIMIT,
59 SET_ABSDEV_LIMIT,
60 SET_VERBOSE,
61 SET_PIPE,
62 SET_NOWARNINGS,
63 SET_INVERT,
64 SET_MARKONLY,
65 SET_CHANCELIMIT,
66 SET_PASSES,
67 SET_REPLACE,
68 SET_MAXLIMIT,
69 SET_MINLIMIT,
70 SET_MAJOR_ORDER,
71 SET_PERCENTILE_LIMIT,
72 SET_UNPOPULAR,
73 N_OPTIONS
74};
75
76char *option[N_OPTIONS] = {
77 "columns",
78 "excludecolumns",
79 "stdevlimit",
80 "abslimit",
81 "absdeviationlimit",
82 "verbose",
83 "pipe",
84 "nowarnings",
85 "invert",
86 "markonly",
87 "chancelimit",
88 "passes",
89 "replaceonly",
90 "maximumlimit",
91 "minimumlimit",
92 "majororder",
93 "percentilelimit",
94 "unpopular",
95};
96
97#define USAGE "Usage:\n" \
98 "sddsoutlier [<inputfile>] [<outputfile>] [OPTIONS]\n\n" \
99 "Options:\n" \
100 " -pipe=[input][,output]\n" \
101 " Use standard input and/or output as data streams.\n" \
102 " -verbose\n" \
103 " Enable verbose output, displaying processing information.\n" \
104 " -noWarnings\n" \
105 " Suppress warning messages.\n" \
106 " -columns=<list-of-names>\n" \
107 " Specify a comma-separated list of column names to process.\n" \
108 " -excludeColumns=<list-of-names>\n" \
109 " Specify a comma-separated list of column names to exclude from processing.\n" \
110 " -stDevLimit=<value>\n" \
111 " Point is an outlier if it is more than <value> standard deviations from the mean.\n" \
112 " -absLimit=<value>\n" \
113 " Point is an outlier if it has an absolute value greater than <value>.\n" \
114 " -absDeviationLimit=<value>[,neighbor=<number>]\n" \
115 " Point is an outlier if its absolute deviation from the mean exceeds <value>.\n" \
116 " If neighbor is provided, the mean is computed with the neighbors instead of the whole data.\n" \
117 " -minimumLimit=<value>\n" \
118 " Point is an outlier if it is less than <value>.\n" \
119 " -maximumLimit=<value>\n" \
120 " Point is an outlier if it is greater than <value>.\n" \
121 " -chanceLimit=<minimumChance>\n" \
122 " Point is an outlier if it has a probability less than <minimumChance> of occurring (Gaussian statistics).\n" \
123 " -percentileLimit=lower=<lowerPercent>,upper=<upperPercent>\n" \
124 " Point is an outlier if it is below the <lowerPercent> percentile or above the <upperPercent> percentile.\n" \
125 " -unpopular=bins=<number>\n" \
126 " Remove points that are not in the most populated bin based on a histogram with <number> bins.\n" \
127 " -invert\n" \
128 " Invert the outlier selection criteria.\n" \
129 " -majorOrder=row|column\n" \
130 " Specify output file data ordering as row or column major.\n" \
131 " -markOnly\n" \
132 " Mark identified outliers without removing them.\n" \
133 " -replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>}\n" \
134 " Replace outliers with specified values or strategies.\n" \
135 " -passes=<number>\n" \
136 " Define the number of passes for outlier detection.\n\n" \
137 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"
138
139#define OUTLIER_CONTROL_INVOKED 0x00001U
140#define OUTLIER_STDEV_GIVEN 0x00002U
141#define OUTLIER_FRACTION_GIVEN 0x00004U
142#define OUTLIER_STDEVLIMIT_GIVEN 0x00008U
143#define OUTLIER_UNPOPULAR_BINS 0x00010U
144#define OUTLIER_VERBOSE_GIVEN 0x00020U
145#define OUTLIER_ABSLIMIT_GIVEN 0x00040U
146#define OUTLIER_ABSDEVLIMIT_GIVEN 0x00080U
147#define OUTLIER_INVERT_GIVEN 0x00100U
148#define OUTLIER_MARKONLY 0x00200U
149#define OUTLIER_CHANCELIMIT_GIVEN 0x00400U
150#define OUTLIER_MAXLIMIT_GIVEN 0x00800U
151#define OUTLIER_MINLIMIT_GIVEN 0x01000U
152#define OUTLIER_REPLACELAST 0x02000U
153#define OUTLIER_REPLACENEXT 0x04000U
154#define OUTLIER_REPLACEINTERP 0x08000U
155#define OUTLIER_REPLACEVALUE 0x10000U
156#define OUTLIER_REPLACEFLAGS (OUTLIER_REPLACELAST | OUTLIER_REPLACENEXT | OUTLIER_REPLACEINTERP | OUTLIER_REPLACEVALUE)
157#define OUTLIER_PERCENTILE_LOWER 0x20000U
158#define OUTLIER_PERCENTILE_UPPER 0x40000U
159#define OUTLIER_PERCENTILE_FLAGS (OUTLIER_PERCENTILE_LOWER | OUTLIER_PERCENTILE_UPPER)
160
161typedef struct
162{
163 double stDevLimit, fractionLimit, absoluteLimit, absDevLimit;
164 double chanceLimit, replacementValue, maximumLimit, minimumLimit;
165 double percentilePoint[2];
166 long passes;
167 int32_t unpopularBins;
168 int32_t neighbors;
169 unsigned long flags;
171
172int64_t removeOutliers(SDDS_DATASET *SDDSin, int64_t rows, char **column, long columns, OUTLIER_CONTROL *outlierControl, int32_t *isOutlier);
173long meanStDevForFlaggedData(double *mean, double *stDev, double *data, int32_t *keep, int64_t rows);
174
175int main(int argc, char **argv) {
176 int iArg;
177 char **column, **excludeColumn;
178 long columns, excludeColumns;
179 char *input, *output;
180 SCANNED_ARG *scanned;
181 SDDS_DATASET SDDSin, SDDSout;
182 long readCode, dataLimitGiven, tmpfileUsed;
183 int64_t i, rows;
184 long noWarnings, isOutlierIndex;
185 int32_t *isOutlier;
186 OUTLIER_CONTROL outlierControl;
187 unsigned long pipeFlags, tmpFlags, majorOrderFlag, dummyFlags;
188 short columnMajorOrder = -1;
189
191 argc = scanargs(&scanned, argc, argv);
192 if (argc < 3) {
193 bomb(NULL, USAGE);
194 }
195
196 output = input = NULL;
197 columns = excludeColumns = dataLimitGiven = 0;
198 column = excludeColumn = NULL;
199
200 outlierControl.flags = 0;
201 outlierControl.passes = 1;
202 outlierControl.neighbors = 0;
203 pipeFlags = tmpfileUsed = noWarnings = isOutlierIndex = 0;
204
205 for (iArg = 1; iArg < argc; iArg++) {
206 if (scanned[iArg].arg_type == OPTION) {
207 /* process options here */
208 switch (match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
209 case SET_MAJOR_ORDER:
210 majorOrderFlag = 0;
211 scanned[iArg].n_items--;
212 if (scanned[iArg].n_items > 0 &&
213 (!scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
214 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
215 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
216 SDDS_Bomb("invalid -majorOrder syntax/values");
217 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
218 columnMajorOrder = 1;
219 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
220 columnMajorOrder = 0;
221 break;
222 case SET_COLUMNS:
223 if (columns)
224 SDDS_Bomb("only one -columns option may be given");
225 if (scanned[iArg].n_items < 2)
226 SDDS_Bomb("invalid -columns syntax");
227 column = tmalloc(sizeof(*column) * (columns = scanned[iArg].n_items - 1));
228 for (i = 0; i < columns; i++)
229 column[i] = scanned[iArg].list[i + 1];
230 break;
231 case SET_EXCLUDE:
232 if (excludeColumns)
233 SDDS_Bomb("only one -excludecolumns option may be given");
234 if (scanned[iArg].n_items < 2)
235 SDDS_Bomb("invalid -excludecolumns syntax");
236 excludeColumn = tmalloc(sizeof(*excludeColumn) * (excludeColumns = scanned[iArg].n_items - 1));
237 for (i = 0; i < excludeColumns; i++)
238 excludeColumn[i] = scanned[iArg].list[i + 1];
239 break;
240 case SET_STDDEV_LIMIT:
241 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1], "%lf", &outlierControl.stDevLimit) != 1 ||
242 outlierControl.stDevLimit <= 0)
243 SDDS_Bomb("invalid -stDevLimit syntax");
244 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
245 break;
246 case SET_ABS_LIMIT:
247 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absoluteLimit) != 1 ||
248 outlierControl.absoluteLimit <= 0)
249 SDDS_Bomb("invalid -absLimit syntax");
250 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSLIMIT_GIVEN;
251 break;
252 case SET_ABSDEV_LIMIT:
253 if (scanned[iArg].n_items < 2)
254 SDDS_Bomb("invalid -absDeviationLimit syntax");
255 if (scanned[iArg].n_items == 2) {
256 if (sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
257 SDDS_Bomb("invalid -absDeviationLimit syntax");
258 } else {
259 if (sscanf(scanned[iArg].list[1], "%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
260 SDDS_Bomb("invalid -absDeviationLimit syntax");
261 scanned[iArg].list += 2;
262 scanned[iArg].n_items -= 2;
263 if (scanned[iArg].n_items > 0 &&
264 (!scanItemList(&dummyFlags, scanned[iArg].list, &scanned[iArg].n_items, 0, "neighbors", SDDS_LONG, &(outlierControl.neighbors), 1, 0, NULL)))
265 SDDS_Bomb("invalid -absDeviationLimit syntax/value");
266 if (outlierControl.neighbors % 2 == 0)
267 outlierControl.neighbors += 1;
268 /* always make it an odd number */
269 scanned[iArg].list -= 2;
270 scanned[iArg].n_items += 2;
271 }
272 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSDEVLIMIT_GIVEN;
273 break;
274 case SET_VERBOSE:
275 outlierControl.flags |= OUTLIER_VERBOSE_GIVEN;
276 break;
277 case SET_PIPE:
278 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
279 SDDS_Bomb("invalid -pipe syntax");
280 break;
281 case SET_NOWARNINGS:
282 noWarnings = 1;
283 break;
284 case SET_INVERT:
285 outlierControl.flags |= OUTLIER_INVERT_GIVEN;
286 break;
287 case SET_MARKONLY:
288 outlierControl.flags |= OUTLIER_MARKONLY;
289 break;
290 case SET_CHANCELIMIT:
291 if (scanned[iArg].n_items != 2 ||
292 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.chanceLimit) != 1 ||
293 outlierControl.chanceLimit <= 0)
294 SDDS_Bomb("invalid -chanceLimit syntax");
295 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_CHANCELIMIT_GIVEN;
296 break;
297 case SET_PASSES:
298 if (scanned[iArg].n_items != 2 ||
299 sscanf(scanned[iArg].list[1], "%ld", &outlierControl.passes) != 1 ||
300 outlierControl.passes < 1)
301 SDDS_Bomb("invalid -passes syntax");
302 break;
303 case SET_MAXLIMIT:
304 outlierControl.flags |= OUTLIER_MAXLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
305 if (scanned[iArg].n_items != 2 ||
306 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.maximumLimit) != 1)
307 SDDS_Bomb("invalid -maximumLimit syntax");
308 break;
309 case SET_MINLIMIT:
310 outlierControl.flags |= OUTLIER_MINLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
311 if (scanned[iArg].n_items != 2 ||
312 sscanf(scanned[iArg].list[1], "%lf", &outlierControl.minimumLimit) != 1)
313 SDDS_Bomb("invalid -minimumLimit syntax");
314 break;
315 case SET_REPLACE:
316 if (scanned[iArg].n_items != 2)
317 SDDS_Bomb("invalid -replace syntax");
318 scanned[iArg].n_items -= 1;
319 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
320 "lastvalue", -1, NULL, 0, OUTLIER_REPLACELAST,
321 "nextvalue", -1, NULL, 0, OUTLIER_REPLACENEXT,
322 "interpolatedvalue", -1, NULL, 0, OUTLIER_REPLACEINTERP,
323 "value", SDDS_DOUBLE, &outlierControl.replacementValue, 1, OUTLIER_REPLACEVALUE, NULL))
324 SDDS_Bomb("invalid -replace syntax/values");
325 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
326 break;
327 case SET_PERCENTILE_LIMIT:
328 if (scanned[iArg].n_items < 3)
329 SDDS_Bomb("invalid -percentileLimit syntax");
330 scanned[iArg].n_items -= 1;
331 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
332 "lower", SDDS_DOUBLE, &outlierControl.percentilePoint[0], 1, OUTLIER_PERCENTILE_LOWER,
333 "upper", SDDS_DOUBLE, &outlierControl.percentilePoint[1], 1, OUTLIER_PERCENTILE_UPPER, NULL) ||
334 !(tmpFlags & OUTLIER_PERCENTILE_LOWER) || !(tmpFlags & OUTLIER_PERCENTILE_UPPER) ||
335 outlierControl.percentilePoint[0] >= outlierControl.percentilePoint[1])
336 SDDS_Bomb("invalid -percentileLimit syntax");
337 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
338 break;
339 case SET_UNPOPULAR:
340 if (scanned[iArg].n_items < 2)
341 SDDS_Bomb("invalid -unpopular syntax");
342 scanned[iArg].n_items -= 1;
343 if (!scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
344 "bins", SDDS_LONG, &(outlierControl.unpopularBins), 1, OUTLIER_UNPOPULAR_BINS, NULL) ||
345 !(tmpFlags & OUTLIER_UNPOPULAR_BINS) || outlierControl.unpopularBins < 2)
346 SDDS_Bomb("invalid -unpopular syntax");
347 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
348 break;
349 default:
350 fprintf(stderr, "Error: Unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
351 exit(EXIT_FAILURE);
352 break;
353 }
354 } else {
355 if (!input)
356 input = scanned[iArg].list[0];
357 else if (!output)
358 output = scanned[iArg].list[0];
359 else
360 SDDS_Bomb("too many filenames seen");
361 }
362 }
363 if (outlierControl.flags & OUTLIER_REPLACEFLAGS && outlierControl.flags & OUTLIER_MARKONLY)
364 SDDS_Bomb("Cannot use -replaceOnly and -markOnly simultaneously.");
365
366 processFilenames("sddsoutlier", &input, &output, pipeFlags, noWarnings, &tmpfileUsed);
367
368 if (!(outlierControl.flags & OUTLIER_CONTROL_INVOKED)) {
369 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
370 outlierControl.stDevLimit = 2;
371 }
372
373 if (!SDDS_InitializeInput(&SDDSin, input) ||
374 !SDDS_InitializeCopy(&SDDSout, &SDDSin, output, "w"))
375 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
376
377 if (columnMajorOrder != -1)
378 SDDSout.layout.data_mode.column_major = columnMajorOrder;
379 else
380 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
381
382 if (outlierControl.flags & OUTLIER_MARKONLY &&
383 (isOutlierIndex = SDDS_GetColumnIndex(&SDDSout, "IsOutlier")) < 0) {
384 if (!SDDS_DefineColumn(&SDDSout, "IsOutlier", NULL, NULL, NULL, NULL, SDDS_SHORT, 0))
385 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
386 }
387
388 if (!SDDS_WriteLayout(&SDDSout))
389 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
390
391 if ((columns = expandColumnPairNames(&SDDSout, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
392 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
393 SDDS_Bomb("No columns selected for outlier control.");
394 }
395
396 isOutlier = NULL;
397 while ((readCode = SDDS_ReadPage(&SDDSin)) > 0) {
398 if (!SDDS_CopyPage(&SDDSout, &SDDSin))
399 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
400
401 if ((rows = SDDS_CountRowsOfInterest(&SDDSout)) < 3) {
402 if (!SDDS_WritePage(&SDDSout))
403 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
404 continue;
405 }
406 if (outlierControl.flags & OUTLIER_MARKONLY) {
407 if (isOutlierIndex >= 0) {
408 if (!(isOutlier = SDDS_GetNumericColumn(&SDDSout, "IsOutlier", SDDS_LONG)))
409 SDDS_Bomb("Unable to retrieve 'IsOutlier' column from input file despite its existence.");
410 } else {
411 long i;
412 isOutlier = SDDS_Realloc(isOutlier, sizeof(*isOutlier) * rows);
413 if (!isOutlier)
414 SDDS_Bomb("Memory allocation failure.");
415 for (i = 0; i < rows; i++)
416 isOutlier[i] = 0;
417 }
418 }
419 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
420 fprintf(stderr, "%" PRId64 " rows in page %ld\n", rows, readCode);
421 if ((rows = removeOutliers(&SDDSout, rows, column, columns, &outlierControl, isOutlier)) == 0) {
422 if (!noWarnings)
423 fprintf(stderr, " No rows left after outlier control--skipping page.\n");
424 continue;
425 }
426 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
427 fprintf(stderr, "%" PRId64 " rows left after outlier control\n", rows);
428 if (rows != SDDS_CountRowsOfInterest(&SDDSout)) {
429 fprintf(stderr, "Problem with row selection:\n %" PRId64 " expected, %" PRId64 " counted\n", rows, SDDS_CountRowsOfInterest(&SDDSout));
430 exit(EXIT_FAILURE);
431 }
432 if ((isOutlier && !SDDS_SetColumnFromLongs(&SDDSout, SDDS_SET_BY_NAME, isOutlier, rows, "IsOutlier")) ||
433 !SDDS_WritePage(&SDDSout))
434 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
435 }
436 if (!SDDS_Terminate(&SDDSin) || !SDDS_Terminate(&SDDSout))
437 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
438 if (tmpfileUsed && !replaceFileAndBackUp(input, output))
439 exit(EXIT_FAILURE);
440 return EXIT_SUCCESS;
441}
442
443int64_t removeOutliers(SDDS_DATASET *dataset, int64_t rows, char **column, long columns, OUTLIER_CONTROL *outlierControl, int32_t *isOutlier) {
444 long icol, ipass;
445 int64_t irow, kept, killed, j, k, summed;
446 double *data, sum1, stDev, mean;
447 static int32_t *keep = NULL;
448 double lastGoodValue = 0;
449 int64_t irow0, irow1;
450
451 if (!SDDS_SetRowFlags(dataset, 1))
452 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
453
454 /* Allocate or reallocate the keep array */
455 keep = SDDS_Realloc(keep, sizeof(*keep) * rows);
456 if (!keep)
457 SDDS_Bomb("Memory allocation failure.");
458
459 if (!isOutlier) {
460 for (irow = 0; irow < rows; irow++)
461 keep[irow] = 1;
462 kept = rows;
463 } else {
464 for (irow = kept = 0; irow < rows; irow++)
465 if ((keep[irow] = !isOutlier[irow]))
466 kept++;
467 }
468
469 for (icol = 0; icol < columns; icol++) {
470 /* Loop over columns for which outlier control is to be done */
471 data = SDDS_GetColumnInDoubles(dataset, column[icol]);
472 if (!data)
473 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
474
475 for (ipass = 0; ipass < outlierControl->passes; ipass++) {
476 if (outlierControl->flags & OUTLIER_UNPOPULAR_BINS && rows > 1) {
477 double *hist, lo, hi, delta;
478 long imin, imax, ih;
479 hist = tmalloc(sizeof(*hist) * outlierControl->unpopularBins);
480 find_min_max(&lo, &hi, data, rows);
481 make_histogram(hist, outlierControl->unpopularBins, lo, hi, data, rows, 1);
482 delta = (hi - lo) / outlierControl->unpopularBins; /* yes, not bins-1 */
483 index_min_max(&imin, &imax, hist, outlierControl->unpopularBins);
484 for (irow = killed = 0; irow < rows; irow++) {
485 ih = (data[irow] - lo) / delta;
486 if (ih != imax) {
487 killed++;
488 kept--;
489 keep[irow] = 0;
490 }
491 }
492 free(hist);
493 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
494 fprintf(stderr, "%" PRId64 " additional rows killed by column %s unpopular control\n", killed, column[icol]);
495 }
496
497 if (outlierControl->flags & OUTLIER_PERCENTILE_FLAGS) {
498 double percentileResult[2];
499 killed = 0;
500 if (compute_percentiles(percentileResult, outlierControl->percentilePoint, 2, data, rows)) {
501 for (irow = killed = 0; irow < rows; irow++) {
502 if ((data[irow] < percentileResult[0] || data[irow] > percentileResult[1]) && keep[irow]) {
503 killed++;
504 kept--;
505 keep[irow] = 0;
506 }
507 }
508 }
509 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
510 fprintf(stderr, "%" PRId64 " additional rows killed by column %s percentile outlier control\n", killed, column[icol]);
511 }
512
513 if (outlierControl->flags & OUTLIER_MINLIMIT_GIVEN) {
514 /* Limit values to exceed a minimum */
515 for (irow = killed = 0; irow < rows; irow++) {
516 if (keep[irow] && data[irow] < outlierControl->minimumLimit) {
517 kept--;
518 keep[irow] = 0;
519 killed++;
520 }
521 }
522 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
523 fprintf(stderr, "%" PRId64 " additional rows killed by column %s minimum value outlier control\n", killed, column[icol]);
524 }
525
526 if (outlierControl->flags & OUTLIER_MAXLIMIT_GIVEN) {
527 /* Limit values to exceed a maximum */
528 for (irow = killed = 0; irow < rows; irow++) {
529 if (keep[irow] && data[irow] > outlierControl->maximumLimit) {
530 kept--;
531 keep[irow] = 0;
532 killed++;
533 }
534 }
535 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
536 fprintf(stderr, "%" PRId64 " additional rows killed by column %s maximum value outlier control\n", killed, column[icol]);
537 }
538
539 if (outlierControl->flags & OUTLIER_ABSLIMIT_GIVEN) {
540 /* Limit absolute values */
541 for (irow = killed = 0; irow < rows; irow++) {
542 if (keep[irow] && fabs(data[irow]) > outlierControl->absoluteLimit) {
543 kept--;
544 keep[irow] = 0;
545 killed++;
546 }
547 }
548 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
549 fprintf(stderr, "%" PRId64 " additional rows killed by column %s absolute value outlier control\n", killed, column[icol]);
550 }
551
552 if (outlierControl->flags & OUTLIER_ABSDEVLIMIT_GIVEN) {
553 /* Limit absolute deviation from mean */
554 if (outlierControl->neighbors > 0) {
555 for (irow = killed = 0; irow < rows; irow++) {
556 if (!keep[irow])
557 continue;
558 mean = 0;
559 for (j = irow - outlierControl->neighbors / 2; j <= irow + outlierControl->neighbors / 2; j++) {
560 if (j < 0)
561 k = irow + outlierControl->neighbors / 2 - j;
562 else if (j > rows - 1)
563 k = irow - outlierControl->neighbors / 2 - (j - rows + 1);
564 else
565 k = j;
566 mean += fabs(data[k]);
567 }
568 mean = mean / outlierControl->neighbors;
569 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
570 keep[irow] = 0;
571 kept--;
572 killed++;
573 }
574 }
575 } else {
576 for (irow = sum1 = summed = 0; irow < rows; irow++) {
577 if (!keep[irow])
578 continue;
579 sum1 += data[irow];
580 summed++;
581 }
582 if (summed < 1)
583 continue;
584 mean = sum1 / summed;
585 for (irow = killed = 0; irow < rows; irow++)
586 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
587 keep[irow] = 0;
588 kept--;
589 killed++;
590 }
591 }
592 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
593 fprintf(stderr, "%" PRId64 " additional rows killed by column %s absolute deviation outlier control\n", killed, column[icol]);
594 }
595
596 if (outlierControl->flags & OUTLIER_STDEV_GIVEN && kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
597 /* Limit deviation from mean in units of standard deviations */
598 for (irow = killed = 0; irow < rows; irow++)
599 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->stDevLimit * stDev) {
600 keep[irow] = 0;
601 kept--;
602 killed++;
603 }
604 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
605 fprintf(stderr, "%" PRId64 " additional rows killed by column %s standard deviation outlier control\n", killed, column[icol]);
606 }
607
608 if (outlierControl->flags & OUTLIER_CHANCELIMIT_GIVEN) {
609 /* Limit improbability of a point occurring based on Gaussian statistics */
610 if (kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
611 int64_t lastKept;
612 double gProb, probOfSeeing;
613 lastKept = kept;
614 for (irow = killed = 0; irow < rows; irow++) {
615 if (!keep[irow])
616 continue;
617 /* gProb = (probability of value >= x) */
618 /* (1-gProb)^n = probability of not seeing x in n trials */
619 gProb = normSigLevel((data[irow] - mean) / stDev, 2);
620 probOfSeeing = 1 - ipow(1 - gProb, lastKept);
621 if (probOfSeeing < outlierControl->chanceLimit) {
622 keep[irow] = 0;
623 kept--;
624 killed++;
625 }
626 }
627 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
628 fprintf(stderr, "%" PRId64 " additional rows killed by column %s chance limit outlier control\n", killed, column[icol]);
629 }
630 }
631 }
632
633 if (outlierControl->flags & OUTLIER_REPLACEFLAGS && (outlierControl->flags & OUTLIER_INVERT_GIVEN)) {
634 for (irow = 0; irow < rows; irow++)
635 keep[irow] = !keep[irow];
636 kept = rows - kept;
637 }
638
639 if (outlierControl->flags & OUTLIER_REPLACELAST) {
640 for (irow = 0; irow < rows; irow++) {
641 if (keep[irow]) {
642 lastGoodValue = data[irow];
643 break;
644 }
645 }
646 for (irow = 0; irow < rows; irow++) {
647 if (!keep[irow]) {
648 keep[irow] = 1;
649 data[irow] = lastGoodValue;
650 } else
651 lastGoodValue = data[irow];
652 }
653 kept = rows;
654 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
655 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
656 } else if (outlierControl->flags & OUTLIER_REPLACENEXT) {
657 for (irow = rows - 1; irow >= 0; irow--) {
658 if (keep[irow]) {
659 lastGoodValue = data[irow];
660 break;
661 }
662 }
663 for (irow = rows - 1; irow >= 0; irow--) {
664 if (!keep[irow]) {
665 data[irow] = lastGoodValue;
666 keep[irow] = 1;
667 } else
668 lastGoodValue = data[irow];
669 }
670 kept = rows;
671 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
672 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
673 } else if (outlierControl->flags & OUTLIER_REPLACEINTERP) {
674 irow0 = -1;
675 irow1 = -1;
676 for (irow = 0; irow < rows; irow++) {
677 if (!keep[irow]) {
678 if ((irow0 = irow - 1) >= 0) {
679 if ((irow1 = irow + 1) < rows) {
680 while (irow1 < rows && !keep[irow1])
681 irow1++;
682 }
683 if (irow1 < rows && keep[irow1]) {
684 /* irow is bracketed by irow0 and irow1, both of which have good data */
685 if (!keep[irow0])
686 SDDS_Bomb("Bracketing problem.");
687 for (; irow < irow1; irow++)
688 data[irow] = data[irow0] + (data[irow1] - data[irow0]) / (1.0 * irow1 - irow0) * (irow - irow0);
689 continue;
690 } else {
691 /* Ran off the end with bad points---just duplicate the last good point */
692 for (; irow < rows; irow++)
693 data[irow] = data[irow0];
694 continue;
695 }
696 } else {
697 /* No good point precedes this point---look for a good point following it */
698 for (irow1 = irow + 1; irow1 < rows; irow1++) {
699 if (keep[irow1])
700 break;
701 }
702 if (irow1 != rows) {
703 for (; irow < irow1; irow++)
704 data[irow] = data[irow1];
705 continue;
706 }
707 }
708 }
709 }
710 for (irow = 0; irow < rows; irow++)
711 keep[irow] = 1;
712 kept = rows;
713 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
714 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
715 } else if (outlierControl->flags & OUTLIER_REPLACEVALUE) {
716 for (irow = 0; irow < rows; irow++) {
717 if (!keep[irow]) {
718 data[irow] = outlierControl->replacementValue;
719 keep[irow] = 1;
720 }
721 }
722 kept = rows;
723 if (!SDDS_SetColumnFromDoubles(dataset, SDDS_SET_BY_NAME, data, rows, column[icol]))
724 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
725 }
726 free(data);
727 }
728
729 if (outlierControl->flags & OUTLIER_INVERT_GIVEN) {
730 for (irow = 0; irow < rows; irow++)
731 keep[irow] = !keep[irow];
732 kept = rows - kept;
733 if (outlierControl->flags & OUTLIER_VERBOSE_GIVEN)
734 fprintf(stderr, "%" PRId64 " rows left after inversion\n", kept);
735 }
736
737 if (isOutlier && (outlierControl->flags & OUTLIER_MARKONLY)) {
738 for (irow = 0; irow < rows; irow++)
739 isOutlier[irow] = !keep[irow];
740 if (!SDDS_SetRowFlags(dataset, 1))
741 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
742 return rows;
743 }
744
745 if (!SDDS_AssertRowFlags(dataset, SDDS_FLAG_ARRAY, keep, rows))
746 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
747 return kept;
748}
749
750long meanStDevForFlaggedData(double *mean, double *stDev, double *data, int32_t *keep, int64_t rows) {
751 int64_t irow, summed;
752 double sum1, sum2, value;
753
754 *mean = *stDev = 0;
755 for (irow = sum1 = summed = 0; irow < rows; irow++) {
756 if (!keep[irow])
757 continue;
758 sum1 += data[irow];
759 summed++;
760 }
761 if (summed < 2)
762 return 0;
763 *mean = sum1 / summed;
764 for (irow = sum2 = 0; irow < rows; irow++) {
765 if (keep[irow]) {
766 value = data[irow] - *mean;
767 sum2 += value * value;
768 }
769 }
770 *stDev = sqrt(sum2 / (summed - 1));
771 return 1;
772}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
Definition SDDS_copy.c:40
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
Definition SDDS_copy.c:578
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_SetColumnFromLongs(SDDS_DATASET *SDDS_dataset, int32_t mode, int32_t *data, int64_t rows,...)
Sets the values for a single data column using long integer numbers.
int32_t SDDS_AssertRowFlags(SDDS_DATASET *SDDS_dataset, uint32_t mode,...)
Sets acceptance flags for rows based on specified criteria.
void * SDDS_GetNumericColumn(SDDS_DATASET *SDDS_dataset, char *column_name, int32_t desiredType)
Retrieves the data of a specified numerical column as an array of a desired numerical type,...
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
int32_t SDDS_SetRowFlags(SDDS_DATASET *SDDS_dataset, int32_t row_flag_value)
Sets the acceptance flags for all rows in the current data table of a data set.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_SHORT
Identifier for the signed short integer data type.
Definition SDDStypes.h:73
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
Definition array.c:59
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
int index_min_max(int64_t *imin, int64_t *imax, double *list, int64_t n)
Finds the indices of the minimum and maximum values in a list of doubles.
Definition findMinMax.c:116
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
double ipow(const double x, const int64_t p)
Compute x raised to the power p (x^p).
Definition ipow.c:33
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long compute_percentiles(double *position, double *percent, long positions, double *x, long n)
Computes multiple percentiles of an array of doubles.
Definition median.c:84
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
double normSigLevel(double z0, long tails)
Computes the probability that a standard normal variable exceeds a given value.
Definition sigLevel.c:40