102 SET_PERCENTILE_LIMIT,
107char *option[N_OPTIONS] = {
129 "sddsoutlier [<inputfile>] [<outputfile>]\n" \
130 " [-pipe=[input][,output]]\n" \
132 " [-noWarnings] \n" \
133 " -columns=<list-of-names>\n" \
134 " [-excludeColumns=<list-of-names>]\n" \
135 " -stDevLimit=<value>\n" \
136 " [-absLimit=<value>] \n" \
137 " [-absDeviationLimit=<value>[,neighbor=<number>]]\n" \
138 " [-maximumLimit=<value>] \n" \
139 " [-minimumLimit=<value>]\n" \
140 " [-chanceLimit=<minimumChance>] \n" \
141 " [-passes=<number>]\n" \
142 " [-percentileLimit=lower=<lowerPercent>,upper=<upperPercent>]\n" \
143 " [-unpopular=bins=<number>]\n" \
145 " [-majorOrder] \n" \
147 " [-replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>}]\n" \
149 " -pipe=[input][,output]\n" \
150 " Use standard input and/or output as data streams.\n" \
152 " Enable verbose output, displaying processing information.\n" \
154 " Suppress warning messages.\n" \
155 " -columns=<list-of-names>\n" \
156 " Specify a comma-separated list of column names to process.\n" \
157 " -excludeColumns=<list-of-names>\n" \
158 " Specify a comma-separated list of column names to exclude from processing.\n" \
159 " -stDevLimit=<value>\n" \
160 " Point is an outlier if it is more than <value> standard deviations from the mean.\n" \
161 " -absLimit=<value>\n" \
162 " Point is an outlier if it has an absolute value greater than <value>.\n" \
163 " -absDeviationLimit=<value>[,neighbor=<number>]\n" \
164 " Point is an outlier if its absolute deviation from the mean exceeds <value>.\n" \
165 " If neighbor is provided, the mean is computed with the neighbors instead of the whole data.\n" \
166 " -minimumLimit=<value>\n" \
167 " Point is an outlier if it is less than <value>.\n" \
168 " -maximumLimit=<value>\n" \
169 " Point is an outlier if it is greater than <value>.\n" \
170 " -chanceLimit=<minimumChance>\n" \
171 " Point is an outlier if it has a probability less than <minimumChance> of occurring (Gaussian statistics).\n" \
172 " -percentileLimit=lower=<lowerPercent>,upper=<upperPercent>\n" \
173 " Point is an outlier if it is below the <lowerPercent> percentile or above the <upperPercent> percentile.\n" \
174 " -unpopular=bins=<number>\n" \
175 " Remove points that are not in the most populated bin based on a histogram with <number> bins.\n" \
177 " Invert the outlier selection criteria.\n" \
178 " -majorOrder=row|column\n" \
179 " Specify output file data ordering as row or column major.\n" \
181 " Mark identified outliers without removing them.\n" \
182 " -replaceOnly={lastValue|nextValue|interpolatedValue|value=<number>}\n" \
183 " Replace outliers with specified values or strategies.\n" \
184 " -passes=<number>\n" \
185 " Define the number of passes for outlier detection.\n\n" \
186 "Program by Michael Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"
/*
 * Bit flags OR'ed together in outlierControl.flags.
 *
 * main() sets them while parsing the command line and removeOutliers()
 * tests them to decide which criteria to apply.  Each value is a distinct
 * single bit so that the flags can be combined and masked freely.
 */
188#define OUTLIER_CONTROL_INVOKED 0x00001U /* set alongside every criterion/replace flag; if still clear after parsing, main() falls back to stDevLimit = 2 */
189#define OUTLIER_STDEV_GIVEN 0x00002U /* standard-deviation test enabled: |x - mean| > stDevLimit * stDev */
190#define OUTLIER_FRACTION_GIVEN 0x00004U /* NOTE(review): not referenced in the visible part of this file -- confirm whether it is still used */
191#define OUTLIER_STDEVLIMIT_GIVEN 0x00008U /* always set together with OUTLIER_STDEV_GIVEN in the visible code */
192#define OUTLIER_UNPOPULAR_BINS 0x00010U /* returned by scanItemList for the "bins" keyword; enables the histogram-based test */
193#define OUTLIER_VERBOSE_GIVEN 0x00020U /* gates the progress messages written to stderr */
194#define OUTLIER_ABSLIMIT_GIVEN 0x00040U /* absolute-value test: fabs(x) > absoluteLimit */
195#define OUTLIER_ABSDEVLIMIT_GIVEN 0x00080U /* absolute-deviation test: fabs(x - mean) > absDevLimit */
196#define OUTLIER_INVERT_GIVEN 0x00100U /* keep[] is logically inverted after the criteria have been applied */
197#define OUTLIER_MARKONLY 0x00200U /* results are written back to isOutlier[]; rejected in main() when combined with any replace flag */
198#define OUTLIER_CHANCELIMIT_GIVEN 0x00400U /* probability-based test against chanceLimit */
199#define OUTLIER_MAXLIMIT_GIVEN 0x00800U /* upper bound test: x > maximumLimit */
200#define OUTLIER_MINLIMIT_GIVEN 0x01000U /* lower bound test: x < minimumLimit */
/* Replacement strategies; returned by scanItemList for the keywords
 * "lastvalue", "nextvalue", "interpolatedvalue" and "value" respectively. */
201#define OUTLIER_REPLACELAST 0x02000U
202#define OUTLIER_REPLACENEXT 0x04000U
203#define OUTLIER_REPLACEINTERP 0x08000U
204#define OUTLIER_REPLACEVALUE 0x10000U
/* Mask matching any of the replacement strategies above. */
205#define OUTLIER_REPLACEFLAGS (OUTLIER_REPLACELAST | OUTLIER_REPLACENEXT | OUTLIER_REPLACEINTERP | OUTLIER_REPLACEVALUE)
/* Percentile bounds; returned by scanItemList for the keywords "lower" and
 * "upper".  The option parser requires both to be present. */
206#define OUTLIER_PERCENTILE_LOWER 0x20000U
207#define OUTLIER_PERCENTILE_UPPER 0x40000U
/* Mask matching either percentile bound. */
208#define OUTLIER_PERCENTILE_FLAGS (OUTLIER_PERCENTILE_LOWER | OUTLIER_PERCENTILE_UPPER)
212 double stDevLimit, fractionLimit, absoluteLimit, absDevLimit;
213 double chanceLimit, replacementValue, maximumLimit, minimumLimit;
214 double percentilePoint[2];
216 int32_t unpopularBins;
/*
 * Apply the outlier criteria enabled in outlierControl->flags to one page.
 *
 * The definition loops over every listed column and, for each column, over
 * outlierControl->passes passes, testing each enabled criterion in turn.
 *
 *   SDDSin          dataset holding the page to be processed
 *   rows            number of rows in the page
 *   column          names of the columns to examine
 *   columns         number of entries in column[]
 *   outlierControl  limits, replacement settings and OUTLIER_* flags
 *   isOutlier       per-row flags; on entry a nonzero entry marks a row that
 *                   is not kept (keep = !isOutlier).  When the pointer is
 *                   non-NULL and OUTLIER_MARKONLY is set, it is overwritten
 *                   with !keep for every row.
 *
 * NOTE(review): the return statement is not visible here; the caller treats
 * the result as the number of rows left after outlier control and skips the
 * page when it is 0 -- confirm against the definition.
 */
221int64_t removeOutliers(
SDDS_DATASET *SDDSin, int64_t rows,
char **column,
long columns,
OUTLIER_CONTROL *outlierControl, int32_t *isOutlier);
/*
 * Compute the mean and standard deviation of data[0..rows-1].
 *
 *   mean   output: sum1 / summed
 *   stDev  output: sqrt(sum2 / (summed - 1)), where sum2 accumulates the
 *          squared deviations from *mean (n-1 divisor)
 *   data   values to analyse
 *   keep   per-row flags -- presumably only rows with a nonzero entry are
 *          included in the sums; TODO confirm against the definition
 *   rows   number of entries in data[] and keep[]
 *
 * Callers use the return value as a truth value before relying on *mean and
 * *stDev.  NOTE(review): the exact failure conditions are not visible here.
 */
222long meanStDevForFlaggedData(
double *mean,
double *stDev,
double *data, int32_t *keep, int64_t rows);
224int main(
int argc,
char **argv) {
226 char **column, **excludeColumn;
227 long columns, excludeColumns;
228 char *input, *output;
229 SCANNED_ARG *scanned;
231 long readCode, dataLimitGiven, tmpfileUsed;
233 long noWarnings, isOutlierIndex;
236 unsigned long pipeFlags, tmpFlags, majorOrderFlag, dummyFlags;
237 short columnMajorOrder = -1;
240 argc =
scanargs(&scanned, argc, argv);
245 output = input = NULL;
246 columns = excludeColumns = dataLimitGiven = 0;
247 column = excludeColumn = NULL;
249 outlierControl.flags = 0;
250 outlierControl.passes = 1;
251 outlierControl.neighbors = 0;
252 pipeFlags = tmpfileUsed = noWarnings = isOutlierIndex = 0;
254 for (iArg = 1; iArg < argc; iArg++) {
255 if (scanned[iArg].arg_type == OPTION) {
257 switch (
match_string(scanned[iArg].list[0], option, N_OPTIONS, 0)) {
258 case SET_MAJOR_ORDER:
260 scanned[iArg].n_items--;
261 if (scanned[iArg].n_items > 0 &&
262 (!
scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
263 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
264 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
265 SDDS_Bomb(
"invalid -majorOrder syntax/values");
266 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
267 columnMajorOrder = 1;
268 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
269 columnMajorOrder = 0;
273 SDDS_Bomb(
"only one -columns option may be given");
274 if (scanned[iArg].n_items < 2)
276 column =
tmalloc(
sizeof(*column) * (columns = scanned[iArg].n_items - 1));
277 for (i = 0; i < columns; i++)
278 column[i] = scanned[iArg].list[i + 1];
282 SDDS_Bomb(
"only one -excludecolumns option may be given");
283 if (scanned[iArg].n_items < 2)
284 SDDS_Bomb(
"invalid -excludecolumns syntax");
285 excludeColumn =
tmalloc(
sizeof(*excludeColumn) * (excludeColumns = scanned[iArg].n_items - 1));
286 for (i = 0; i < excludeColumns; i++)
287 excludeColumn[i] = scanned[iArg].list[i + 1];
289 case SET_STDDEV_LIMIT:
290 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.stDevLimit) != 1 ||
291 outlierControl.stDevLimit <= 0)
293 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
296 if (scanned[iArg].n_items != 2 || sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.absoluteLimit) != 1 ||
297 outlierControl.absoluteLimit <= 0)
299 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSLIMIT_GIVEN;
301 case SET_ABSDEV_LIMIT:
302 if (scanned[iArg].n_items < 2)
303 SDDS_Bomb(
"invalid -absDeviationLimit syntax");
304 if (scanned[iArg].n_items == 2) {
305 if (sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
306 SDDS_Bomb(
"invalid -absDeviationLimit syntax");
308 if (sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.absDevLimit) != 1 || outlierControl.absDevLimit <= 0)
309 SDDS_Bomb(
"invalid -absDeviationLimit syntax");
310 scanned[iArg].list += 2;
311 scanned[iArg].n_items -= 2;
312 if (scanned[iArg].n_items > 0 &&
313 (!
scanItemList(&dummyFlags, scanned[iArg].list, &scanned[iArg].n_items, 0,
"neighbors",
SDDS_LONG, &(outlierControl.neighbors), 1, 0, NULL)))
314 SDDS_Bomb(
"invalid -absDeviationLimit syntax/value");
315 if (outlierControl.neighbors % 2 == 0)
316 outlierControl.neighbors += 1;
318 scanned[iArg].list -= 2;
319 scanned[iArg].n_items += 2;
321 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_ABSDEVLIMIT_GIVEN;
324 outlierControl.flags |= OUTLIER_VERBOSE_GIVEN;
327 if (!
processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
334 outlierControl.flags |= OUTLIER_INVERT_GIVEN;
337 outlierControl.flags |= OUTLIER_MARKONLY;
339 case SET_CHANCELIMIT:
340 if (scanned[iArg].n_items != 2 ||
341 sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.chanceLimit) != 1 ||
342 outlierControl.chanceLimit <= 0)
343 SDDS_Bomb(
"invalid -chanceLimit syntax");
344 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_CHANCELIMIT_GIVEN;
347 if (scanned[iArg].n_items != 2 ||
348 sscanf(scanned[iArg].list[1],
"%ld", &outlierControl.passes) != 1 ||
349 outlierControl.passes < 1)
353 outlierControl.flags |= OUTLIER_MAXLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
354 if (scanned[iArg].n_items != 2 ||
355 sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.maximumLimit) != 1)
356 SDDS_Bomb(
"invalid -maximumLimit syntax");
359 outlierControl.flags |= OUTLIER_MINLIMIT_GIVEN | OUTLIER_CONTROL_INVOKED;
360 if (scanned[iArg].n_items != 2 ||
361 sscanf(scanned[iArg].list[1],
"%lf", &outlierControl.minimumLimit) != 1)
362 SDDS_Bomb(
"invalid -minimumLimit syntax");
365 if (scanned[iArg].n_items != 2)
367 scanned[iArg].n_items -= 1;
368 if (!
scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
369 "lastvalue", -1, NULL, 0, OUTLIER_REPLACELAST,
370 "nextvalue", -1, NULL, 0, OUTLIER_REPLACENEXT,
371 "interpolatedvalue", -1, NULL, 0, OUTLIER_REPLACEINTERP,
372 "value",
SDDS_DOUBLE, &outlierControl.replacementValue, 1, OUTLIER_REPLACEVALUE, NULL))
373 SDDS_Bomb(
"invalid -replace syntax/values");
374 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
376 case SET_PERCENTILE_LIMIT:
377 if (scanned[iArg].n_items < 3)
378 SDDS_Bomb(
"invalid -percentileLimit syntax");
379 scanned[iArg].n_items -= 1;
380 if (!
scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
381 "lower",
SDDS_DOUBLE, &outlierControl.percentilePoint[0], 1, OUTLIER_PERCENTILE_LOWER,
382 "upper",
SDDS_DOUBLE, &outlierControl.percentilePoint[1], 1, OUTLIER_PERCENTILE_UPPER, NULL) ||
383 !(tmpFlags & OUTLIER_PERCENTILE_LOWER) || !(tmpFlags & OUTLIER_PERCENTILE_UPPER) ||
384 outlierControl.percentilePoint[0] >= outlierControl.percentilePoint[1])
385 SDDS_Bomb(
"invalid -percentileLimit syntax");
386 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
389 if (scanned[iArg].n_items < 2)
391 scanned[iArg].n_items -= 1;
392 if (!
scanItemList(&tmpFlags, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
393 "bins",
SDDS_LONG, &(outlierControl.unpopularBins), 1, OUTLIER_UNPOPULAR_BINS, NULL) ||
394 !(tmpFlags & OUTLIER_UNPOPULAR_BINS) || outlierControl.unpopularBins < 2)
396 outlierControl.flags |= tmpFlags | OUTLIER_CONTROL_INVOKED;
399 fprintf(stderr,
"Error: Unknown or ambiguous option: %s\n", scanned[iArg].list[0]);
405 input = scanned[iArg].list[0];
407 output = scanned[iArg].list[0];
412 if (outlierControl.flags & OUTLIER_REPLACEFLAGS && outlierControl.flags & OUTLIER_MARKONLY)
413 SDDS_Bomb(
"Cannot use -replaceOnly and -markOnly simultaneously.");
415 processFilenames(
"sddsoutlier", &input, &output, pipeFlags, noWarnings, &tmpfileUsed);
417 if (!(outlierControl.flags & OUTLIER_CONTROL_INVOKED)) {
418 outlierControl.flags |= OUTLIER_CONTROL_INVOKED | OUTLIER_STDEV_GIVEN | OUTLIER_STDEVLIMIT_GIVEN;
419 outlierControl.stDevLimit = 2;
426 if (columnMajorOrder != -1)
427 SDDSout.layout.data_mode.column_major = columnMajorOrder;
429 SDDSout.layout.data_mode.column_major = SDDSin.layout.data_mode.column_major;
431 if (outlierControl.flags & OUTLIER_MARKONLY &&
440 if ((columns = expandColumnPairNames(&SDDSout, &column, NULL, columns, excludeColumn, excludeColumns, FIND_NUMERIC_TYPE, 0)) <= 0) {
442 SDDS_Bomb(
"No columns selected for outlier control.");
455 if (outlierControl.flags & OUTLIER_MARKONLY) {
456 if (isOutlierIndex >= 0) {
458 SDDS_Bomb(
"Unable to retrieve 'IsOutlier' column from input file despite its existence.");
461 isOutlier =
SDDS_Realloc(isOutlier,
sizeof(*isOutlier) * rows);
464 for (i = 0; i < rows; i++)
468 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
469 fprintf(stderr,
"%" PRId64
" rows in page %ld\n", rows, readCode);
470 if ((rows = removeOutliers(&SDDSout, rows, column, columns, &outlierControl, isOutlier)) == 0) {
472 fprintf(stderr,
" No rows left after outlier control--skipping page.\n");
475 if (outlierControl.flags & OUTLIER_VERBOSE_GIVEN)
476 fprintf(stderr,
"%" PRId64
" rows left after outlier control\n", rows);
478 fprintf(stderr,
"Problem with row selection:\n %" PRId64
" expected, %" PRId64
" counted\n", rows,
SDDS_CountRowsOfInterest(&SDDSout));
492int64_t removeOutliers(
SDDS_DATASET *dataset, int64_t rows,
char **column,
long columns,
OUTLIER_CONTROL *outlierControl, int32_t *isOutlier) {
494 int64_t irow, kept, killed, j, k, summed;
495 double *data, sum1, stDev, mean;
496 static int32_t *keep = NULL;
497 double lastGoodValue = 0;
498 int64_t irow0, irow1;
509 for (irow = 0; irow < rows; irow++)
513 for (irow = kept = 0; irow < rows; irow++)
514 if ((keep[irow] = !isOutlier[irow]))
518 for (icol = 0; icol < columns; icol++) {
524 for (ipass = 0; ipass < outlierControl->passes; ipass++) {
525 if (outlierControl->flags & OUTLIER_UNPOPULAR_BINS && rows > 1) {
526 double *hist, lo, hi, delta;
527 int64_t imin, imax, ih;
528 hist =
tmalloc(
sizeof(*hist) * outlierControl->unpopularBins);
530 make_histogram(hist, outlierControl->unpopularBins, lo, hi, data, rows, 1);
531 delta = (hi - lo) / outlierControl->unpopularBins;
532 index_min_max(&imin, &imax, hist, outlierControl->unpopularBins);
533 for (irow = killed = 0; irow < rows; irow++) {
534 ih = (data[irow] - lo) / delta;
542 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
543 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s unpopular control\n", killed, column[icol]);
546 if (outlierControl->flags & OUTLIER_PERCENTILE_FLAGS) {
547 double percentileResult[2];
549 if (
compute_percentiles(percentileResult, outlierControl->percentilePoint, 2, data, rows)) {
550 for (irow = killed = 0; irow < rows; irow++) {
551 if ((data[irow] < percentileResult[0] || data[irow] > percentileResult[1]) && keep[irow]) {
558 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
559 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s percentile outlier control\n", killed, column[icol]);
562 if (outlierControl->flags & OUTLIER_MINLIMIT_GIVEN) {
564 for (irow = killed = 0; irow < rows; irow++) {
565 if (keep[irow] && data[irow] < outlierControl->minimumLimit) {
571 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
572 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s minimum value outlier control\n", killed, column[icol]);
575 if (outlierControl->flags & OUTLIER_MAXLIMIT_GIVEN) {
577 for (irow = killed = 0; irow < rows; irow++) {
578 if (keep[irow] && data[irow] > outlierControl->maximumLimit) {
584 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
585 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s maximum value outlier control\n", killed, column[icol]);
588 if (outlierControl->flags & OUTLIER_ABSLIMIT_GIVEN) {
590 for (irow = killed = 0; irow < rows; irow++) {
591 if (keep[irow] && fabs(data[irow]) > outlierControl->absoluteLimit) {
597 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
598 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s absolute value outlier control\n", killed, column[icol]);
601 if (outlierControl->flags & OUTLIER_ABSDEVLIMIT_GIVEN) {
603 if (outlierControl->neighbors > 0) {
604 for (irow = killed = 0; irow < rows; irow++) {
608 for (j = irow - outlierControl->neighbors / 2; j <= irow + outlierControl->neighbors / 2; j++) {
610 k = irow + outlierControl->neighbors / 2 - j;
611 else if (j > rows - 1)
612 k = irow - outlierControl->neighbors / 2 - (j - rows + 1);
615 mean += fabs(data[k]);
617 mean = mean / outlierControl->neighbors;
618 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
625 for (irow = sum1 = summed = 0; irow < rows; irow++) {
633 mean = sum1 / summed;
634 for (irow = killed = 0; irow < rows; irow++)
635 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->absDevLimit) {
641 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
642 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s absolute deviation outlier control\n", killed, column[icol]);
645 if (outlierControl->flags & OUTLIER_STDEV_GIVEN && kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
647 for (irow = killed = 0; irow < rows; irow++)
648 if (keep[irow] && fabs(data[irow] - mean) > outlierControl->stDevLimit * stDev) {
653 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
654 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s standard deviation outlier control\n", killed, column[icol]);
657 if (outlierControl->flags & OUTLIER_CHANCELIMIT_GIVEN) {
659 if (kept && meanStDevForFlaggedData(&mean, &stDev, data, keep, rows) && stDev) {
661 double gProb, probOfSeeing;
663 for (irow = killed = 0; irow < rows; irow++) {
669 probOfSeeing = 1 -
ipow(1 - gProb, lastKept);
670 if (probOfSeeing < outlierControl->chanceLimit) {
676 if (killed && (outlierControl->flags & OUTLIER_VERBOSE_GIVEN))
677 fprintf(stderr,
"%" PRId64
" additional rows killed by column %s chance limit outlier control\n", killed, column[icol]);
682 if (outlierControl->flags & OUTLIER_REPLACEFLAGS && (outlierControl->flags & OUTLIER_INVERT_GIVEN)) {
683 for (irow = 0; irow < rows; irow++)
684 keep[irow] = !keep[irow];
688 if (outlierControl->flags & OUTLIER_REPLACELAST) {
689 for (irow = 0; irow < rows; irow++) {
691 lastGoodValue = data[irow];
695 for (irow = 0; irow < rows; irow++) {
698 data[irow] = lastGoodValue;
700 lastGoodValue = data[irow];
705 }
else if (outlierControl->flags & OUTLIER_REPLACENEXT) {
706 for (irow = rows - 1; irow >= 0; irow--) {
708 lastGoodValue = data[irow];
712 for (irow = rows - 1; irow >= 0; irow--) {
714 data[irow] = lastGoodValue;
717 lastGoodValue = data[irow];
722 }
else if (outlierControl->flags & OUTLIER_REPLACEINTERP) {
725 for (irow = 0; irow < rows; irow++) {
727 if ((irow0 = irow - 1) >= 0) {
728 if ((irow1 = irow + 1) < rows) {
729 while (irow1 < rows && !keep[irow1])
732 if (irow1 < rows && keep[irow1]) {
736 for (; irow < irow1; irow++)
737 data[irow] = data[irow0] + (data[irow1] - data[irow0]) / (1.0 * irow1 - irow0) * (irow - irow0);
741 for (; irow < rows; irow++)
742 data[irow] = data[irow0];
747 for (irow1 = irow + 1; irow1 < rows; irow1++) {
752 for (; irow < irow1; irow++)
753 data[irow] = data[irow1];
759 for (irow = 0; irow < rows; irow++)
764 }
else if (outlierControl->flags & OUTLIER_REPLACEVALUE) {
765 for (irow = 0; irow < rows; irow++) {
767 data[irow] = outlierControl->replacementValue;
778 if (outlierControl->flags & OUTLIER_INVERT_GIVEN) {
779 for (irow = 0; irow < rows; irow++)
780 keep[irow] = !keep[irow];
782 if (outlierControl->flags & OUTLIER_VERBOSE_GIVEN)
783 fprintf(stderr,
"%" PRId64
" rows left after inversion\n", kept);
786 if (isOutlier && (outlierControl->flags & OUTLIER_MARKONLY)) {
787 for (irow = 0; irow < rows; irow++)
788 isOutlier[irow] = !keep[irow];
799long meanStDevForFlaggedData(
double *mean,
double *stDev,
double *data, int32_t *keep, int64_t rows) {
800 int64_t irow, summed;
801 double sum1, sum2, value;
804 for (irow = sum1 = summed = 0; irow < rows; irow++) {
812 *mean = sum1 / summed;
813 for (irow = sum2 = 0; irow < rows; irow++) {
815 value = data[irow] - *mean;
816 sum2 += value * value;
819 *stDev = sqrt(sum2 / (summed - 1));
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_InitializeCopy(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, char *filename, char *filemode)
int32_t SDDS_CopyPage(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_SetColumnFromLongs(SDDS_DATASET *SDDS_dataset, int32_t mode, int32_t *data, int64_t rows,...)
Sets the values for a single data column using 32-bit (SDDS long, int32_t) integer numbers.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
#define SDDS_SHORT
Identifier for the signed short integer data type.
#define SDDS_DOUBLE
Identifier for the double data type.
Utility functions for SDDS dataset manipulation and string array operations.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
int index_min_max(int64_t *imin, int64_t *imax, double *list, int64_t n)
Finds the indices of the minimum and maximum values in a list of doubles.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
double ipow(const double x, const int64_t p)
Compute x raised to the power p (x^p).
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
double normSigLevel(double z0, long tails)
Computes the probability that a standard normal variable exceeds a given value.