47void compareToFileDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
long columnNames,
char *distFile,
char *distFileIndep,
char *distFileDepen);
48void compareToDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
char **sigmaName,
long columnNames,
long distCode,
long degreesFree,
char *dofParameter,
short columnMajorOrder,
int threads);
49void ksTestWithFunction(
double *data, int64_t rows,
double (*CDF)(
double x),
double *statReturn,
double *sigLevelReturn);
50void chiSquaredTestWithFunction(
double *data, int64_t rows,
double (*PDF)(
double x),
double *statReturn,
double *sigLevelReturn);
53 "Usage: sddsdistest [<input>] [<output>]\n"
54 " [-pipe=[in][,out]]\n"
55 " -column=<name>[,sigma=<name>]...\n"
56 " -exclude=<name>[,...]\n"
57 " [-degreesOfFreedom={<value>|@<parameterName>}]\n"
58 " [-test={ks|chisquared}]\n"
59 " [{-fileDistribution=<filename>,<indepName>,<depenName> |\n"
60 " -gaussian | -poisson | -student | -chisquared}]\n"
61 " [-majorOrder=row|column]\n"
62 " [-threads=<number>]\n\n"
64 " Tests data columns against specified statistical distributions using the\n"
65 " Kolmogorov-Smirnov or Chi-Squared tests.\n\n"
67 " <input> Input SDDS file. If omitted, standard input is used.\n"
68 " <output> Output SDDS file. If omitted, standard output is used.\n"
69 " -pipe=[in][,out] Use pipe for input and/or output.\n"
70 " -column=<name>[,sigma=<name>]...\n"
71 " Specify one or more columns to test, optionally with\n"
72 " corresponding sigma (error) columns.\n"
73 " -exclude=<name>[,...] Exclude specified columns from testing.\n"
74 " -degreesOfFreedom={<value>|@<parameterName>}\n"
75 " Specify degrees of freedom as a fixed value or reference\n"
76 " a parameter in the input SDDS file.\n"
77 " -test={ks|chisquared} Choose the statistical test to perform: 'ks' for\n"
78 " Kolmogorov-Smirnov or 'chisquared' for Chi-Squared.\n"
79 " -fileDistribution=<filename>,<indepName>,<depenName>\n"
80 " Use a user-defined distribution from a file.\n"
81 " -gaussian Test against a Gaussian distribution.\n"
82 " -poisson Test against a Poisson distribution.\n"
83 " -student Test against a Student's t distribution.\n"
84 " -chisquared Test against a Chi-Squared distribution.\n"
85 " -majorOrder=row|column Specify data ordering: 'row' for row-major or 'column'\n"
86 " for column-major.\n"
87 " -threads=<number> Number of threads to use for computations.\n\n"
88 "Program Information:\n"
89 " Program by M. Borland. (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
")\n";
108static char *option[N_OPTIONS] = {
126static char *testChoice[N_TESTS] = {
131int main(
int argc,
char **argv) {
133 unsigned long dummyFlags, pipeFlags, majorOrderFlag;
134 SCANNED_ARG *scanned;
136 char *input = NULL, *output = NULL, *distFile = NULL, **columnName = NULL, **sigmaName = NULL, **excludeName = NULL, *distFileIndep = NULL, *distFileDepen = NULL;
137 long testCode = 0, distCode = -1, code, degreesFree = -1, columnNames = 0, excludeNames = 0;
138 char *dofParameter = NULL;
139 short columnMajorOrder = -1;
143 argc =
scanargs(&scanned, argc, argv);
149 for (iArg = 1; iArg < argc; iArg++) {
150 if (scanned[iArg].arg_type == OPTION) {
152 code =
match_string(scanned[iArg].list[0], option, N_OPTIONS, 0);
154 case CLO_MAJOR_ORDER:
156 scanned[iArg].n_items--;
157 if (scanned[iArg].n_items > 0 &&
158 !
scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
159 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
160 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)) {
161 SDDS_Bomb(
"invalid -majorOrder syntax/values");
163 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
164 columnMajorOrder = 1;
165 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
166 columnMajorOrder = 0;
169 if (!
processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
170 SDDS_Bomb(
"invalid -pipe syntax/values");
173 if ((scanned[iArg].n_items != 2 && scanned[iArg].n_items != 3) ||
175 SDDS_Bomb(
"invalid -column syntax/values");
177 moveToStringArray(&columnName, &columnNames, scanned[iArg].list + 1, 1);
178 sigmaName =
SDDS_Realloc(sigmaName,
sizeof(*sigmaName) * columnNames);
179 if (scanned[iArg].n_items == 3) {
180 scanned[iArg].n_items -= 2;
181 if (!
scan_item_list(&dummyFlags, scanned[iArg].list + 2, &scanned[iArg].n_items,
"sigma",
182 SDDS_STRING, sigmaName + columnNames - 1, 1, 1, NULL) ||
185 SDDS_Bomb(
"invalid -column syntax/values");
188 sigmaName[columnNames - 1] = NULL;
192 if (scanned[iArg].n_items != 2 ||
193 (testCode =
match_string(scanned[iArg].list[1], testChoice, N_TESTS, 0)) < 0) {
194 SDDS_Bomb(
"invalid -test syntax/values");
198 if (scanned[iArg].n_items != 4) {
199 SDDS_Bomb(
"too few qualifiers for -fileDistribution");
204 SDDS_Bomb(
"invalid -fileDistribution values");
214 if (scanned[iArg].n_items != 2) {
215 SDDS_Bomb(
"too few qualifiers for -degreesOfFreedom");
217 if (scanned[iArg].list[1][0] ==
'@') {
221 }
else if (sscanf(scanned[iArg].list[1],
"%ld", °reesFree) != 1 || degreesFree <= 1) {
222 SDDS_Bomb(
"invalid degrees-of-freedom given for -student/-chiSquared");
227 SDDS_Bomb(
"invalid -exclude syntax/values");
229 moveToStringArray(&excludeName, &excludeNames, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
232 if (scanned[iArg].n_items != 2 ||
233 !sscanf(scanned[iArg].list[1],
"%d", &threads) ||
239 fprintf(stderr,
"error: unknown/ambiguous option: %s (%s)\n", scanned[iArg].list[0], argv[0]);
244 input = scanned[iArg].list[0];
246 output = scanned[iArg].list[0];
257 SDDS_Bomb(
"-column option must be supplied");
258 if (!(columnNames = expandColumnPairNames(&SDDSin, &columnName, &sigmaName, columnNames, excludeName, excludeNames, FIND_NUMERIC_TYPE, 0))) {
260 SDDS_Bomb(
"named columns nonexistent or nonnumerical");
263 SDDS_Bomb(
"degrees-of-freedom parameter not found");
266 compareToFileDistribution(output, testCode, &SDDSin, columnName, columnNames, distFile, distFileIndep, distFileDepen);
268 compareToDistribution(output, testCode, &SDDSin, columnName, sigmaName, columnNames, distCode, degreesFree, dofParameter, columnMajorOrder, threads);
277void compareToFileDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
long columnNames,
char *distFile,
char *distFileIndep,
char *distFileDepen) {
288 SDDS_Bomb(
"-fileDistribution option not implemented yet");
291static double sampleMean, sampleStDev;
294double gaussianPDF(
double x) {
296 z = (x - sampleMean) / sampleStDev;
297 return exp(-z * z / 2) / sqrt(PIx2);
300double gaussianCDF(
double x) {
303 z = (x - sampleMean) / sampleStDev;
309 return (1 + zSign * erf(z / SQRT2)) / 2.0;
312#define POISSON_ACCURACY (1e-8)
314double poissonPDF(
double xd) {
319 return exp(-sampleMean + x * log(sampleMean) - lgamma(x + 1.0));
322double poissonCDF(
double xd) {
323 double CDF, term, accuracy;
332 accuracy = POISSON_ACCURACY / exp(-sampleMean);
333 for (n = 1; n <= x; n++) {
334 term *= sampleMean / n;
339 return CDF * exp(-sampleMean);
342double studentPDF(
double t) {
343 return exp(-0.5 * (DOF + 1) * log(1 + t * t / DOF) + lgamma((DOF + 1.0) / 2.0) - lgamma(DOF / 2.0)) / sqrt(PI * DOF);
346double studentCDF(
double t) {
348 return 1 -
betaInc(DOF / 2.0, 0.5, DOF / (DOF + t * t)) / 2;
350 return betaInc(DOF / 2.0, 0.5, DOF / (DOF + t * t)) / 2;
353#define LOG2 0.693147180559945
355double chiSquaredPDF(
double x) {
356 double chiSqr, DOFover2;
360 chiSqr = x * DOF / sampleMean;
361 DOFover2 = DOF / 2.0;
362 return exp((DOFover2 - 1.0) * log(chiSqr) - chiSqr / 2 - DOFover2 * LOG2 - lgamma(DOFover2));
365double chiSquaredCDF(
double x) {
370 chiSqr = x * DOF / sampleMean;
371 return 1 -
gammaQ(DOF / 2.0, chiSqr / 2.0);
374void compareToDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
char **sigmaName,
long columnNames,
long distCode,
long degreesFree,
char *dofParameter,
short columnMajorOrder,
int threads) {
376 double *data, *data1, stat, sigLevel;
377 long iStat, iSigLevel, iCount, iColumnName;
380 iStat = iSigLevel = iCount = iColumnName = 0;
382 0 >
SDDS_DefineParameter(&SDDSout,
"distestDistribution", NULL, NULL,
"sddsdistest distribution name", NULL,
388 if (columnMajorOrder != -1)
389 SDDSout.layout.data_mode.column_major = columnMajorOrder;
391 SDDSout.layout.data_mode.column_major = SDDSin->layout.data_mode.column_major;
394 if (0 > (iColumnName =
SDDS_DefineColumn(&SDDSout,
"ColumnName", NULL, NULL,
"Column analysed by sddsdistest",
397 0 > (iSigLevel =
SDDS_DefineColumn(&SDDSout,
"distestSigLevel",
"P(D$ba$n>D)", NULL,
"Probability of exceeding D", NULL,
SDDS_DOUBLE, 0))) {
402 if (0 > (iColumnName =
SDDS_DefineColumn(&SDDSout,
"ColumnName", NULL, NULL,
"Column analysed by sddsdistest",
404 0 > (iStat =
SDDS_DefineParameter(&SDDSout,
"ChiSquared",
"$gh$r$a2$n", NULL,
"Chi-squared statistic", NULL,
406 0 > (iSigLevel =
SDDS_DefineParameter(&SDDSout,
"distestSigLevel",
"P($gh$r$a2$n$ba$n>$gh$r$a2$n)", NULL,
"Probability of exceeding $gh$r$a2$n", NULL,
SDDS_DOUBLE, 0))) {
411 SDDS_Bomb(
"Invalid testCode seen in compareToDistribution--this shouldn't happen.");
420 for (icol = 0; icol < columnNames; icol++) {
423 if (!
SDDS_SetColumn(&SDDSout, SDDS_SET_BY_INDEX, columnName, columnNames, iColumnName) ||
424 !
SDDS_SetParameters(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iCount, rows, -1)) {
437 if (testCode == KS_TEST)
438 ksTestWithFunction(data, rows, gaussianCDF, &stat, &sigLevel);
440 chiSquaredTestWithFunction(data, rows, gaussianPDF, &stat, &sigLevel);
444 if (testCode == KS_TEST)
445 ksTestWithFunction(data, rows, poissonCDF, &stat, &sigLevel);
447 chiSquaredTestWithFunction(data, rows, poissonPDF, &stat, &sigLevel);
451 SDDS_Bomb(
"must have at least one degree of freedom for Student distribution tests");
453 if (sigmaName && sigmaName[icol]) {
456 for (row = 0; row < rows; row++)
458 data[row] = (data[row] - sampleMean) / data1[row];
461 for (row = 0; row < rows; row++)
462 data[row] -= sampleMean;
464 if (testCode == KS_TEST)
465 ksTestWithFunction(data, rows, studentCDF, &stat, &sigLevel);
467 chiSquaredTestWithFunction(data, rows, studentPDF, &stat, &sigLevel);
472 SDDS_Bomb(
"must have at least one degree of freedom for chi-squared distribution tests");
473 if (testCode == KS_TEST)
474 ksTestWithFunction(data, rows, chiSquaredCDF, &stat, &sigLevel);
476 chiSquaredTestWithFunction(data, rows, chiSquaredPDF, &stat, &sigLevel);
479 SDDS_Bomb(
"Invalid distCode in compareToDistribution--this shouldn't happen");
483 if (!
SDDS_SetRowValues(&SDDSout, SDDS_PASS_BY_VALUE | SDDS_SET_BY_INDEX, icol, iStat, stat, iSigLevel, sigLevel, -1))
495void chiSquaredTestWithFunction(
double *data, int64_t rows,
double (*PDF)(
double x),
double *statReturn,
double *sigLevelReturn) {
496 SDDS_Bomb(
"Chi-squared distribution test not implemented yet---wouldn't you really like a nice K-S test instead?");
499void ksTestWithFunction(
double *data, int64_t rows,
double (*CDF)(
double x),
double *statReturn,
double *sigLevelReturn) {
500 double CDF1, CDF2, dCDF1, dCDF2, CDF0, dCDFmaximum;
504 dCDFmaximum = CDF1 = 0;
505 for (row = 0; row < rows; row++) {
506 CDF0 = (*CDF)(data[row]);
507 CDF2 = (row + 1.0) / rows;
515 if (dCDF1 > dCDFmaximum)
517 if (dCDF2 > dCDFmaximum)
520 *statReturn = dCDFmaximum;
521 *sigLevelReturn =
KS_Qfunction(sqrt((
double)rows) * dCDFmaximum);
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_SetColumn(SDDS_DATASET *SDDS_dataset, int32_t mode, void *data, int64_t rows,...)
Sets the values for one data column in the current data table of an SDDS dataset.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
int32_t SDDS_CheckColumn(SDDS_DATASET *SDDS_dataset, char *name, char *units, int32_t type, FILE *fp_message)
Checks if a column exists in the SDDS dataset with the specified name, units, and type.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
int32_t SDDS_StringIsBlank(char *s)
Checks if a string is blank (contains only whitespace characters).
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
int32_t SDDS_CheckParameter(SDDS_DATASET *SDDS_dataset, char *name, char *units, int32_t type, FILE *fp_message)
Checks if a parameter exists in the SDDS dataset with the specified name, units, and type.
int32_t SDDS_CopyString(char **target, const char *source)
Copies a source string to a target string with memory allocation.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_STRING
Identifier for the string data type.
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
#define SDDS_ANY_NUMERIC_TYPE
Special identifier used by SDDS_Check*() routines to accept any numeric type.
#define SDDS_DOUBLE
Identifier for the double data type.
double betaInc(double a, double b, double x)
Compute the incomplete beta function.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double gammaQ(double a, double x)
Compute the regularized upper incomplete gamma function Q(a,x).
double KS_Qfunction(double lambda)
Compute the Q-function for the Kolmogorov-Smirnov test.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long computeMomentsThreaded(double *mean, double *rms, double *standDev, double *meanAbsoluteDev, double *x, long n, long numThreads)
Computes the mean, RMS, standard deviation, and mean absolute deviation of an array using multiple th...
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
long scan_item_list(unsigned long *flags, char **item, long *items,...)
Scans a list of items without flag extension and assigns values based on provided keywords and types.
int double_cmpasc(const void *a, const void *b)
Compare two doubles in ascending order.