79void compareToFileDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
long columnNames,
char *distFile,
char *distFileIndep,
char *distFileDepen);
80void compareToDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
char **sigmaName,
long columnNames,
long distCode,
long degreesFree,
char *dofParameter,
short columnMajorOrder,
int threads);
81void ksTestWithFunction(
double *data, int64_t rows,
double (*CDF)(
double x),
double *statReturn,
double *sigLevelReturn);
82void chiSquaredTestWithFunction(
double *data, int64_t rows,
double (*PDF)(
double x),
double *statReturn,
double *sigLevelReturn);
85 "sddsdistest [<input>] [<output>]\n"
86 " [-pipe=[in][,out]]\n"
87 " -column=<name>[,sigma=<name>]...\n"
88 " -exclude=<name>[,...]\n"
89 " [-degreesOfFreedom={<value>|@<parameterName>}]\n"
90 " [-test={ks|chisquared}]\n"
92 " -fileDistribution=<filename>,<indepName>,<depenName> |\n"
98 " [-majorOrder=row|column]\n"
99 " [-threads=<number>]\n\n"
101 " Tests data columns against specified statistical distributions using the\n"
102 " Kolmogorov-Smirnov or Chi-Squared tests.\n\n"
104 " <input> Input SDDS file. If omitted, standard input is used.\n"
105 " <output> Output SDDS file. If omitted, standard output is used.\n"
106 " -pipe=[in][,out] Use pipe for input and/or output.\n"
107 " -column=<name>[,sigma=<name>]...\n"
108 " Specify one or more columns to test, optionally with\n"
109 " corresponding sigma (error) columns.\n"
110 " -exclude=<name>[,...] Exclude specified columns from testing.\n"
111 " -degreesOfFreedom={<value>|@<parameterName>}\n"
112 " Specify degrees of freedom as a fixed value or reference\n"
113 " a parameter in the input SDDS file.\n"
114 " -test={ks|chisquared} Choose the statistical test to perform: 'ks' for\n"
115 " Kolmogorov-Smirnov or 'chisquared' for Chi-Squared.\n"
116 " -fileDistribution=<filename>,<indepName>,<depenName>\n"
117 " Use a user-defined distribution from a file.\n"
118 " -gaussian Test against a Gaussian distribution.\n"
119 " -poisson Test against a Poisson distribution.\n"
120 " -student Test against a Student's t distribution.\n"
121 " -chisquared Test against a Chi-Squared distribution.\n"
122 " -majorOrder=row|column Specify data ordering: 'row' for row-major or 'column'\n"
123 " for column-major.\n"
124 " -threads=<number> Number of threads to use for computations.\n\n"
125 "Program Information:\n"
126 " Program by M. Borland. (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
")\n";
145static char *option[N_OPTIONS] = {
163static char *testChoice[N_TESTS] = {
168int main(
int argc,
char **argv) {
170 unsigned long dummyFlags, pipeFlags, majorOrderFlag;
171 SCANNED_ARG *scanned;
173 char *input = NULL, *output = NULL, *distFile = NULL, **columnName = NULL, **sigmaName = NULL, **excludeName = NULL, *distFileIndep = NULL, *distFileDepen = NULL;
174 long testCode = 0, distCode = -1, code, degreesFree = -1, columnNames = 0, excludeNames = 0;
175 char *dofParameter = NULL;
176 short columnMajorOrder = -1;
180 argc =
scanargs(&scanned, argc, argv);
186 for (iArg = 1; iArg < argc; iArg++) {
187 if (scanned[iArg].arg_type == OPTION) {
189 code =
match_string(scanned[iArg].list[0], option, N_OPTIONS, 0);
191 case CLO_MAJOR_ORDER:
193 scanned[iArg].n_items--;
194 if (scanned[iArg].n_items > 0 &&
195 !
scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
196 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
197 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)) {
198 SDDS_Bomb(
"invalid -majorOrder syntax/values");
200 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
201 columnMajorOrder = 1;
202 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
203 columnMajorOrder = 0;
206 if (!
processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
207 SDDS_Bomb(
"invalid -pipe syntax/values");
210 if ((scanned[iArg].n_items != 2 && scanned[iArg].n_items != 3) ||
212 SDDS_Bomb(
"invalid -column syntax/values");
214 moveToStringArray(&columnName, &columnNames, scanned[iArg].list + 1, 1);
215 sigmaName =
SDDS_Realloc(sigmaName,
sizeof(*sigmaName) * columnNames);
216 if (scanned[iArg].n_items == 3) {
217 scanned[iArg].n_items -= 2;
218 if (!
scan_item_list(&dummyFlags, scanned[iArg].list + 2, &scanned[iArg].n_items,
"sigma",
219 SDDS_STRING, sigmaName + columnNames - 1, 1, 1, NULL) ||
222 SDDS_Bomb(
"invalid -column syntax/values");
225 sigmaName[columnNames - 1] = NULL;
229 if (scanned[iArg].n_items != 2 ||
230 (testCode =
match_string(scanned[iArg].list[1], testChoice, N_TESTS, 0)) < 0) {
231 SDDS_Bomb(
"invalid -test syntax/values");
235 if (scanned[iArg].n_items != 4) {
236 SDDS_Bomb(
"too few qualifiers for -fileDistribution");
241 SDDS_Bomb(
"invalid -fileDistribution values");
251 if (scanned[iArg].n_items != 2) {
252 SDDS_Bomb(
"too few qualifiers for -degreesOfFreedom");
254 if (scanned[iArg].list[1][0] ==
'@') {
258 }
else if (sscanf(scanned[iArg].list[1],
"%ld", °reesFree) != 1 || degreesFree <= 1) {
259 SDDS_Bomb(
"invalid degrees-of-freedom given for -student/-chiSquared");
264 SDDS_Bomb(
"invalid -exclude syntax/values");
266 moveToStringArray(&excludeName, &excludeNames, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
269 if (scanned[iArg].n_items != 2 ||
270 !sscanf(scanned[iArg].list[1],
"%d", &threads) ||
276 fprintf(stderr,
"error: unknown/ambiguous option: %s (%s)\n", scanned[iArg].list[0], argv[0]);
281 input = scanned[iArg].list[0];
283 output = scanned[iArg].list[0];
294 SDDS_Bomb(
"-column option must be supplied");
295 if (!(columnNames = expandColumnPairNames(&SDDSin, &columnName, &sigmaName, columnNames, excludeName, excludeNames, FIND_NUMERIC_TYPE, 0))) {
297 SDDS_Bomb(
"named columns nonexistent or nonnumerical");
300 SDDS_Bomb(
"degrees-of-freedom parameter not found");
303 compareToFileDistribution(output, testCode, &SDDSin, columnName, columnNames, distFile, distFileIndep, distFileDepen);
305 compareToDistribution(output, testCode, &SDDSin, columnName, sigmaName, columnNames, distCode, degreesFree, dofParameter, columnMajorOrder, threads);
314void compareToFileDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
long columnNames,
char *distFile,
char *distFileIndep,
char *distFileDepen) {
325 SDDS_Bomb(
"-fileDistribution option not implemented yet");
328static double sampleMean, sampleStDev;
331double gaussianPDF(
double x) {
333 z = (x - sampleMean) / sampleStDev;
334 return exp(-z * z / 2) / sqrt(PIx2);
337double gaussianCDF(
double x) {
340 z = (x - sampleMean) / sampleStDev;
346 return (1 + zSign * erf(z / SQRT2)) / 2.0;
349#define POISSON_ACCURACY (1e-8)
351double poissonPDF(
double xd) {
356 return exp(-sampleMean + x * log(sampleMean) - lgamma(x + 1.0));
359double poissonCDF(
double xd) {
360 double CDF, term, accuracy;
369 accuracy = POISSON_ACCURACY / exp(-sampleMean);
370 for (n = 1; n <= x; n++) {
371 term *= sampleMean / n;
376 return CDF * exp(-sampleMean);
379double studentPDF(
double t) {
380 return exp(-0.5 * (DOF + 1) * log(1 + t * t / DOF) + lgamma((DOF + 1.0) / 2.0) - lgamma(DOF / 2.0)) / sqrt(PI * DOF);
383double studentCDF(
double t) {
385 return 1 -
betaInc(DOF / 2.0, 0.5, DOF / (DOF + t * t)) / 2;
387 return betaInc(DOF / 2.0, 0.5, DOF / (DOF + t * t)) / 2;
390#define LOG2 0.693147180559945
392double chiSquaredPDF(
double x) {
393 double chiSqr, DOFover2;
397 chiSqr = x * DOF / sampleMean;
398 DOFover2 = DOF / 2.0;
399 return exp((DOFover2 - 1.0) * log(chiSqr) - chiSqr / 2 - DOFover2 * LOG2 - lgamma(DOFover2));
402double chiSquaredCDF(
double x) {
407 chiSqr = x * DOF / sampleMean;
408 return 1 -
gammaQ(DOF / 2.0, chiSqr / 2.0);
411void compareToDistribution(
char *output,
long testCode,
SDDS_DATASET *SDDSin,
char **columnName,
char **sigmaName,
long columnNames,
long distCode,
long degreesFree,
char *dofParameter,
short columnMajorOrder,
int threads) {
413 double *data, *data1, stat, sigLevel;
414 long iStat, iSigLevel, iCount, iColumnName;
417 iStat = iSigLevel = iCount = iColumnName = 0;
419 0 >
SDDS_DefineParameter(&SDDSout,
"distestDistribution", NULL, NULL,
"sddsdistest distribution name", NULL,
425 if (columnMajorOrder != -1)
426 SDDSout.layout.data_mode.column_major = columnMajorOrder;
428 SDDSout.layout.data_mode.column_major = SDDSin->layout.data_mode.column_major;
431 if (0 > (iColumnName =
SDDS_DefineColumn(&SDDSout,
"ColumnName", NULL, NULL,
"Column analysed by sddsdistest",
434 0 > (iSigLevel =
SDDS_DefineColumn(&SDDSout,
"distestSigLevel",
"P(D$ba$n>D)", NULL,
"Probability of exceeding D", NULL,
SDDS_DOUBLE, 0))) {
439 if (0 > (iColumnName =
SDDS_DefineColumn(&SDDSout,
"ColumnName", NULL, NULL,
"Column analysed by sddsdistest",
441 0 > (iStat =
SDDS_DefineParameter(&SDDSout,
"ChiSquared",
"$gh$r$a2$n", NULL,
"Chi-squared statistic", NULL,
443 0 > (iSigLevel =
SDDS_DefineParameter(&SDDSout,
"distestSigLevel",
"P($gh$r$a2$n$ba$n>$gh$r$a2$n)", NULL,
"Probability of exceeding $gh$r$a2$n", NULL,
SDDS_DOUBLE, 0))) {
448 SDDS_Bomb(
"Invalid testCode seen in compareToDistribution--this shouldn't happen.");
457 for (icol = 0; icol < columnNames; icol++) {
460 if (!
SDDS_SetColumn(&SDDSout, SDDS_SET_BY_INDEX, columnName, columnNames, iColumnName) ||
461 !
SDDS_SetParameters(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iCount, rows, -1)) {
474 if (testCode == KS_TEST)
475 ksTestWithFunction(data, rows, gaussianCDF, &stat, &sigLevel);
477 chiSquaredTestWithFunction(data, rows, gaussianPDF, &stat, &sigLevel);
481 if (testCode == KS_TEST)
482 ksTestWithFunction(data, rows, poissonCDF, &stat, &sigLevel);
484 chiSquaredTestWithFunction(data, rows, poissonPDF, &stat, &sigLevel);
488 SDDS_Bomb(
"must have at least one degree of freedom for Student distribution tests");
490 if (sigmaName && sigmaName[icol]) {
493 for (row = 0; row < rows; row++)
495 data[row] = (data[row] - sampleMean) / data1[row];
498 for (row = 0; row < rows; row++)
499 data[row] -= sampleMean;
501 if (testCode == KS_TEST)
502 ksTestWithFunction(data, rows, studentCDF, &stat, &sigLevel);
504 chiSquaredTestWithFunction(data, rows, studentPDF, &stat, &sigLevel);
509 SDDS_Bomb(
"must have at least one degree of freedom for chi-squared distribution tests");
510 if (testCode == KS_TEST)
511 ksTestWithFunction(data, rows, chiSquaredCDF, &stat, &sigLevel);
513 chiSquaredTestWithFunction(data, rows, chiSquaredPDF, &stat, &sigLevel);
516 SDDS_Bomb(
"Invalid distCode in compareToDistribution--this shouldn't happen");
520 if (!
SDDS_SetRowValues(&SDDSout, SDDS_PASS_BY_VALUE | SDDS_SET_BY_INDEX, icol, iStat, stat, iSigLevel, sigLevel, -1))
532void chiSquaredTestWithFunction(
double *data, int64_t rows,
double (*PDF)(
double x),
double *statReturn,
double *sigLevelReturn) {
533 SDDS_Bomb(
"Chi-squared distribution test not implemented yet---wouldn't you really like a nice K-S test instead?");
536void ksTestWithFunction(
double *data, int64_t rows,
double (*CDF)(
double x),
double *statReturn,
double *sigLevelReturn) {
537 double CDF1, CDF2, dCDF1, dCDF2, CDF0, dCDFmaximum;
541 dCDFmaximum = CDF1 = 0;
542 for (row = 0; row < rows; row++) {
543 CDF0 = (*CDF)(data[row]);
544 CDF2 = (row + 1.0) / rows;
552 if (dCDF1 > dCDFmaximum)
554 if (dCDF2 > dCDFmaximum)
557 *statReturn = dCDFmaximum;
558 *sigLevelReturn =
KS_Qfunction(sqrt((
double)rows) * dCDFmaximum);
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_SetColumn(SDDS_DATASET *SDDS_dataset, int32_t mode, void *data, int64_t rows,...)
Sets the values for one data column in the current data table of an SDDS dataset.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
int32_t SDDS_CheckColumn(SDDS_DATASET *SDDS_dataset, char *name, char *units, int32_t type, FILE *fp_message)
Checks if a column exists in the SDDS dataset with the specified name, units, and type.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
int32_t SDDS_StringIsBlank(char *s)
Checks if a string is blank (contains only whitespace characters).
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
int32_t SDDS_CheckParameter(SDDS_DATASET *SDDS_dataset, char *name, char *units, int32_t type, FILE *fp_message)
Checks if a parameter exists in the SDDS dataset with the specified name, units, and type.
int32_t SDDS_CopyString(char **target, const char *source)
Copies a source string to a target string with memory allocation.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_STRING
Identifier for the string data type.
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
#define SDDS_ANY_NUMERIC_TYPE
Special identifier used by SDDS_Check*() routines to accept any numeric type.
#define SDDS_DOUBLE
Identifier for the double data type.
Utility functions for SDDS dataset manipulation and string array operations.
double betaInc(double a, double b, double x)
Compute the incomplete beta function.
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
double gammaQ(double a, double x)
Compute the regularized upper incomplete gamma function Q(a,x).
double KS_Qfunction(double lambda)
Compute the Q-function for the Kolmogorov-Smirnov test.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long computeMomentsThreaded(double *mean, double *rms, double *standDev, double *meanAbsoluteDev, double *x, long n, long numThreads)
Computes the mean, RMS, standard deviation, and mean absolute deviation of an array using multiple th...
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
long scan_item_list(unsigned long *flags, char **item, long *items,...)
Scans a list of items without flag extension and assigns values based on provided keywords and types.
int double_cmpasc(const void *a, const void *b)
Compare two doubles in ascending order.