SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddsdistest.c
Go to the documentation of this file.
1/**
2 * @file sddsdistest.c
3 * @brief Statistical distribution testing tool for SDDS datasets.
4 *
5 * @details
6 * This program performs statistical tests on columns of data from an SDDS file
7 * against specified distributions (Gaussian, Poisson, Student's t, or Chi-Squared).
8 * It supports both Kolmogorov-Smirnov (KS) and Chi-Squared tests, along with the
9 * ability to use user-defined distributions provided via external files.
10 *
11 * @section Usage
12 * ```
13 * sddsdistest [<inputfile>] [<outputfile>]
14 * [-pipe=[in][,out]]
15 * -column=<name>[,sigma=<name>]...
16 * [-exclude=<name>[,...]]
17 * [-degreesOfFreedom={<value>|@<parameterName>}]
18 * [-test={ks|chisquared}]
19 * [-fileDistribution=<filename>,<indepName>,<depenName>]
20 * [-gaussian]
21 * [-poisson]
22 * [-student]
23 * [-chisquared]
24 * [-majorOrder=row|column]
25 * [-threads=<number>]
26 * ```
27 *
28 * @section Options
29 * | Required | Description |
30 * |---------------------------------------|---------------------------------------------------------------------------------------|
31 * | `-column` | Specifies the columns to test, with optional sigma columns for error handling. |
32 *
33 * | Optional | Description |
34 * |---------------------------------------|---------------------------------------------------------------------------------------|
35 * | `-pipe` | Enables piping for input and/or output data streams. |
36 * | `-exclude` | Excludes specified columns from testing. |
37 * | `-degreesOfFreedom` | Specifies degrees of freedom as a fixed value or a parameter. |
38 * | `-test` | Selects the test to perform: Kolmogorov-Smirnov or Chi-Squared. |
39 * | `-fileDistribution` | Uses a user-defined distribution from a file. |
40 * | `-gaussian` | Tests against a Gaussian distribution. |
41 * | `-poisson` | Tests against a Poisson distribution. |
42 * | `-student` | Tests against a Student's t distribution. |
43 * | `-chisquared` | Tests against a Chi-Squared distribution. |
44 * | `-majorOrder` | Specifies the data ordering: 'row' or 'column'. |
45 * | `-threads` | Number of threads for parallel computations. |
46 *
47 * @subsection Incompatibilities
48 * - The `-test` option is incompatible with specifying multiple distributions (e.g., `-gaussian` and `-poisson`).
49 * - Only one of the following may be specified:
50 * - `-fileDistribution`
51 * - `-gaussian`
52 * - `-poisson`
53 * - `-student`
54 * - `-chisquared`
55 *
56 * @copyright
57 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
58 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
59 *
60 * @license
61 * This file is distributed under the terms of the Software License Agreement
62 * found in the file LICENSE included with this distribution.
63 *
64 * @author
65 * M. Borland, R. Soliday, Xuesong Jiao, H. Shang
66 */
67
68#include "mdb.h"
69#include "SDDS.h"
70#include "SDDSutils.h"
71#include "scan.h"
72
73#if defined(_WIN32)
74# if _MSC_VER <= 1600
75# include "fdlibm.h"
76# endif
77#endif
78
79void compareToFileDistribution(char *output, long testCode, SDDS_DATASET *SDDSin, char **columnName, long columnNames, char *distFile, char *distFileIndep, char *distFileDepen);
80void compareToDistribution(char *output, long testCode, SDDS_DATASET *SDDSin, char **columnName, char **sigmaName, long columnNames, long distCode, long degreesFree, char *dofParameter, short columnMajorOrder, int threads);
81void ksTestWithFunction(double *data, int64_t rows, double (*CDF)(double x), double *statReturn, double *sigLevelReturn);
82void chiSquaredTestWithFunction(double *data, int64_t rows, double (*PDF)(double x), double *statReturn, double *sigLevelReturn);
83
84static char *USAGE =
85 "sddsdistest [<input>] [<output>]\n"
86 " [-pipe=[in][,out]]\n"
87 " -column=<name>[,sigma=<name>]...\n"
88 " -exclude=<name>[,...]\n"
89 " [-degreesOfFreedom={<value>|@<parameterName>}]\n"
90 " [-test={ks|chisquared}]\n"
91 " [{\n"
92 " -fileDistribution=<filename>,<indepName>,<depenName> |\n"
93 " -gaussian |\n"
94 " -poisson |\n"
95 " -student |\n"
96 " -chisquared\n"
97 " }]\n"
98 " [-majorOrder=row|column]\n"
99 " [-threads=<number>]\n\n"
100 "Description:\n"
101 " Tests data columns against specified statistical distributions using the\n"
102 " Kolmogorov-Smirnov or Chi-Squared tests.\n\n"
103 "Options:\n"
104 " <input> Input SDDS file. If omitted, standard input is used.\n"
105 " <output> Output SDDS file. If omitted, standard output is used.\n"
106 " -pipe=[in][,out] Use pipe for input and/or output.\n"
107 " -column=<name>[,sigma=<name>]...\n"
108 " Specify one or more columns to test, optionally with\n"
109 " corresponding sigma (error) columns.\n"
110 " -exclude=<name>[,...] Exclude specified columns from testing.\n"
111 " -degreesOfFreedom={<value>|@<parameterName>}\n"
112 " Specify degrees of freedom as a fixed value or reference\n"
113 " a parameter in the input SDDS file.\n"
114 " -test={ks|chisquared} Choose the statistical test to perform: 'ks' for\n"
115 " Kolmogorov-Smirnov or 'chisquared' for Chi-Squared.\n"
116 " -fileDistribution=<filename>,<indepName>,<depenName>\n"
117 " Use a user-defined distribution from a file.\n"
118 " -gaussian Test against a Gaussian distribution.\n"
119 " -poisson Test against a Poisson distribution.\n"
120 " -student Test against a Student's t distribution.\n"
121 " -chisquared Test against a Chi-Squared distribution.\n"
122 " -majorOrder=row|column Specify data ordering: 'row' for row-major or 'column'\n"
123 " for column-major.\n"
124 " -threads=<number> Number of threads to use for computations.\n\n"
125 "Program Information:\n"
126 " Program by M. Borland. (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
127
128/* Enumeration for option types */
129enum option_type {
130 CLO_PIPE,
131 CLO_COLUMN,
132 CLO_TEST,
133 CLO_FILEDIST,
134 CLO_GAUSSIAN,
135 CLO_POISSON,
136 CLO_STUDENT,
137 CLO_CHISQUARED,
138 CLO_DOF,
139 CLO_EXCLUDE,
140 CLO_MAJOR_ORDER,
141 CLO_THREADS,
142 N_OPTIONS
143};
144
145static char *option[N_OPTIONS] = {
146 "pipe",
147 "column",
148 "test",
149 "filedistribution",
150 "gaussian",
151 "poisson",
152 "student",
153 "chisquared",
154 "degreesoffreedom",
155 "exclude",
156 "majorOrder",
157 "threads"
158};
159
160#define KS_TEST 0
161#define CHI_TEST 1
162#define N_TESTS 2
163static char *testChoice[N_TESTS] = {
164 "ks",
165 "chisquared",
166};
167
168int main(int argc, char **argv) {
169 int iArg;
170 unsigned long dummyFlags, pipeFlags, majorOrderFlag;
171 SCANNED_ARG *scanned;
172 SDDS_DATASET SDDSin;
173 char *input = NULL, *output = NULL, *distFile = NULL, **columnName = NULL, **sigmaName = NULL, **excludeName = NULL, *distFileIndep = NULL, *distFileDepen = NULL;
174 long testCode = 0, distCode = -1, code, degreesFree = -1, columnNames = 0, excludeNames = 0;
175 char *dofParameter = NULL;
176 short columnMajorOrder = -1;
177 int threads = 1;
178
180 argc = scanargs(&scanned, argc, argv);
181 if (argc < 3)
182 bomb(NULL, USAGE);
183
184 pipeFlags = 0;
185
186 for (iArg = 1; iArg < argc; iArg++) {
187 if (scanned[iArg].arg_type == OPTION) {
188 /* process options here */
189 code = match_string(scanned[iArg].list[0], option, N_OPTIONS, 0);
190 switch (code) {
191 case CLO_MAJOR_ORDER:
192 majorOrderFlag = 0;
193 scanned[iArg].n_items--;
194 if (scanned[iArg].n_items > 0 &&
195 !scanItemList(&majorOrderFlag, scanned[iArg].list + 1, &scanned[iArg].n_items, 0,
196 "row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
197 "column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)) {
198 SDDS_Bomb("invalid -majorOrder syntax/values");
199 }
200 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
201 columnMajorOrder = 1;
202 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
203 columnMajorOrder = 0;
204 break;
205 case CLO_PIPE:
206 if (!processPipeOption(scanned[iArg].list + 1, scanned[iArg].n_items - 1, &pipeFlags))
207 SDDS_Bomb("invalid -pipe syntax/values");
208 break;
209 case CLO_COLUMN:
210 if ((scanned[iArg].n_items != 2 && scanned[iArg].n_items != 3) ||
211 SDDS_StringIsBlank(scanned[iArg].list[1])) {
212 SDDS_Bomb("invalid -column syntax/values");
213 }
214 moveToStringArray(&columnName, &columnNames, scanned[iArg].list + 1, 1);
215 sigmaName = SDDS_Realloc(sigmaName, sizeof(*sigmaName) * columnNames);
216 if (scanned[iArg].n_items == 3) {
217 scanned[iArg].n_items -= 2;
218 if (!scan_item_list(&dummyFlags, scanned[iArg].list + 2, &scanned[iArg].n_items, "sigma",
219 SDDS_STRING, sigmaName + columnNames - 1, 1, 1, NULL) ||
220 dummyFlags != 1 ||
221 SDDS_StringIsBlank(sigmaName[columnNames - 1])) {
222 SDDS_Bomb("invalid -column syntax/values");
223 }
224 } else {
225 sigmaName[columnNames - 1] = NULL;
226 }
227 break;
228 case CLO_TEST:
229 if (scanned[iArg].n_items != 2 ||
230 (testCode = match_string(scanned[iArg].list[1], testChoice, N_TESTS, 0)) < 0) {
231 SDDS_Bomb("invalid -test syntax/values");
232 }
233 break;
234 case CLO_FILEDIST:
235 if (scanned[iArg].n_items != 4) {
236 SDDS_Bomb("too few qualifiers for -fileDistribution");
237 }
238 if (SDDS_StringIsBlank(distFile = scanned[iArg].list[1]) ||
239 SDDS_StringIsBlank(distFileIndep = scanned[iArg].list[2]) ||
240 SDDS_StringIsBlank(distFileDepen = scanned[iArg].list[3])) {
241 SDDS_Bomb("invalid -fileDistribution values");
242 }
243 break;
244 case CLO_GAUSSIAN:
245 case CLO_POISSON:
246 case CLO_STUDENT:
247 case CLO_CHISQUARED:
248 distCode = code;
249 break;
250 case CLO_DOF:
251 if (scanned[iArg].n_items != 2) {
252 SDDS_Bomb("too few qualifiers for -degreesOfFreedom");
253 }
254 if (scanned[iArg].list[1][0] == '@') {
255 if (!SDDS_CopyString(&dofParameter, scanned[iArg].list[1] + 1)) {
256 SDDS_Bomb("memory allocation failure");
257 }
258 } else if (sscanf(scanned[iArg].list[1], "%ld", &degreesFree) != 1 || degreesFree <= 1) {
259 SDDS_Bomb("invalid degrees-of-freedom given for -student/-chiSquared");
260 }
261 break;
262 case CLO_EXCLUDE:
263 if (scanned[iArg].n_items < 2 || SDDS_StringIsBlank(scanned[iArg].list[1])) {
264 SDDS_Bomb("invalid -exclude syntax/values");
265 }
266 moveToStringArray(&excludeName, &excludeNames, scanned[iArg].list + 1, scanned[iArg].n_items - 1);
267 break;
268 case CLO_THREADS:
269 if (scanned[iArg].n_items != 2 ||
270 !sscanf(scanned[iArg].list[1], "%d", &threads) ||
271 threads < 1) {
272 SDDS_Bomb("invalid -threads syntax");
273 }
274 break;
275 default:
276 fprintf(stderr, "error: unknown/ambiguous option: %s (%s)\n", scanned[iArg].list[0], argv[0]);
277 exit(EXIT_FAILURE);
278 }
279 } else {
280 if (!input)
281 input = scanned[iArg].list[0];
282 else if (!output)
283 output = scanned[iArg].list[0];
284 else
285 SDDS_Bomb("too many filenames seen");
286 }
287 }
288
289 processFilenames("sddsdistest", &input, &output, pipeFlags, 0, NULL);
290
291 if (!SDDS_InitializeInput(&SDDSin, input))
292 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
293 if (!columnNames)
294 SDDS_Bomb("-column option must be supplied");
295 if (!(columnNames = expandColumnPairNames(&SDDSin, &columnName, &sigmaName, columnNames, excludeName, excludeNames, FIND_NUMERIC_TYPE, 0))) {
296 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
297 SDDS_Bomb("named columns nonexistent or nonnumerical");
298 }
299 if (dofParameter && SDDS_CheckParameter(&SDDSin, dofParameter, NULL, SDDS_ANY_NUMERIC_TYPE, stderr) != SDDS_CHECK_OKAY)
300 SDDS_Bomb("degrees-of-freedom parameter not found");
301
302 if (distFile)
303 compareToFileDistribution(output, testCode, &SDDSin, columnName, columnNames, distFile, distFileIndep, distFileDepen);
304 else
305 compareToDistribution(output, testCode, &SDDSin, columnName, sigmaName, columnNames, distCode, degreesFree, dofParameter, columnMajorOrder, threads);
306
307 if (!SDDS_Terminate(&SDDSin)) {
308 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
309 return EXIT_FAILURE;
310 }
311 return EXIT_SUCCESS;
312}
313
314void compareToFileDistribution(char *output, long testCode, SDDS_DATASET *SDDSin, char **columnName, long columnNames, char *distFile, char *distFileIndep, char *distFileDepen) {
315 SDDS_DATASET SDDSdist;
316 if (!SDDS_InitializeInput(&SDDSdist, distFile))
317 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
318 if (SDDS_CheckColumn(&SDDSdist, distFileIndep, NULL, SDDS_ANY_NUMERIC_TYPE, stderr) != SDDS_CHECK_OKAY ||
319 SDDS_CheckColumn(&SDDSdist, distFileDepen, NULL, SDDS_ANY_NUMERIC_TYPE, stderr) != SDDS_CHECK_OKAY)
320 exit(EXIT_FAILURE);
321 if (!SDDS_Terminate(&SDDSdist)) {
322 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
323 exit(EXIT_FAILURE);
324 }
325 SDDS_Bomb("-fileDistribution option not implemented yet");
326}
327
328static double sampleMean, sampleStDev;
329static int32_t DOF;
330
331double gaussianPDF(double x) {
332 double z;
333 z = (x - sampleMean) / sampleStDev;
334 return exp(-z * z / 2) / sqrt(PIx2);
335}
336
337double gaussianCDF(double x) {
338 long zSign;
339 double z;
340 z = (x - sampleMean) / sampleStDev;
341 zSign = 1;
342 if (z < 0) {
343 zSign = -1;
344 z = -z;
345 }
346 return (1 + zSign * erf(z / SQRT2)) / 2.0;
347}
348
349#define POISSON_ACCURACY (1e-8)
350
351double poissonPDF(double xd) {
352 long x;
353
354 if ((x = xd) < 0)
355 return 0;
356 return exp(-sampleMean + x * log(sampleMean) - lgamma(x + 1.0));
357}
358
359double poissonCDF(double xd) {
360 double CDF, term, accuracy;
361 long x, n;
362
363 if (xd < 0)
364 xd = 0;
365 x = xd;
366 xd = x;
367 term = 1;
368 CDF = 1;
369 accuracy = POISSON_ACCURACY / exp(-sampleMean);
370 for (n = 1; n <= x; n++) {
371 term *= sampleMean / n;
372 CDF += term;
373 if (term < accuracy)
374 break;
375 }
376 return CDF * exp(-sampleMean);
377}
378
379double studentPDF(double t) {
380 return exp(-0.5 * (DOF + 1) * log(1 + t * t / DOF) + lgamma((DOF + 1.0) / 2.0) - lgamma(DOF / 2.0)) / sqrt(PI * DOF);
381}
382
383double studentCDF(double t) {
384 if (t > 0)
385 return 1 - betaInc(DOF / 2.0, 0.5, DOF / (DOF + t * t)) / 2;
386 else
387 return betaInc(DOF / 2.0, 0.5, DOF / (DOF + t * t)) / 2;
388}
389
390#define LOG2 0.693147180559945
391
392double chiSquaredPDF(double x) {
393 double chiSqr, DOFover2;
394
395 if (x < 0)
396 return 0;
397 chiSqr = x * DOF / sampleMean;
398 DOFover2 = DOF / 2.0;
399 return exp((DOFover2 - 1.0) * log(chiSqr) - chiSqr / 2 - DOFover2 * LOG2 - lgamma(DOFover2));
400}
401
402double chiSquaredCDF(double x) {
403 double chiSqr;
404
405 if (x < 0)
406 x = 0;
407 chiSqr = x * DOF / sampleMean;
408 return 1 - gammaQ(DOF / 2.0, chiSqr / 2.0);
409}
410
411void compareToDistribution(char *output, long testCode, SDDS_DATASET *SDDSin, char **columnName, char **sigmaName, long columnNames, long distCode, long degreesFree, char *dofParameter, short columnMajorOrder, int threads) {
412 SDDS_DATASET SDDSout;
413 double *data, *data1, stat, sigLevel;
414 long iStat, iSigLevel, iCount, iColumnName;
415 long icol;
416 int64_t rows, row;
417 iStat = iSigLevel = iCount = iColumnName = 0;
418 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 0, NULL, "sddsdistest output", output) ||
419 0 > SDDS_DefineParameter(&SDDSout, "distestDistribution", NULL, NULL, "sddsdistest distribution name", NULL,
420 SDDS_STRING, option[distCode]) ||
421 0 > SDDS_DefineParameter(&SDDSout, "distestTest", NULL, NULL, "sddsdistest test name", NULL, SDDS_STRING, testChoice[testCode]) ||
422 0 > (iCount = SDDS_DefineParameter(&SDDSout, "Count", NULL, NULL, "Number of data points", NULL, SDDS_LONG, 0))) {
423 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
424 }
425 if (columnMajorOrder != -1)
426 SDDSout.layout.data_mode.column_major = columnMajorOrder;
427 else
428 SDDSout.layout.data_mode.column_major = SDDSin->layout.data_mode.column_major;
429 switch (testCode) {
430 case KS_TEST:
431 if (0 > (iColumnName = SDDS_DefineColumn(&SDDSout, "ColumnName", NULL, NULL, "Column analysed by sddsdistest",
432 NULL, SDDS_STRING, 0)) ||
433 0 > (iStat = SDDS_DefineColumn(&SDDSout, "D", NULL, NULL, "Kolmogorov-Smirnov D statistic", NULL, SDDS_DOUBLE, 0)) ||
434 0 > (iSigLevel = SDDS_DefineColumn(&SDDSout, "distestSigLevel", "P(D$ba$n>D)", NULL, "Probability of exceeding D", NULL, SDDS_DOUBLE, 0))) {
435 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
436 }
437 break;
438 case CHI_TEST:
439 if (0 > (iColumnName = SDDS_DefineColumn(&SDDSout, "ColumnName", NULL, NULL, "Column analysed by sddsdistest",
440 NULL, SDDS_STRING, 0)) ||
441 0 > (iStat = SDDS_DefineParameter(&SDDSout, "ChiSquared", "$gh$r$a2$n", NULL, "Chi-squared statistic", NULL,
442 SDDS_DOUBLE, 0)) ||
443 0 > (iSigLevel = SDDS_DefineParameter(&SDDSout, "distestSigLevel", "P($gh$r$a2$n$ba$n>$gh$r$a2$n)", NULL, "Probability of exceeding $gh$r$a2$n", NULL, SDDS_DOUBLE, 0))) {
444 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
445 }
446 break;
447 default:
448 SDDS_Bomb("Invalid testCode seen in compareToDistribution--this shouldn't happen.");
449 break;
450 }
451 if (!SDDS_WriteLayout(&SDDSout))
452 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
453 while (SDDS_ReadPage(SDDSin) > 0) {
454 if (!SDDS_StartPage(&SDDSout, columnNames))
455 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
456 rows = SDDS_CountRowsOfInterest(SDDSin);
457 for (icol = 0; icol < columnNames; icol++) {
458 stat = 0;
459 sigLevel = 1;
460 if (!SDDS_SetColumn(&SDDSout, SDDS_SET_BY_INDEX, columnName, columnNames, iColumnName) ||
461 !SDDS_SetParameters(&SDDSout, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iCount, rows, -1)) {
462 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
463 }
464 if (rows >= 2) {
465 if (!(data = SDDS_GetColumnInDoubles(SDDSin, columnName[icol])))
466 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
467 if (dofParameter)
468 SDDS_GetParameterAsLong(SDDSin, dofParameter, &DOF);
469 else
470 DOF = degreesFree;
471 switch (distCode) {
472 case CLO_GAUSSIAN:
473 computeMomentsThreaded(&sampleMean, NULL, &sampleStDev, NULL, data, rows, threads);
474 if (testCode == KS_TEST)
475 ksTestWithFunction(data, rows, gaussianCDF, &stat, &sigLevel);
476 else
477 chiSquaredTestWithFunction(data, rows, gaussianPDF, &stat, &sigLevel);
478 break;
479 case CLO_POISSON:
480 computeMomentsThreaded(&sampleMean, NULL, NULL, NULL, data, rows, threads);
481 if (testCode == KS_TEST)
482 ksTestWithFunction(data, rows, poissonCDF, &stat, &sigLevel);
483 else
484 chiSquaredTestWithFunction(data, rows, poissonPDF, &stat, &sigLevel);
485 break;
486 case CLO_STUDENT:
487 if (DOF < 1)
488 SDDS_Bomb("must have at least one degree of freedom for Student distribution tests");
489 computeMomentsThreaded(&sampleMean, NULL, NULL, NULL, data, rows, threads);
490 if (sigmaName && sigmaName[icol]) {
491 if (!(data1 = SDDS_GetColumnInDoubles(SDDSin, sigmaName[icol])))
492 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
493 for (row = 0; row < rows; row++)
494 /* compute t = (x-mu)/sigma */
495 data[row] = (data[row] - sampleMean) / data1[row];
496 free(data1);
497 } else {
498 for (row = 0; row < rows; row++)
499 data[row] -= sampleMean;
500 }
501 if (testCode == KS_TEST)
502 ksTestWithFunction(data, rows, studentCDF, &stat, &sigLevel);
503 else
504 chiSquaredTestWithFunction(data, rows, studentPDF, &stat, &sigLevel);
505 break;
506 case CLO_CHISQUARED:
507 computeMomentsThreaded(&sampleMean, NULL, NULL, NULL, data, rows, threads);
508 if (DOF < 1)
509 SDDS_Bomb("must have at least one degree of freedom for chi-squared distribution tests");
510 if (testCode == KS_TEST)
511 ksTestWithFunction(data, rows, chiSquaredCDF, &stat, &sigLevel);
512 else
513 chiSquaredTestWithFunction(data, rows, chiSquaredPDF, &stat, &sigLevel);
514 break;
515 default:
516 SDDS_Bomb("Invalid distCode in compareToDistribution--this shouldn't happen");
517 break;
518 }
519 }
520 if (!SDDS_SetRowValues(&SDDSout, SDDS_PASS_BY_VALUE | SDDS_SET_BY_INDEX, icol, iStat, stat, iSigLevel, sigLevel, -1))
521 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
522 }
523 if (!SDDS_WritePage(&SDDSout))
524 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
525 }
526 if (!SDDS_Terminate(&SDDSout)) {
527 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
528 exit(EXIT_FAILURE);
529 }
530}
531
532void chiSquaredTestWithFunction(double *data, int64_t rows, double (*PDF)(double x), double *statReturn, double *sigLevelReturn) {
533 SDDS_Bomb("Chi-squared distribution test not implemented yet---wouldn't you really like a nice K-S test instead?");
534}
535
536void ksTestWithFunction(double *data, int64_t rows, double (*CDF)(double x), double *statReturn, double *sigLevelReturn) {
537 double CDF1, CDF2, dCDF1, dCDF2, CDF0, dCDFmaximum;
538 int64_t row;
539
540 qsort((void *)data, rows, sizeof(*data), double_cmpasc);
541 dCDFmaximum = CDF1 = 0;
542 for (row = 0; row < rows; row++) {
543 CDF0 = (*CDF)(data[row]);
544 CDF2 = (row + 1.0) / rows;
545 dCDF1 = CDF0 - CDF1;
546 if (dCDF1 < 0)
547 dCDF1 = -dCDF1;
548 dCDF2 = CDF0 - CDF2;
549 if (dCDF2 < 0)
550 dCDF2 = -dCDF2;
551 CDF1 = CDF2;
552 if (dCDF1 > dCDFmaximum)
553 dCDFmaximum = dCDF1;
554 if (dCDF2 > dCDFmaximum)
555 dCDFmaximum = dCDF2;
556 }
557 *statReturn = dCDFmaximum;
558 *sigLevelReturn = KS_Qfunction(sqrt((double)rows) * dCDFmaximum);
559}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_SetRowValues(SDDS_DATASET *SDDS_dataset, int32_t mode, int64_t row,...)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_SetColumn(SDDS_DATASET *SDDS_dataset, int32_t mode, void *data, int64_t rows,...)
Sets the values for one data column in the current data table of an SDDS dataset.
int32_t * SDDS_GetParameterAsLong(SDDS_DATASET *SDDS_dataset, char *parameter_name, int32_t *memory)
Retrieves the value of a specified parameter as a 32-bit integer from the current data table of a dat...
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
int32_t SDDS_CheckColumn(SDDS_DATASET *SDDS_dataset, char *name, char *units, int32_t type, FILE *fp_message)
Checks if a column exists in the SDDS dataset with the specified name, units, and type.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
int32_t SDDS_StringIsBlank(char *s)
Checks if a string is blank (contains only whitespace characters).
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
int32_t SDDS_CheckParameter(SDDS_DATASET *SDDS_dataset, char *name, char *units, int32_t type, FILE *fp_message)
Checks if a parameter exists in the SDDS dataset with the specified name, units, and type.
int32_t SDDS_CopyString(char **target, const char *source)
Copies a source string to a target string with memory allocation.
Definition SDDS_utils.c:856
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
Definition SDDStypes.h:61
#define SDDS_ANY_NUMERIC_TYPE
Special identifier used by SDDS_Check*() routines to accept any numeric type.
Definition SDDStypes.h:157
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
Utility functions for SDDS dataset manipulation and string array operations.
double betaInc(double a, double b, double x)
Compute the incomplete beta function.
Definition betai.c:67
void bomb(char *error, char *usage)
Reports error messages to the terminal and aborts the program.
Definition bomb.c:26
double gammaQ(double a, double x)
Compute the regularized upper incomplete gamma function Q(a,x).
Definition gammai.c:64
double KS_Qfunction(double lambda)
Compute the Q-function for the Kolmogorov-Smirnov test.
Definition kstests.c:57
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long computeMomentsThreaded(double *mean, double *rms, double *standDev, double *meanAbsoluteDev, double *x, long n, long numThreads)
Computes the mean, RMS, standard deviation, and mean absolute deviation of an array using multiple th...
Definition moments.c:127
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.
long scan_item_list(unsigned long *flags, char **item, long *items,...)
Scans a list of items without flag extension and assigns values based on provided keywords and types.
int double_cmpasc(const void *a, const void *b)
Compare two doubles in ascending order.