SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddskde.c
Go to the documentation of this file.
1/**
2 * @file sddskde.c
3 * @brief Kernel Density Estimation for SDDS Data.
4 *
5 * @details
6 * This program performs Kernel Density Estimation (KDE) for one-dimensional data
7 * read from an SDDS file. It computes probability density functions (PDFs) and cumulative
8 * distribution functions (CDFs) for the specified columns and writes the results to
9 * an SDDS output file. It uses Gaussian kernel functions for KDE calculations and supports
10 * options for selecting columns and customizing the margin for data extension.
11 *
12 * @section Usage
13 * ```
14 * sddskde [<inputfile>] [<outputfile>]
15 * [-pipe=[input][,output]]
16 * -column=<list of columns>
17 * [-margin=<value>]
18 * ```
19 *
20 * @section Options
21 * | Required | Description |
22 * |---------------------|---------------------------------------------------------------------------------------|
23 * | `-column` | Specifies the columns for KDE calculations. Comma-separated and supports wildcards. |
24 *
25 * | Optional | Description |
26 * |---------------------|-----------------------------------------------------------------|
27 * | `-pipe` | Enable SDDS Toolkit piping for input/output. |
28 * | `-margin` | Set the margin as a ratio to extend data range (default 0.3). |
29 *
30 * @copyright
31 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
32 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
33 *
34 * @license
35 * This file is distributed under the terms of the Software License Agreement
36 * found in the file LICENSE included with this distribution.
37 *
38 * @authors
39 * Yipeng Sun, H. Shang, M. Borland, R. Soliday
40 */
41
42#include <math.h>
43#include <gsl/gsl_statistics.h>
44#include <gsl/gsl_sort.h>
45#include <gsl/gsl_math.h>
46#include "SDDS.h"
47#include "mdb.h"
48#include "scan.h"
49#include "SDDSutils.h"
50
51double bandwidth(double data[], int M);
52double gaussiankernelfunction(double sample);
53double kerneldensityestimate(double *trainingdata, double sample, int64_t n);
54double *linearspace(double initial, double final, int N);
55
56/* Enumeration for option types */
57enum option_type {
58 SET_COLUMN,
59 SET_PIPE,
60 SET_MARGIN,
61 N_OPTIONS
62};
63
64static char *option[N_OPTIONS] = {
65 "column",
66 "pipe",
67 "margin",
68};
69
70static const char *usage =
71 "sddskde [<inputfile>] [<outputfile>]\n"
72 " [-pipe=[input][,output]]\n"
73 " -column=<list of columns>\n"
74 " [-margin=<value>]\n"
75 "Options:\n"
76 "-column provide column names separated by commas, wild card accepted.\n"
77 "-margin provide the ratio to extend the original data, default 0.3.\n"
78 "-pipe The standard SDDS Toolkit pipe option.\n\n"
79 "sddskde performs kernel density estimation for one-dimensional data.\n"
80 "Program by Yipeng Sun and Hairong Shang (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ").\n";
81
82int main(int argc, char **argv) {
83 int n_test = 100;
84 double min_temp, max_temp;
85 double gap, lower, upper;
86 double margin = 0.3;
87 SDDS_DATASET sdds_in, sdds_out;
88 char *input_file = NULL, *output_file = NULL, **column = NULL;
89 long tmpfile_used = 0, columns = 0, no_warnings = 1;
90 unsigned long pipe_flags;
91 long i, i_arg, j;
92 SCANNED_ARG *s_arg;
93 double *column_data = NULL, *pdf = NULL, *cdf = NULL;
94 char buffer_pdf[1024], buffer_cdf[1024], buffer_pdf_units[1024];
95 int64_t rows;
96
98 argc = scanargs(&s_arg, argc, argv);
99 pipe_flags = 0;
100
101 if (argc < 2) {
102 fprintf(stderr, "%s", usage);
103 exit(EXIT_FAILURE);
104 }
105
106 for (i_arg = 1; i_arg < argc; i_arg++) {
107 if (s_arg[i_arg].arg_type == OPTION) {
108 delete_chars(s_arg[i_arg].list[0], "_");
109 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
110 case SET_COLUMN:
111 columns = s_arg[i_arg].n_items - 1;
112 column = realloc(column, sizeof(*column) * columns);
113 for (i = 1; i < s_arg[i_arg].n_items; i++) {
114 column[i - 1] = s_arg[i_arg].list[i];
115 }
116 break;
117 case SET_MARGIN:
118 if (s_arg[i_arg].n_items != 2) {
119 SDDS_Bomb("Invalid -margin option!");
120 }
121 if (!get_double(&margin, s_arg[i_arg].list[1])) {
122 SDDS_Bomb("Invalid -margin value provided!");
123 }
124 break;
125 case SET_PIPE:
126 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
127 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
128 return EXIT_FAILURE;
129 }
130 break;
131 default:
132 fprintf(stderr, "Unknown option: %s\n", s_arg[i_arg].list[0]);
133 exit(EXIT_FAILURE);
134 }
135 } else {
136 if (!input_file) {
137 input_file = s_arg[i_arg].list[0];
138 } else if (!output_file) {
139 output_file = s_arg[i_arg].list[0];
140 } else {
141 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
142 return EXIT_FAILURE;
143 }
144 }
145 }
146
147 processFilenames("sddskde", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
148
149 if (!columns) {
150 fprintf(stderr, "%s", usage);
151 SDDS_Bomb("No column provided!");
152 }
153
154 if (!SDDS_InitializeInput(&sdds_in, input_file)) {
155 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
156 return EXIT_FAILURE;
157 }
158
159 if ((columns = expandColumnPairNames(&sdds_in, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
160 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
161 SDDS_Bomb("No columns selected.");
162 }
163
164 if (!SDDS_InitializeOutput(&sdds_out, SDDS_BINARY, 1, NULL, NULL, output_file)) {
165 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
166 }
167
168 for (i = 0; i < columns; i++) {
169 char *units = NULL;
170 if (SDDS_GetColumnInformation(&sdds_in, "units", &units, SDDS_GET_BY_NAME, column[i]) != SDDS_STRING) {
171 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
172 }
173
174 if (units && strlen(units)) {
175 snprintf(buffer_pdf_units, sizeof(buffer_pdf_units), "1/(%s)", units);
176 } else {
177 buffer_pdf_units[0] = '\0';
178 }
179
180 free(units);
181
182 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
183 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
184
185 if (!SDDS_TransferColumnDefinition(&sdds_out, &sdds_in, column[i], NULL) ||
186 SDDS_DefineColumn(&sdds_out, buffer_pdf, NULL, buffer_pdf_units, NULL, NULL, SDDS_DOUBLE, 0) < 0 ||
187 SDDS_DefineColumn(&sdds_out, buffer_cdf, NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0) {
188 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
189 }
190 }
191
192 if (!SDDS_WriteLayout(&sdds_out)) {
193 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
194 }
195
196 pdf = malloc(sizeof(*pdf) * n_test);
197 cdf = malloc(sizeof(*cdf) * n_test);
198
199 while (SDDS_ReadPage(&sdds_in) >= 0) {
200 if ((rows = SDDS_CountRowsOfInterest(&sdds_in))) {
201 if (!SDDS_StartPage(&sdds_out, n_test)) {
202 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
203 }
204
205 for (i = 0; i < columns; i++) {
206 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
207 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
208
209 if (!(column_data = SDDS_GetColumnInDoubles(&sdds_in, column[i]))) {
210 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
211 }
212
213 find_min_max(&min_temp, &max_temp, column_data, rows);
214 gap = max_temp - min_temp;
215 lower = min_temp - gap * margin;
216 upper = max_temp + gap * margin;
217
218 double *x_array = linearspace(lower, upper, n_test);
219
220 for (j = 0; j < n_test; j++) {
221 pdf[j] = kerneldensityestimate(column_data, x_array[j], rows);
222 cdf[j] = pdf[j];
223 }
224
225 for (j = 1; j < n_test; j++) {
226 cdf[j] += cdf[j - 1];
227 }
228
229 for (j = 0; j < n_test; j++) {
230 cdf[j] = cdf[j] / cdf[n_test - 1];
231 }
232
233 if (!SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, pdf, n_test, buffer_pdf) ||
234 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, cdf, n_test, buffer_cdf) ||
235 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, x_array, n_test, column[i])) {
236 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
237 }
238
239 free(column_data);
240 column_data = NULL;
241
242 free(x_array);
243 x_array = NULL;
244 }
245 }
246
247 if (!SDDS_WritePage(&sdds_out)) {
248 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
249 }
250 }
251
252 free(pdf);
253 free(cdf);
254
255 if (!SDDS_Terminate(&sdds_in) || !SDDS_Terminate(&sdds_out)) {
256 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
257 }
258
259 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
260 return EXIT_FAILURE;
261 }
262
263 free(column);
264
265 return EXIT_SUCCESS;
266}
267
268double bandwidth(double data[], int M) {
269 double sigma, min_val, bwidth, interquartile_range;
270 double hspread_three, hspread_one;
271
272 gsl_sort(data, 1, M);
273 sigma = gsl_stats_sd(data, 1, M);
274 hspread_three = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.750);
275 hspread_one = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.250);
276 interquartile_range = hspread_three - hspread_one;
277 min_val = GSL_MIN(interquartile_range / 1.339999, sigma);
278 bwidth = 0.90 * min_val * pow(M, -0.20);
279
280 return bwidth;
281}
282
283double gaussiankernelfunction(double sample) {
284 double k;
285 k = exp(-(gsl_pow_2(sample) / 2.0));
286 k = k / (M_SQRT2 * sqrt(M_PI));
287
288 return k;
289}
290
291double kerneldensityestimate(double *trainingdata, double sample, int64_t n) {
292 int64_t i;
293 double pdf = 0.0, h, bwidth;
294
295 bwidth = bandwidth(trainingdata, n);
296 h = GSL_MAX(bwidth, 2e-6);
297
298 for (i = 0; i < n; i++) {
299 pdf += gaussiankernelfunction((trainingdata[i] - sample) / h);
300 }
301
302 pdf = pdf / (n * h);
303
304 return pdf;
305}
306
307double *linearspace(double start, double end, int N) {
308 double *x;
309 int i;
310 double step;
311
312 x = calloc(N, sizeof(double));
313 step = (end - start) / (double)(N - 1);
314
315 x[0] = start;
316 for (i = 1; i < N; i++) {
317 x[i] = x[i - 1] + step;
318 }
319 x[N - 1] = end;
320
321 return x;
322}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_GetColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Retrieves information about a specified column in the SDDS dataset.
Definition SDDS_info.c:41
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
Utility functions for SDDS dataset manipulation and string array operations.
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390