SDDSlib
Loading...
Searching...
No Matches
sddskde.c
Go to the documentation of this file.
1/**
2 * @file sddskde.c
3 * @brief Kernel Density Estimation for SDDS Data.
4 *
5 * This program performs Kernel Density Estimation (KDE) for one-dimensional data
6 * read from an SDDS file and outputs the resulting probability density functions (PDFs)
7 * and cumulative distribution functions (CDFs) into an output SDDS file.
8 *
9 * ### Key Features:
10 * - Supports customizable input and output file specifications.
11 * - Provides options for defining column data, margin values, and piping input/output.
12 * - Implements KDE using Gaussian kernel functions.
13 * - Computes and outputs both PDF and CDF for specified columns.
14 *
15 * ### Usage:
16 * ```
17 * sddskde input.sdds output.sdds -col=<columns> [-margin=<value>] [-pipe]
18 * ```
19 * - `-col=<columns>`: Specify column names (comma-separated, wildcards accepted).
20 * - `-margin=<value>`: Extend the data range by a specified ratio (default: 0.3).
21 * - `-pipe`: Enable SDDS Toolkit pipe options.
22 *
23 * ### Example:
24 * ```
25 * sddskde input.sdds output.sdds -col=Column1,Column2 -margin=0.2
26 * ```
27 *
28 * @copyright
29 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
30 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
31 *
32 * @license
33 * This file is distributed under the terms of the Software License Agreement
34 * found in the file LICENSE included with this distribution.
35 *
36 * @author Yipeng Sun, H. Shang, M. Borland, R. Soliday
37 */
38
39#include <math.h>
40#include <gsl/gsl_statistics.h>
41#include <gsl/gsl_sort.h>
42#include <gsl/gsl_math.h>
43#include "SDDS.h"
44#include "mdb.h"
45#include "scan.h"
46#include "SDDSutils.h"
47
48double bandwidth(double data[], int M);
49double gaussiankernelfunction(double sample);
50double kerneldensityestimate(double *trainingdata, double sample, int64_t n);
51double *linearspace(double initial, double final, int N);
52
53/* Enumeration for option types */
54enum option_type {
55 SET_COLUMN,
56 SET_PIPE,
57 SET_MARGIN,
58 N_OPTIONS
59};
60
61static char *option[N_OPTIONS] = {
62 "column",
63 "pipe",
64 "margin",
65};
66
67static const char *usage =
68 "sddskde [<inputfile>] [<outputfile>] [-column=<list of columns>] [-pipe] [-margin=<value>]\n"
69 "-column provide column names separated by commas, wild card accepted.\n"
70 "-margin provide the ratio to extend the original data, default 0.3.\n"
71 "-pipe The standard SDDS Toolkit pipe option.\n\n"
72 "sddskde performs kernel density estimation for one-dimensional data.\n"
73 "Program by Yipeng Sun and Hairong Shang (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ").\n";
74
75int main(int argc, char **argv) {
76 int n_test = 100;
77 double min_temp, max_temp;
78 double gap, lower, upper;
79 double margin = 0.3;
80 SDDS_DATASET sdds_in, sdds_out;
81 char *input_file = NULL, *output_file = NULL, **column = NULL;
82 long tmpfile_used = 0, columns = 0, no_warnings = 1;
83 unsigned long pipe_flags;
84 long i, i_arg, j;
85 SCANNED_ARG *s_arg;
86 double *column_data = NULL, *pdf = NULL, *cdf = NULL;
87 char buffer_pdf[1024], buffer_cdf[1024], buffer_pdf_units[1024];
88 int64_t rows;
89
91 argc = scanargs(&s_arg, argc, argv);
92 pipe_flags = 0;
93
94 if (argc < 2) {
95 fprintf(stderr, "%s", usage);
96 exit(EXIT_FAILURE);
97 }
98
99 for (i_arg = 1; i_arg < argc; i_arg++) {
100 if (s_arg[i_arg].arg_type == OPTION) {
101 delete_chars(s_arg[i_arg].list[0], "_");
102 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
103 case SET_COLUMN:
104 columns = s_arg[i_arg].n_items - 1;
105 column = realloc(column, sizeof(*column) * columns);
106 for (i = 1; i < s_arg[i_arg].n_items; i++) {
107 column[i - 1] = s_arg[i_arg].list[i];
108 }
109 break;
110 case SET_MARGIN:
111 if (s_arg[i_arg].n_items != 2) {
112 SDDS_Bomb("Invalid -margin option!");
113 }
114 if (!get_double(&margin, s_arg[i_arg].list[1])) {
115 SDDS_Bomb("Invalid -margin value provided!");
116 }
117 break;
118 case SET_PIPE:
119 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
120 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
121 return EXIT_FAILURE;
122 }
123 break;
124 default:
125 fprintf(stderr, "Unknown option: %s\n", s_arg[i_arg].list[0]);
126 exit(EXIT_FAILURE);
127 }
128 } else {
129 if (!input_file) {
130 input_file = s_arg[i_arg].list[0];
131 } else if (!output_file) {
132 output_file = s_arg[i_arg].list[0];
133 } else {
134 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
135 return EXIT_FAILURE;
136 }
137 }
138 }
139
140 processFilenames("sddskde", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
141
142 if (!columns) {
143 fprintf(stderr, "%s", usage);
144 SDDS_Bomb("No column provided!");
145 }
146
147 if (!SDDS_InitializeInput(&sdds_in, input_file)) {
148 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
149 return EXIT_FAILURE;
150 }
151
152 if ((columns = expandColumnPairNames(&sdds_in, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
153 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
154 SDDS_Bomb("No columns selected.");
155 }
156
157 if (!SDDS_InitializeOutput(&sdds_out, SDDS_BINARY, 1, NULL, NULL, output_file)) {
158 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
159 }
160
161 for (i = 0; i < columns; i++) {
162 char *units = NULL;
163 if (SDDS_GetColumnInformation(&sdds_in, "units", &units, SDDS_GET_BY_NAME, column[i]) != SDDS_STRING) {
164 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
165 }
166
167 if (units && strlen(units)) {
168 snprintf(buffer_pdf_units, sizeof(buffer_pdf_units), "1/(%s)", units);
169 } else {
170 buffer_pdf_units[0] = '\0';
171 }
172
173 free(units);
174
175 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
176 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
177
178 if (!SDDS_TransferColumnDefinition(&sdds_out, &sdds_in, column[i], NULL) ||
179 SDDS_DefineColumn(&sdds_out, buffer_pdf, NULL, buffer_pdf_units, NULL, NULL, SDDS_DOUBLE, 0) < 0 ||
180 SDDS_DefineColumn(&sdds_out, buffer_cdf, NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0) {
181 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
182 }
183 }
184
185 if (!SDDS_WriteLayout(&sdds_out)) {
186 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
187 }
188
189 pdf = malloc(sizeof(*pdf) * n_test);
190 cdf = malloc(sizeof(*cdf) * n_test);
191
192 while (SDDS_ReadPage(&sdds_in) >= 0) {
193 if ((rows = SDDS_CountRowsOfInterest(&sdds_in))) {
194 if (!SDDS_StartPage(&sdds_out, n_test)) {
195 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
196 }
197
198 for (i = 0; i < columns; i++) {
199 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
200 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
201
202 if (!(column_data = SDDS_GetColumnInDoubles(&sdds_in, column[i]))) {
203 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
204 }
205
206 find_min_max(&min_temp, &max_temp, column_data, rows);
207 gap = max_temp - min_temp;
208 lower = min_temp - gap * margin;
209 upper = max_temp + gap * margin;
210
211 double *x_array = linearspace(lower, upper, n_test);
212
213 for (j = 0; j < n_test; j++) {
214 pdf[j] = kerneldensityestimate(column_data, x_array[j], rows);
215 cdf[j] = pdf[j];
216 }
217
218 for (j = 1; j < n_test; j++) {
219 cdf[j] += cdf[j - 1];
220 }
221
222 for (j = 0; j < n_test; j++) {
223 cdf[j] = cdf[j] / cdf[n_test - 1];
224 }
225
226 if (!SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, pdf, n_test, buffer_pdf) ||
227 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, cdf, n_test, buffer_cdf) ||
228 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, x_array, n_test, column[i])) {
229 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
230 }
231
232 free(column_data);
233 column_data = NULL;
234
235 free(x_array);
236 x_array = NULL;
237 }
238 }
239
240 if (!SDDS_WritePage(&sdds_out)) {
241 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
242 }
243 }
244
245 free(pdf);
246 free(cdf);
247
248 if (!SDDS_Terminate(&sdds_in) || !SDDS_Terminate(&sdds_out)) {
249 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
250 }
251
252 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
253 return EXIT_FAILURE;
254 }
255
256 free(column);
257
258 return EXIT_SUCCESS;
259}
260
261double bandwidth(double data[], int M) {
262 double sigma, min_val, bwidth, interquartile_range;
263 double hspread_three, hspread_one;
264
265 gsl_sort(data, 1, M);
266 sigma = gsl_stats_sd(data, 1, M);
267 hspread_three = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.750);
268 hspread_one = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.250);
269 interquartile_range = hspread_three - hspread_one;
270 min_val = GSL_MIN(interquartile_range / 1.339999, sigma);
271 bwidth = 0.90 * min_val * pow(M, -0.20);
272
273 return bwidth;
274}
275
276double gaussiankernelfunction(double sample) {
277 double k;
278 k = exp(-(gsl_pow_2(sample) / 2.0));
279 k = k / (M_SQRT2 * sqrt(M_PI));
280
281 return k;
282}
283
284double kerneldensityestimate(double *trainingdata, double sample, int64_t n) {
285 int64_t i;
286 double pdf = 0.0, h, bwidth;
287
288 bwidth = bandwidth(trainingdata, n);
289 h = GSL_MAX(bwidth, 2e-6);
290
291 for (i = 0; i < n; i++) {
292 pdf += gaussiankernelfunction((trainingdata[i] - sample) / h);
293 }
294
295 pdf = pdf / (n * h);
296
297 return pdf;
298}
299
300double *linearspace(double start, double end, int N) {
301 double *x;
302 int i;
303 double step;
304
305 x = calloc(N, sizeof(double));
306 step = (end - start) / (double)(N - 1);
307
308 x[0] = start;
309 for (i = 1; i < N; i++) {
310 x[i] = x[i - 1] + step;
311 }
312 x[N - 1] = end;
313
314 return x;
315}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_GetColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Retrieves information about a specified column in the SDDS dataset.
Definition SDDS_info.c:41
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390