SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddskde.c File Reference

Detailed Description

Kernel Density Estimation for SDDS Data.

This program performs Kernel Density Estimation (KDE) for one-dimensional data read from an SDDS file. It computes probability density functions (PDFs) and cumulative distribution functions (CDFs) for the specified columns and writes the results to an SDDS output file. It uses Gaussian kernel functions for KDE calculations and supports options for selecting columns and customizing the margin for data extension.

Usage

sddskde [<inputfile>] [<outputfile>]
[-pipe=[input][,output]]
-column=<list of columns>
[-margin=<value>]

Options

Required Description
-column Specifies the columns for KDE calculations. Comma-separated and supports wildcards.
Optional Description
-pipe Enable SDDS Toolkit piping for input/output.
-margin Set the margin as a ratio to extend data range (default 0.3).
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Authors
Yipeng Sun, H. Shang, M. Borland, R. Soliday

Definition in file sddskde.c.

#include <math.h>
#include <gsl/gsl_statistics.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_math.h>
#include "SDDS.h"
#include "mdb.h"
#include "scan.h"
#include "SDDSutils.h"

Go to the source code of this file.

Functions

double bandwidth (double data[], int M)
 
double gaussiankernelfunction (double sample)
 
double kerneldensityestimate (double *trainingdata, double sample, int64_t n)
 
double * linearspace (double initial, double final, int N)
 
int main (int argc, char **argv)
 

Function Documentation

◆ bandwidth()

double bandwidth ( double data[],
int M )

Definition at line 268 of file sddskde.c.

268 {
269 double sigma, min_val, bwidth, interquartile_range;
270 double hspread_three, hspread_one;
271
272 gsl_sort(data, 1, M);
273 sigma = gsl_stats_sd(data, 1, M);
274 hspread_three = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.750);
275 hspread_one = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.250);
276 interquartile_range = hspread_three - hspread_one;
277 min_val = GSL_MIN(interquartile_range / 1.339999, sigma);
278 bwidth = 0.90 * min_val * pow(M, -0.20);
279
280 return bwidth;
281}

◆ gaussiankernelfunction()

double gaussiankernelfunction ( double sample)

Definition at line 283 of file sddskde.c.

283 {
284 double k;
285 k = exp(-(gsl_pow_2(sample) / 2.0));
286 k = k / (M_SQRT2 * sqrt(M_PI));
287
288 return k;
289}

◆ kerneldensityestimate()

double kerneldensityestimate ( double * trainingdata,
double sample,
int64_t n )

Definition at line 291 of file sddskde.c.

291 {
292 int64_t i;
293 double pdf = 0.0, h, bwidth;
294
295 bwidth = bandwidth(trainingdata, n);
296 h = GSL_MAX(bwidth, 2e-6);
297
298 for (i = 0; i < n; i++) {
299 pdf += gaussiankernelfunction((trainingdata[i] - sample) / h);
300 }
301
302 pdf = pdf / (n * h);
303
304 return pdf;
305}

◆ linearspace()

double * linearspace ( double initial,
double final,
int N )

Definition at line 307 of file sddskde.c.

307 {
308 double *x;
309 int i;
310 double step;
311
312 x = calloc(N, sizeof(double));
313 step = (end - start) / (double)(N - 1);
314
315 x[0] = start;
316 for (i = 1; i < N; i++) {
317 x[i] = x[i - 1] + step;
318 }
319 x[N - 1] = end;
320
321 return x;
322}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 82 of file sddskde.c.

82 {
83 int n_test = 100;
84 double min_temp, max_temp;
85 double gap, lower, upper;
86 double margin = 0.3;
87 SDDS_DATASET sdds_in, sdds_out;
88 char *input_file = NULL, *output_file = NULL, **column = NULL;
89 long tmpfile_used = 0, columns = 0, no_warnings = 1;
90 unsigned long pipe_flags;
91 long i, i_arg, j;
92 SCANNED_ARG *s_arg;
93 double *column_data = NULL, *pdf = NULL, *cdf = NULL;
94 char buffer_pdf[1024], buffer_cdf[1024], buffer_pdf_units[1024];
95 int64_t rows;
96
98 argc = scanargs(&s_arg, argc, argv);
99 pipe_flags = 0;
100
101 if (argc < 2) {
102 fprintf(stderr, "%s", usage);
103 exit(EXIT_FAILURE);
104 }
105
106 for (i_arg = 1; i_arg < argc; i_arg++) {
107 if (s_arg[i_arg].arg_type == OPTION) {
108 delete_chars(s_arg[i_arg].list[0], "_");
109 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
110 case SET_COLUMN:
111 columns = s_arg[i_arg].n_items - 1;
112 column = realloc(column, sizeof(*column) * columns);
113 for (i = 1; i < s_arg[i_arg].n_items; i++) {
114 column[i - 1] = s_arg[i_arg].list[i];
115 }
116 break;
117 case SET_MARGIN:
118 if (s_arg[i_arg].n_items != 2) {
119 SDDS_Bomb("Invalid -margin option!");
120 }
121 if (!get_double(&margin, s_arg[i_arg].list[1])) {
122 SDDS_Bomb("Invalid -margin value provided!");
123 }
124 break;
125 case SET_PIPE:
126 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
127 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
128 return EXIT_FAILURE;
129 }
130 break;
131 default:
132 fprintf(stderr, "Unknown option: %s\n", s_arg[i_arg].list[0]);
133 exit(EXIT_FAILURE);
134 }
135 } else {
136 if (!input_file) {
137 input_file = s_arg[i_arg].list[0];
138 } else if (!output_file) {
139 output_file = s_arg[i_arg].list[0];
140 } else {
141 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
142 return EXIT_FAILURE;
143 }
144 }
145 }
146
147 processFilenames("sddskde", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
148
149 if (!columns) {
150 fprintf(stderr, "%s", usage);
151 SDDS_Bomb("No column provided!");
152 }
153
154 if (!SDDS_InitializeInput(&sdds_in, input_file)) {
155 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
156 return EXIT_FAILURE;
157 }
158
159 if ((columns = expandColumnPairNames(&sdds_in, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
160 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
161 SDDS_Bomb("No columns selected.");
162 }
163
164 if (!SDDS_InitializeOutput(&sdds_out, SDDS_BINARY, 1, NULL, NULL, output_file)) {
165 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
166 }
167
168 for (i = 0; i < columns; i++) {
169 char *units = NULL;
170 if (SDDS_GetColumnInformation(&sdds_in, "units", &units, SDDS_GET_BY_NAME, column[i]) != SDDS_STRING) {
171 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
172 }
173
174 if (units && strlen(units)) {
175 snprintf(buffer_pdf_units, sizeof(buffer_pdf_units), "1/(%s)", units);
176 } else {
177 buffer_pdf_units[0] = '\0';
178 }
179
180 free(units);
181
182 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
183 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
184
185 if (!SDDS_TransferColumnDefinition(&sdds_out, &sdds_in, column[i], NULL) ||
186 SDDS_DefineColumn(&sdds_out, buffer_pdf, NULL, buffer_pdf_units, NULL, NULL, SDDS_DOUBLE, 0) < 0 ||
187 SDDS_DefineColumn(&sdds_out, buffer_cdf, NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0) {
188 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
189 }
190 }
191
192 if (!SDDS_WriteLayout(&sdds_out)) {
193 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
194 }
195
196 pdf = malloc(sizeof(*pdf) * n_test);
197 cdf = malloc(sizeof(*cdf) * n_test);
198
199 while (SDDS_ReadPage(&sdds_in) >= 0) {
200 if ((rows = SDDS_CountRowsOfInterest(&sdds_in))) {
201 if (!SDDS_StartPage(&sdds_out, n_test)) {
202 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
203 }
204
205 for (i = 0; i < columns; i++) {
206 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
207 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
208
209 if (!(column_data = SDDS_GetColumnInDoubles(&sdds_in, column[i]))) {
210 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
211 }
212
213 find_min_max(&min_temp, &max_temp, column_data, rows);
214 gap = max_temp - min_temp;
215 lower = min_temp - gap * margin;
216 upper = max_temp + gap * margin;
217
218 double *x_array = linearspace(lower, upper, n_test);
219
220 for (j = 0; j < n_test; j++) {
221 pdf[j] = kerneldensityestimate(column_data, x_array[j], rows);
222 cdf[j] = pdf[j];
223 }
224
225 for (j = 1; j < n_test; j++) {
226 cdf[j] += cdf[j - 1];
227 }
228
229 for (j = 0; j < n_test; j++) {
230 cdf[j] = cdf[j] / cdf[n_test - 1];
231 }
232
233 if (!SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, pdf, n_test, buffer_pdf) ||
234 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, cdf, n_test, buffer_cdf) ||
235 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, x_array, n_test, column[i])) {
236 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
237 }
238
239 free(column_data);
240 column_data = NULL;
241
242 free(x_array);
243 x_array = NULL;
244 }
245 }
246
247 if (!SDDS_WritePage(&sdds_out)) {
248 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
249 }
250 }
251
252 free(pdf);
253 free(cdf);
254
255 if (!SDDS_Terminate(&sdds_in) || !SDDS_Terminate(&sdds_out)) {
256 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
257 }
258
259 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
260 return EXIT_FAILURE;
261 }
262
263 free(column);
264
265 return EXIT_SUCCESS;
266}
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_GetColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Retrieves information about a specified column in the SDDS dataset.
Definition SDDS_info.c:41
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390