SDDSlib
Loading...
Searching...
No Matches
sddskde.c File Reference

Kernel Density Estimation for SDDS Data. More...

#include <math.h>
#include <gsl/gsl_statistics.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_math.h>
#include "SDDS.h"
#include "mdb.h"
#include "scan.h"
#include "SDDSutils.h"

Go to the source code of this file.

Enumerations

enum  option_type { SET_COLUMN , SET_PIPE , SET_MARGIN , N_OPTIONS }
 

Functions

double bandwidth (double data[], int M)
 
double gaussiankernelfunction (double sample)
 
double kerneldensityestimate (double *trainingdata, double sample, int64_t n)
 
double * linearspace (double initial, double final, int N)
 
int main (int argc, char **argv)
 

Variables

static char * option [N_OPTIONS]
 
static const char * usage
 

Detailed Description

Kernel Density Estimation for SDDS Data.

This program performs Kernel Density Estimation (KDE) for one-dimensional data read from an SDDS file and outputs the resulting probability density functions (PDFs) and cumulative distribution functions (CDFs) into an output SDDS file.

Key Features:

  • Supports customizable input and output file specifications.
  • Provides options for defining column data, margin values, and piping input/output.
  • Implements KDE using Gaussian kernel functions.
  • Computes and outputs both PDF and CDF for specified columns.

Usage:

sddskde input.sdds output.sdds -col=<columns> [-margin=<value>] [-pipe]
  • -col=<columns>: Specify column names (comma-separated, wildcards accepted).
  • -margin=<value>: Extend the data range by a specified ratio (default: 0.3).
  • -pipe: Enable SDDS Toolkit pipe options.

Example:

sddskde input.sdds output.sdds -col=Column1,Column2 -margin=0.2
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
Yipeng Sun, H. Shang, M. Borland, R. Soliday

Definition in file sddskde.c.

Enumeration Type Documentation

◆ option_type

enum option_type

Definition at line 54 of file sddskde.c.

54 {
55 SET_COLUMN,
56 SET_PIPE,
57 SET_MARGIN,
58 N_OPTIONS
59};

Function Documentation

◆ bandwidth()

double bandwidth ( double data[],
int M )

Definition at line 261 of file sddskde.c.

261 {
262 double sigma, min_val, bwidth, interquartile_range;
263 double hspread_three, hspread_one;
264
265 gsl_sort(data, 1, M);
266 sigma = gsl_stats_sd(data, 1, M);
267 hspread_three = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.750);
268 hspread_one = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.250);
269 interquartile_range = hspread_three - hspread_one;
270 min_val = GSL_MIN(interquartile_range / 1.339999, sigma);
271 bwidth = 0.90 * min_val * pow(M, -0.20);
272
273 return bwidth;
274}

◆ gaussiankernelfunction()

double gaussiankernelfunction ( double sample)

Definition at line 276 of file sddskde.c.

276 {
277 double k;
278 k = exp(-(gsl_pow_2(sample) / 2.0));
279 k = k / (M_SQRT2 * sqrt(M_PI));
280
281 return k;
282}

◆ kerneldensityestimate()

double kerneldensityestimate ( double * trainingdata,
double sample,
int64_t n )

Definition at line 284 of file sddskde.c.

284 {
285 int64_t i;
286 double pdf = 0.0, h, bwidth;
287
288 bwidth = bandwidth(trainingdata, n);
289 h = GSL_MAX(bwidth, 2e-6);
290
291 for (i = 0; i < n; i++) {
292 pdf += gaussiankernelfunction((trainingdata[i] - sample) / h);
293 }
294
295 pdf = pdf / (n * h);
296
297 return pdf;
298}

◆ linearspace()

double * linearspace ( double initial,
double final,
int N )

Definition at line 300 of file sddskde.c.

300 {
301 double *x;
302 int i;
303 double step;
304
305 x = calloc(N, sizeof(double));
306 step = (end - start) / (double)(N - 1);
307
308 x[0] = start;
309 for (i = 1; i < N; i++) {
310 x[i] = x[i - 1] + step;
311 }
312 x[N - 1] = end;
313
314 return x;
315}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 75 of file sddskde.c.

75 {
76 int n_test = 100;
77 double min_temp, max_temp;
78 double gap, lower, upper;
79 double margin = 0.3;
80 SDDS_DATASET sdds_in, sdds_out;
81 char *input_file = NULL, *output_file = NULL, **column = NULL;
82 long tmpfile_used = 0, columns = 0, no_warnings = 1;
83 unsigned long pipe_flags;
84 long i, i_arg, j;
85 SCANNED_ARG *s_arg;
86 double *column_data = NULL, *pdf = NULL, *cdf = NULL;
87 char buffer_pdf[1024], buffer_cdf[1024], buffer_pdf_units[1024];
88 int64_t rows;
89
91 argc = scanargs(&s_arg, argc, argv);
92 pipe_flags = 0;
93
94 if (argc < 2) {
95 fprintf(stderr, "%s", usage);
96 exit(EXIT_FAILURE);
97 }
98
99 for (i_arg = 1; i_arg < argc; i_arg++) {
100 if (s_arg[i_arg].arg_type == OPTION) {
101 delete_chars(s_arg[i_arg].list[0], "_");
102 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
103 case SET_COLUMN:
104 columns = s_arg[i_arg].n_items - 1;
105 column = realloc(column, sizeof(*column) * columns);
106 for (i = 1; i < s_arg[i_arg].n_items; i++) {
107 column[i - 1] = s_arg[i_arg].list[i];
108 }
109 break;
110 case SET_MARGIN:
111 if (s_arg[i_arg].n_items != 2) {
112 SDDS_Bomb("Invalid -margin option!");
113 }
114 if (!get_double(&margin, s_arg[i_arg].list[1])) {
115 SDDS_Bomb("Invalid -margin value provided!");
116 }
117 break;
118 case SET_PIPE:
119 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
120 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
121 return EXIT_FAILURE;
122 }
123 break;
124 default:
125 fprintf(stderr, "Unknown option: %s\n", s_arg[i_arg].list[0]);
126 exit(EXIT_FAILURE);
127 }
128 } else {
129 if (!input_file) {
130 input_file = s_arg[i_arg].list[0];
131 } else if (!output_file) {
132 output_file = s_arg[i_arg].list[0];
133 } else {
134 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
135 return EXIT_FAILURE;
136 }
137 }
138 }
139
140 processFilenames("sddskde", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
141
142 if (!columns) {
143 fprintf(stderr, "%s", usage);
144 SDDS_Bomb("No column provided!");
145 }
146
147 if (!SDDS_InitializeInput(&sdds_in, input_file)) {
148 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
149 return EXIT_FAILURE;
150 }
151
152 if ((columns = expandColumnPairNames(&sdds_in, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
153 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
154 SDDS_Bomb("No columns selected.");
155 }
156
157 if (!SDDS_InitializeOutput(&sdds_out, SDDS_BINARY, 1, NULL, NULL, output_file)) {
158 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
159 }
160
161 for (i = 0; i < columns; i++) {
162 char *units = NULL;
163 if (SDDS_GetColumnInformation(&sdds_in, "units", &units, SDDS_GET_BY_NAME, column[i]) != SDDS_STRING) {
164 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
165 }
166
167 if (units && strlen(units)) {
168 snprintf(buffer_pdf_units, sizeof(buffer_pdf_units), "1/(%s)", units);
169 } else {
170 buffer_pdf_units[0] = '\0';
171 }
172
173 free(units);
174
175 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
176 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
177
178 if (!SDDS_TransferColumnDefinition(&sdds_out, &sdds_in, column[i], NULL) ||
179 SDDS_DefineColumn(&sdds_out, buffer_pdf, NULL, buffer_pdf_units, NULL, NULL, SDDS_DOUBLE, 0) < 0 ||
180 SDDS_DefineColumn(&sdds_out, buffer_cdf, NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0) {
181 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
182 }
183 }
184
185 if (!SDDS_WriteLayout(&sdds_out)) {
186 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
187 }
188
189 pdf = malloc(sizeof(*pdf) * n_test);
190 cdf = malloc(sizeof(*cdf) * n_test);
191
192 while (SDDS_ReadPage(&sdds_in) >= 0) {
193 if ((rows = SDDS_CountRowsOfInterest(&sdds_in))) {
194 if (!SDDS_StartPage(&sdds_out, n_test)) {
195 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
196 }
197
198 for (i = 0; i < columns; i++) {
199 snprintf(buffer_pdf, sizeof(buffer_pdf), "%sPDF", column[i]);
200 snprintf(buffer_cdf, sizeof(buffer_cdf), "%sCDF", column[i]);
201
202 if (!(column_data = SDDS_GetColumnInDoubles(&sdds_in, column[i]))) {
203 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
204 }
205
206 find_min_max(&min_temp, &max_temp, column_data, rows);
207 gap = max_temp - min_temp;
208 lower = min_temp - gap * margin;
209 upper = max_temp + gap * margin;
210
211 double *x_array = linearspace(lower, upper, n_test);
212
213 for (j = 0; j < n_test; j++) {
214 pdf[j] = kerneldensityestimate(column_data, x_array[j], rows);
215 cdf[j] = pdf[j];
216 }
217
218 for (j = 1; j < n_test; j++) {
219 cdf[j] += cdf[j - 1];
220 }
221
222 for (j = 0; j < n_test; j++) {
223 cdf[j] = cdf[j] / cdf[n_test - 1];
224 }
225
226 if (!SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, pdf, n_test, buffer_pdf) ||
227 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, cdf, n_test, buffer_cdf) ||
228 !SDDS_SetColumnFromDoubles(&sdds_out, SDDS_SET_BY_NAME, x_array, n_test, column[i])) {
229 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
230 }
231
232 free(column_data);
233 column_data = NULL;
234
235 free(x_array);
236 x_array = NULL;
237 }
238 }
239
240 if (!SDDS_WritePage(&sdds_out)) {
241 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
242 }
243 }
244
245 free(pdf);
246 free(cdf);
247
248 if (!SDDS_Terminate(&sdds_in) || !SDDS_Terminate(&sdds_out)) {
249 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
250 }
251
252 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
253 return EXIT_FAILURE;
254 }
255
256 free(column);
257
258 return EXIT_SUCCESS;
259}
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_GetColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Retrieves information about a specified column in the SDDS dataset.
Definition SDDS_info.c:41
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
#define SDDS_STRING
Identifier for the string data type.
Definition SDDStypes.h:85
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390

Variable Documentation

◆ option

char* option[N_OPTIONS]
static
Initial value:
= {
"column",
"pipe",
"margin",
}

Definition at line 61 of file sddskde.c.

61 {
62 "column",
63 "pipe",
64 "margin",
65};

◆ usage

const char* usage
static
Initial value:
=
"sddskde [<inputfile>] [<outputfile>] [-column=<list of columns>] [-pipe] [-margin=<value>]\n"
"-column provide column names separated by commas, wild card accepted.\n"
"-margin provide the ratio to extend the original data, default 0.3.\n"
"-pipe The standard SDDS Toolkit pipe option.\n\n"
"sddskde performs kernel density estimation for one-dimensional data.\n"
"Program by Yipeng Sun and Hairong Shang (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ").\n"

Definition at line 67 of file sddskde.c.