SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddskde2d.c File Reference

Detailed Description

Performs kernel density estimation (KDE) for two-dimensional data using the SDDS library.

This program processes input data from an SDDS file, applies kernel density estimation to two selected columns, and outputs the resulting density estimates to a new SDDS file. It supports flexible options such as specifying columns, margins, and whether to use the same scales across data pages.

Usage

sddskde2d [<inputfile>] [<outputfile>]
[-pipe=[input][,output]]
-column=<column1,column2>
[-samescales]
[-margin=<value>]

Options

Required Description
-column Specify two column names for KDE, separated by a comma. Wildcards accepted.
Option Description
-pipe Utilize the standard SDDS Toolkit pipe option.
-samescales Use the same X and Y ranges for all output pages.
-margin Ratio to extend the original data range (default: 0.05).

Features

  • Reads input data from SDDS files and writes results in SDDS format.
  • Allows users to specify the columns for KDE using wildcards.
  • Handles margins and ensures proper scaling of the data.
  • Supports generating grids for density estimation.
  • Implements bandwidth selection and Gaussian kernel functions.
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
Yipeng Sun, M. Borland, R. Soliday

Definition in file sddskde2d.c.

#include <math.h>
#include <gsl/gsl_statistics.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_math.h>
#include "SDDS.h"
#include "mdb.h"
#include "scan.h"
#include "SDDSutils.h"

Go to the source code of this file.

Functions

double bandwidth (double data[], int64_t M)
 
double gaussiankernelfunction (double sample)
 
double kerneldensityestimate (double *trainingdata_x, double *trainingdata_y, double sample_x, double sample_y, int64_t n)
 
double * gridX (double start, double end, int N)
 
double * gridY (double start, double end, int N)
 
int main (int argc, char **argv)
 

Function Documentation

◆ bandwidth()

double bandwidth ( double data[],
int64_t M )

Definition at line 268 of file sddskde2d.c.

268 {
269 double sigma, bwidth, silver_factor;
270 sigma = gsl_stats_sd(data, 1, M);
271 silver_factor = pow(M, -0.16666666);
272 bwidth = pow(silver_factor, 2.0) * pow(sigma, 2.0);
273 return bwidth;
274}

◆ gaussiankernelfunction()

double gaussiankernelfunction ( double sample)

Definition at line 276 of file sddskde2d.c.

276 {
277 double k;
278 k = exp(-sample / 2.0);
279 k = k / (2.0 * M_PI);
280 return k;
281}

◆ gridX()

double * gridX ( double start,
double end,
int N )

Definition at line 300 of file sddskde2d.c.

300 {
301 double *x;
302 int i, j, n_grid;
303 double step;
304
305 n_grid = N * N;
306 x = (double *)calloc(n_grid, sizeof(double));
307 step = (end - start) / (double)(N - 1);
308
309 i = 0;
310 for (j = 0; j < N; j++) {
311 for (i = 0; i < N; i++) {
312 x[i + j * N] = start + i * step;
313 }
314 }
315 return x;
316}

◆ gridY()

double * gridY ( double start,
double end,
int N )

Definition at line 318 of file sddskde2d.c.

318 {
319 double *x;
320 int i, j, n_grid;
321 double step;
322
323 n_grid = N * N;
324 x = (double *)calloc(n_grid, sizeof(double));
325 step = (end - start) / (double)(N - 1);
326
327 i = 0;
328 for (j = 0; j < N; j++) {
329 for (i = 0; i < N; i++) {
330 x[i + j * N] = start + j * step;
331 }
332 }
333 return x;
334}

◆ kerneldensityestimate()

double kerneldensityestimate ( double * trainingdata_x,
double * trainingdata_y,
double sample_x,
double sample_y,
int64_t n )

Definition at line 283 of file sddskde2d.c.

284 {
285 int64_t i;
286 double pdf, hx, hy, z;
287
288 hx = bandwidth(trainingdata_x, n);
289 hy = bandwidth(trainingdata_y, n);
290 pdf = 0.0;
291 for (i = 0; i < n; i++) {
292 z = (trainingdata_x[i] - sample_x) * (trainingdata_x[i] - sample_x) / hx;
293 z += (trainingdata_y[i] - sample_y) * (trainingdata_y[i] - sample_y) / hy;
294 pdf += gaussiankernelfunction(z);
295 }
296 pdf = pdf / (n * sqrt(hx) * sqrt(hy));
297 return pdf;
298}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 97 of file sddskde2d.c.

97 {
98 int n_test, n_total, same_scales;
99 double lowerx, upperx;
100 double lowery, uppery;
101 double margin = 0.05;
102 SDDS_DATASET SDDSin, SDDSout;
103 char *input_file, *output_file, **column;
104 long tmpfile_used = 0, columns, no_warnings = 1;
105 unsigned long pipe_flags;
106 long i, i_arg, j;
107 SCANNED_ARG *s_arg;
108 double **column_data_x, **column_data_y, *pdf;
109 int64_t *rows, rows0;
110 int32_t n_pages, i_page;
111
112 input_file = output_file = NULL;
113 pdf = NULL;
114 columns = 0;
115 column = NULL;
116 same_scales = 0;
117
118 n_test = 50;
119 n_total = n_test * n_test;
120
122 argc = scanargs(&s_arg, argc, argv);
123 pipe_flags = 0;
124 tmpfile_used = 0;
125 if (argc < 2) {
126 fprintf(stderr, "%s", usage);
127 exit(1);
128 }
129 for (i_arg = 1; i_arg < argc; i_arg++) {
130 if (s_arg[i_arg].arg_type == OPTION) {
131 delete_chars(s_arg[i_arg].list[0], "_");
132 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
133 case SET_COLUMN:
134 columns = s_arg[i_arg].n_items - 1;
135 column = trealloc(column, sizeof(*column) * columns);
136 for (i = 1; i < s_arg[i_arg].n_items; i++)
137 column[i - 1] = s_arg[i_arg].list[i];
138 break;
139 case SET_MARGIN:
140 if (s_arg[i_arg].n_items != 2)
141 SDDS_Bomb("Invalid -margin option. Too many qualifiers.");
142 if (!get_double(&margin, s_arg[i_arg].list[1]))
143 SDDS_Bomb("Invalid -margin value provided.");
144 break;
145 case SET_SAME_SCALES:
146 if (s_arg[i_arg].n_items != 1)
147 SDDS_Bomb("Invalid -sameScales option. No qualifiers are accepted.");
148 same_scales = 1;
149 break;
150 case SET_PIPE:
151 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
152 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
153 return 1;
154 }
155 break;
156 }
157 } else {
158 if (!input_file)
159 input_file = s_arg[i_arg].list[0];
160 else if (output_file == NULL)
161 output_file = s_arg[i_arg].list[0];
162 else {
163 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
164 return 1;
165 }
166 }
167 }
168 processFilenames("sddskde2d", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
169 if (!columns) {
170 fprintf(stderr, "%s", usage);
171 SDDS_Bomb("No column provided!");
172 }
173 if (!SDDS_InitializeInput(&SDDSin, input_file)) {
174 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
175 return 1;
176 }
177 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
178 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
179 SDDS_Bomb("no columns selected.");
180 }
181 if (columns > 2) {
182 fprintf(stderr, "%s", usage);
183 SDDS_Bomb("Only 2 columns may be accepted.");
184 }
185 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 1, NULL, NULL, output_file))
186 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
187 if (!SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, column[0], NULL) ||
188 !SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, column[1], NULL) ||
189 SDDS_DefineColumn(&SDDSout, "PDF", NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0)
190 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
191 if (!SDDS_WriteLayout(&SDDSout))
192 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
193
194 /* Read all the data in case we need to use fixed scales */
195 n_pages = 0;
196 column_data_x = NULL;
197 column_data_y = NULL;
198 rows = NULL;
199 while (SDDS_ReadPage(&SDDSin) >= 0) {
200 if ((rows0 = SDDS_CountRowsOfInterest(&SDDSin))) {
201 column_data_x = (double **)SDDS_Realloc(column_data_x, sizeof(*column_data_x) * (n_pages + 1));
202 column_data_y = (double **)SDDS_Realloc(column_data_y, sizeof(*column_data_y) * (n_pages + 1));
203 rows = (int64_t *)SDDS_Realloc(rows, sizeof(*rows) * (n_pages + 1));
204 rows[n_pages] = rows0;
205 if (!(column_data_x[n_pages] = SDDS_GetColumnInDoubles(&SDDSin, column[0])))
206 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
207 if (!(column_data_y[n_pages] = SDDS_GetColumnInDoubles(&SDDSin, column[1])))
208 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
209 n_pages++;
210 }
211 }
212 if (n_pages == 0)
213 SDDS_Bomb("No data in file");
214
215 if (same_scales) {
216 double llx, lly, uux, uuy;
217 lowerx = lowery = DBL_MAX;
218 upperx = uppery = -DBL_MAX;
219 for (i_page = 0; i_page < n_pages; i_page++) {
220 find_min_max(&llx, &uux, column_data_x[i_page], rows[i_page]);
221 lowerx = MIN(llx, lowerx);
222 upperx = MAX(uux, upperx);
223 find_min_max(&lly, &uuy, column_data_y[i_page], rows[i_page]);
224 lowery = MIN(lly, lowery);
225 uppery = MAX(uuy, uppery);
226 }
227 }
228
229 pdf = malloc(sizeof(*pdf) * n_total);
230 for (i_page = 0; i_page < n_pages; i_page++) {
231 if (!SDDS_StartPage(&SDDSout, n_total))
232 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
233 if (!same_scales) {
234 find_min_max(&lowerx, &upperx, column_data_x[i_page], rows[i_page]);
235 find_min_max(&lowery, &uppery, column_data_y[i_page], rows[i_page]);
236 }
237 double *x_array = gridX(lowerx, upperx, n_test);
238 double *y_array = gridY(lowery, uppery, n_test);
239 for (j = 0; j < n_total; j++) {
240 pdf[j] = kerneldensityestimate(column_data_x[i_page], column_data_y[i_page], x_array[j], y_array[j], rows[i_page]);
241 }
242 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, x_array, n_total, column[0]))
243 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
244 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, y_array, n_total, column[1]))
245 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
246 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, pdf, n_total, "PDF"))
247 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
248 free(column_data_x[i_page]);
249 free(column_data_y[i_page]);
250 free(x_array);
251 free(y_array);
252 if (!SDDS_WritePage(&SDDSout))
253 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
254 }
255 free(pdf);
256 free(column_data_x);
257 free(column_data_y);
258 free(rows);
259 if (!SDDS_Terminate(&SDDSin) || !SDDS_Terminate(&SDDSout))
260 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
261 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
262 return 1;
263 }
264 free(column);
265 return 0;
266}
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * trealloc(void *old_ptr, uint64_t size_of_block)
Reallocates a memory block to a new size.
Definition array.c:181
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390