SDDSlib
Loading...
Searching...
No Matches
sddskde2d.c File Reference

Performs kernel density estimation (KDE) for two-dimensional data using the SDDS library. More...

#include <math.h>
#include <gsl/gsl_statistics.h>
#include <gsl/gsl_sort.h>
#include <gsl/gsl_math.h>
#include "SDDS.h"
#include "mdb.h"
#include "scan.h"
#include "SDDSutils.h"

Go to the source code of this file.

Enumerations

enum  option_type {
  SET_COLUMN , SET_PIPE , SET_MARGIN , SET_SAME_SCALES ,
  N_OPTIONS
}
 

Functions

double bandwidth (double data[], int64_t M)
 
double gaussiankernelfunction (double sample)
 
double kerneldensityestimate (double *trainingdata_x, double *trainingdata_y, double sample_x, double sample_y, int64_t n)
 
double * gridX (double start, double end, int N)
 
double * gridY (double start, double end, int N)
 
int main (int argc, char **argv)
 

Variables

char * option [N_OPTIONS]
 
static char * usage
 

Detailed Description

Performs kernel density estimation (KDE) for two-dimensional data using the SDDS library.

This program processes input data from an SDDS file, applies kernel density estimation to two selected columns, and outputs the resulting density estimates to a new SDDS file. The program supports flexible options such as specifying columns, margins, and whether to use the same scales across data pages.

Features

  • Reads input data from SDDS files and writes results in SDDS format.
  • Allows users to specify the columns for KDE using wildcards.
  • Handles margins and ensures proper scaling of the data.
  • Supports generating grids for density estimation.
  • Implements bandwidth selection and Gaussian kernel functions.

Usage

sddskde2d [<inputfile>] [<outputfile>]
[-column=<column1,column2>]
[-samescales]
[-pipe]
[-margin=<value>]

Options

  • -column=<column1,column2>: Specify two column names for KDE, separated by a comma. Wildcards are accepted.
  • -margin=<value>: Ratio to extend the original data (default: 0.05).
  • -samescales: Use the same X and Y ranges for all output pages.
  • -pipe: Utilize the standard SDDS Toolkit pipe option.

Example

sddskde2d data.sdds output.sdds -column=x,y -margin=0.1 -samescales
License
This file is distributed under the terms of the Software License Agreement found in the file LICENSE included with this distribution.
Author
Yipeng Sun, M. Borland, R. Soliday

Definition in file sddskde2d.c.

Enumeration Type Documentation

◆ option_type

enum option_type

Definition at line 65 of file sddskde2d.c.

65 {
66 SET_COLUMN,
67 SET_PIPE,
68 SET_MARGIN,
69 SET_SAME_SCALES,
70 N_OPTIONS
71};

Function Documentation

◆ bandwidth()

double bandwidth ( double data[],
int64_t M )

Definition at line 266 of file sddskde2d.c.

266 {
267 double sigma, bwidth, silver_factor;
268 sigma = gsl_stats_sd(data, 1, M);
269 silver_factor = pow(M, -0.16666666);
270 bwidth = pow(silver_factor, 2.0) * pow(sigma, 2.0);
271 return bwidth;
272}

◆ gaussiankernelfunction()

double gaussiankernelfunction ( double sample)

Definition at line 274 of file sddskde2d.c.

274 {
275 double k;
276 k = exp(-sample / 2.0);
277 k = k / (2.0 * M_PI);
278 return k;
279}

◆ gridX()

double * gridX ( double start,
double end,
int N )

Definition at line 298 of file sddskde2d.c.

298 {
299 double *x;
300 int i, j, n_grid;
301 double step;
302
303 n_grid = N * N;
304 x = (double *)calloc(n_grid, sizeof(double));
305 step = (end - start) / (double)(N - 1);
306
307 i = 0;
308 for (j = 0; j < N; j++) {
309 for (i = 0; i < N; i++) {
310 x[i + j * N] = start + i * step;
311 }
312 }
313 return x;
314}

◆ gridY()

double * gridY ( double start,
double end,
int N )

Definition at line 316 of file sddskde2d.c.

316 {
317 double *x;
318 int i, j, n_grid;
319 double step;
320
321 n_grid = N * N;
322 x = (double *)calloc(n_grid, sizeof(double));
323 step = (end - start) / (double)(N - 1);
324
325 i = 0;
326 for (j = 0; j < N; j++) {
327 for (i = 0; i < N; i++) {
328 x[i + j * N] = start + j * step;
329 }
330 }
331 return x;
332}

◆ kerneldensityestimate()

double kerneldensityestimate ( double * trainingdata_x,
double * trainingdata_y,
double sample_x,
double sample_y,
int64_t n )

Definition at line 281 of file sddskde2d.c.

282 {
283 int64_t i;
284 double pdf, hx, hy, z;
285
286 hx = bandwidth(trainingdata_x, n);
287 hy = bandwidth(trainingdata_y, n);
288 pdf = 0.0;
289 for (i = 0; i < n; i++) {
290 z = (trainingdata_x[i] - sample_x) * (trainingdata_x[i] - sample_x) / hx;
291 z += (trainingdata_y[i] - sample_y) * (trainingdata_y[i] - sample_y) / hy;
292 pdf += gaussiankernelfunction(z);
293 }
294 pdf = pdf / (n * sqrt(hx) * sqrt(hy));
295 return pdf;
296}

◆ main()

int main ( int argc,
char ** argv )

Definition at line 95 of file sddskde2d.c.

95 {
96 int n_test, n_total, same_scales;
97 double lowerx, upperx;
98 double lowery, uppery;
99 double margin = 0.05;
100 SDDS_DATASET SDDSin, SDDSout;
101 char *input_file, *output_file, **column;
102 long tmpfile_used = 0, columns, no_warnings = 1;
103 unsigned long pipe_flags;
104 long i, i_arg, j;
105 SCANNED_ARG *s_arg;
106 double **column_data_x, **column_data_y, *pdf;
107 int64_t *rows, rows0;
108 int32_t n_pages, i_page;
109
110 input_file = output_file = NULL;
111 pdf = NULL;
112 columns = 0;
113 column = NULL;
114 same_scales = 0;
115
116 n_test = 50;
117 n_total = n_test * n_test;
118
120 argc = scanargs(&s_arg, argc, argv);
121 pipe_flags = 0;
122 tmpfile_used = 0;
123 if (argc < 2) {
124 fprintf(stderr, "%s", usage);
125 exit(1);
126 }
127 for (i_arg = 1; i_arg < argc; i_arg++) {
128 if (s_arg[i_arg].arg_type == OPTION) {
129 delete_chars(s_arg[i_arg].list[0], "_");
130 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
131 case SET_COLUMN:
132 columns = s_arg[i_arg].n_items - 1;
133 column = trealloc(column, sizeof(*column) * columns);
134 for (i = 1; i < s_arg[i_arg].n_items; i++)
135 column[i - 1] = s_arg[i_arg].list[i];
136 break;
137 case SET_MARGIN:
138 if (s_arg[i_arg].n_items != 2)
139 SDDS_Bomb("Invalid -margin option. Too many qualifiers.");
140 if (!get_double(&margin, s_arg[i_arg].list[1]))
141 SDDS_Bomb("Invalid -margin value provided.");
142 break;
143 case SET_SAME_SCALES:
144 if (s_arg[i_arg].n_items != 1)
145 SDDS_Bomb("Invalid -sameScales option. No qualifiers are accepted.");
146 same_scales = 1;
147 break;
148 case SET_PIPE:
149 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
150 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
151 return 1;
152 }
153 break;
154 }
155 } else {
156 if (!input_file)
157 input_file = s_arg[i_arg].list[0];
158 else if (output_file == NULL)
159 output_file = s_arg[i_arg].list[0];
160 else {
161 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
162 return 1;
163 }
164 }
165 }
166 processFilenames("sddskde2d", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
167 if (!columns) {
168 fprintf(stderr, "%s", usage);
169 SDDS_Bomb("No column provided!");
170 }
171 if (!SDDS_InitializeInput(&SDDSin, input_file)) {
172 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
173 return 1;
174 }
175 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
176 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
177 SDDS_Bomb("no columns selected.");
178 }
179 if (columns > 2) {
180 fprintf(stderr, "%s", usage);
181 SDDS_Bomb("Only 2 columns may be accepted.");
182 }
183 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 1, NULL, NULL, output_file))
184 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
185 if (!SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, column[0], NULL) ||
186 !SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, column[1], NULL) ||
187 SDDS_DefineColumn(&SDDSout, "PDF", NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0)
188 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
189 if (!SDDS_WriteLayout(&SDDSout))
190 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
191
192 /* Read all the data in case we need to use fixed scales */
193 n_pages = 0;
194 column_data_x = NULL;
195 column_data_y = NULL;
196 rows = NULL;
197 while (SDDS_ReadPage(&SDDSin) >= 0) {
198 if ((rows0 = SDDS_CountRowsOfInterest(&SDDSin))) {
199 column_data_x = (double **)SDDS_Realloc(column_data_x, sizeof(*column_data_x) * (n_pages + 1));
200 column_data_y = (double **)SDDS_Realloc(column_data_y, sizeof(*column_data_y) * (n_pages + 1));
201 rows = (int64_t *)SDDS_Realloc(rows, sizeof(*rows) * (n_pages + 1));
202 rows[n_pages] = rows0;
203 if (!(column_data_x[n_pages] = SDDS_GetColumnInDoubles(&SDDSin, column[0])))
204 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
205 if (!(column_data_y[n_pages] = SDDS_GetColumnInDoubles(&SDDSin, column[1])))
206 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
207 n_pages++;
208 }
209 }
210 if (n_pages == 0)
211 SDDS_Bomb("No data in file");
212
213 if (same_scales) {
214 double llx, lly, uux, uuy;
215 lowerx = lowery = DBL_MAX;
216 upperx = uppery = -DBL_MAX;
217 for (i_page = 0; i_page < n_pages; i_page++) {
218 find_min_max(&llx, &uux, column_data_x[i_page], rows[i_page]);
219 lowerx = MIN(llx, lowerx);
220 upperx = MAX(uux, upperx);
221 find_min_max(&lly, &uuy, column_data_y[i_page], rows[i_page]);
222 lowery = MIN(lly, lowery);
223 uppery = MAX(uuy, uppery);
224 }
225 }
226
227 pdf = malloc(sizeof(*pdf) * n_total);
228 for (i_page = 0; i_page < n_pages; i_page++) {
229 if (!SDDS_StartPage(&SDDSout, n_total))
230 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
231 if (!same_scales) {
232 find_min_max(&lowerx, &upperx, column_data_x[i_page], rows[i_page]);
233 find_min_max(&lowery, &uppery, column_data_y[i_page], rows[i_page]);
234 }
235 double *x_array = gridX(lowerx, upperx, n_test);
236 double *y_array = gridY(lowery, uppery, n_test);
237 for (j = 0; j < n_total; j++) {
238 pdf[j] = kerneldensityestimate(column_data_x[i_page], column_data_y[i_page], x_array[j], y_array[j], rows[i_page]);
239 }
240 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, x_array, n_total, column[0]))
241 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
242 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, y_array, n_total, column[1]))
243 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
244 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, pdf, n_total, "PDF"))
245 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
246 free(column_data_x[i_page]);
247 free(column_data_y[i_page]);
248 free(x_array);
249 free(y_array);
250 if (!SDDS_WritePage(&SDDSout))
251 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
252 }
253 free(pdf);
254 free(column_data_x);
255 free(column_data_y);
256 free(rows);
257 if (!SDDS_Terminate(&SDDSin) || !SDDS_Terminate(&SDDSout))
258 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
259 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
260 return 1;
261 }
262 free(column);
263 return 0;
264}
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
void * trealloc(void *old_ptr, uint64_t size_of_block)
Reallocates a memory block to a new size.
Definition array.c:181
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390

Variable Documentation

◆ option

char* option[N_OPTIONS]
Initial value:
= {
"column",
"pipe",
"margin",
"samescales"}

Definition at line 73 of file sddskde2d.c.

73 {
74 "column",
75 "pipe",
76 "margin",
77 "samescales"};

◆ usage

char* usage
static
Initial value:
=
"Usage: sddskde2d [<inputfile>] [<outputfile>] \n"
" [-column=<column1,column2>] \n"
" [-samescales] \n"
" [-pipe] \n"
" [-margin=<value>]\n\n"
"Options:\n"
" -column Specify two column names separated by a comma. Wildcards are accepted.\n"
" -margin Ratio to extend the original data (default: 0.05).\n"
" -samescales Use the same X and Y ranges for all output pages.\n"
" -pipe Utilize the standard SDDS Toolkit pipe option.\n\n"
"Description:\n"
" sddskde2d performs kernel density estimation for two-dimensional data.\n\n"
"Author:\n"
" Yipeng Sun (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n"

Definition at line 79 of file sddskde2d.c.