SDDS ToolKit Programs and Libraries for C and Python
All Classes Files Functions Variables Macros Pages
sddskde2d.c
Go to the documentation of this file.
1/**
2 * @file sddskde2d.c
3 * @brief Performs kernel density estimation (KDE) for two-dimensional data using the SDDS library.
4 *
5 * @details
6 * This program processes input data from an SDDS file, applies kernel density estimation
7 * to two selected columns, and outputs the resulting density estimates to a new SDDS file.
8 * It supports flexible options such as specifying columns, margins, and whether
9 * to use the same scales across data pages.
10 *
11 * @section Usage
12 * ```
13 * sddskde2d [<inputfile>] [<outputfile>]
14 * [-pipe=[input][,output]]
15 * -column=<column1,column2>
16 * [-samescales]
17 * [-margin=<value>]
18 * ```
19 *
20 * @section Options
21 * | Required | Description |
22 * |---------------------|---------------------------------------------------------------------------------------|
23 * | `-column` | Specify two column names for KDE, separated by a comma. Wildcards accepted. |
24 *
25 * | Option | Description |
26 * |------------------------|-------------------------------------------------------------------------|
27 * | `-pipe` | Utilize the standard SDDS Toolkit pipe option. |
28 * | `-samescales` | Use the same X and Y ranges for all output pages. |
29 * | `-margin` | Ratio to extend the original data range (default: 0.05). |
30 *
31 * ### Features
32 * - Reads input data from SDDS files and writes results in SDDS format.
33 * - Allows users to specify the columns for KDE using wildcards.
34 * - Handles margins and ensures proper scaling of the data.
35 * - Supports generating grids for density estimation.
36 * - Implements bandwidth selection and Gaussian kernel functions.
37 *
38 * @copyright
39 * - (c) 2002 The University of Chicago, as Operator of Argonne National Laboratory.
40 * - (c) 2002 The Regents of the University of California, as Operator of Los Alamos National Laboratory.
41 *
42 * @license
43 * This file is distributed under the terms of the Software License Agreement
44 * found in the file LICENSE included with this distribution.
45 *
46 * @author
47 * Yipeng Sun, M. Borland, R. Soliday
48 */
49
50#include <math.h>
51#include <gsl/gsl_statistics.h>
52#include <gsl/gsl_sort.h>
53#include <gsl/gsl_math.h>
54#include "SDDS.h"
55#include "mdb.h"
56#include "scan.h"
57#include "SDDSutils.h"
58
59double bandwidth(double data[], int64_t M);
60double gaussiankernelfunction(double sample);
61double kerneldensityestimate(double *trainingdata_x, double *trainingdata_y,
62 double sample_x, double sample_y, int64_t n);
63double *gridX(double start, double end, int N);
64double *gridY(double start, double end, int N);
65
66/* Enumeration for option types */
67enum option_type {
68 SET_COLUMN,
69 SET_PIPE,
70 SET_MARGIN,
71 SET_SAME_SCALES,
72 N_OPTIONS
73};
74
75char *option[N_OPTIONS] = {
76 "column",
77 "pipe",
78 "margin",
79 "samescales"};
80
81static char *usage =
82 "Usage: sddskde2d [<inputfile>] [<outputfile>] \n"
83 " [-pipe=[input][,output]] \n"
84 " -column=<column1,column2> \n"
85 " [-samescales] \n"
86 " [-margin=<value>]\n\n"
87 "Options:\n"
88 " -column Specify two column names separated by a comma. Wildcards are accepted.\n"
89 " -margin Ratio to extend the original data (default: 0.05).\n"
90 " -samescales Use the same X and Y ranges for all output pages.\n"
91 " -pipe Utilize the standard SDDS Toolkit pipe option.\n\n"
92 "Description:\n"
93 " sddskde2d performs kernel density estimation for two-dimensional data.\n\n"
94 "Author:\n"
95 " Yipeng Sun (" __DATE__ " " __TIME__ ", SVN revision: " SVN_VERSION ")\n";
96
97int main(int argc, char **argv) {
98 int n_test, n_total, same_scales;
99 double lowerx, upperx;
100 double lowery, uppery;
101 double margin = 0.05;
102 SDDS_DATASET SDDSin, SDDSout;
103 char *input_file, *output_file, **column;
104 long tmpfile_used = 0, columns, no_warnings = 1;
105 unsigned long pipe_flags;
106 long i, i_arg, j;
107 SCANNED_ARG *s_arg;
108 double **column_data_x, **column_data_y, *pdf;
109 int64_t *rows, rows0;
110 int32_t n_pages, i_page;
111
112 input_file = output_file = NULL;
113 pdf = NULL;
114 columns = 0;
115 column = NULL;
116 same_scales = 0;
117
118 n_test = 50;
119 n_total = n_test * n_test;
120
122 argc = scanargs(&s_arg, argc, argv);
123 pipe_flags = 0;
124 tmpfile_used = 0;
125 if (argc < 2) {
126 fprintf(stderr, "%s", usage);
127 exit(1);
128 }
129 for (i_arg = 1; i_arg < argc; i_arg++) {
130 if (s_arg[i_arg].arg_type == OPTION) {
131 delete_chars(s_arg[i_arg].list[0], "_");
132 switch (match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
133 case SET_COLUMN:
134 columns = s_arg[i_arg].n_items - 1;
135 column = trealloc(column, sizeof(*column) * columns);
136 for (i = 1; i < s_arg[i_arg].n_items; i++)
137 column[i - 1] = s_arg[i_arg].list[i];
138 break;
139 case SET_MARGIN:
140 if (s_arg[i_arg].n_items != 2)
141 SDDS_Bomb("Invalid -margin option. Too many qualifiers.");
142 if (!get_double(&margin, s_arg[i_arg].list[1]))
143 SDDS_Bomb("Invalid -margin value provided.");
144 break;
145 case SET_SAME_SCALES:
146 if (s_arg[i_arg].n_items != 1)
147 SDDS_Bomb("Invalid -sameScales option. No qualifiers are accepted.");
148 same_scales = 1;
149 break;
150 case SET_PIPE:
151 if (!processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
152 fprintf(stderr, "Error (%s): invalid -pipe syntax\n", argv[0]);
153 return 1;
154 }
155 break;
156 }
157 } else {
158 if (!input_file)
159 input_file = s_arg[i_arg].list[0];
160 else if (output_file == NULL)
161 output_file = s_arg[i_arg].list[0];
162 else {
163 fprintf(stderr, "Error (%s): too many filenames\n", argv[0]);
164 return 1;
165 }
166 }
167 }
168 processFilenames("sddskde2d", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
169 if (!columns) {
170 fprintf(stderr, "%s", usage);
171 SDDS_Bomb("No column provided!");
172 }
173 if (!SDDS_InitializeInput(&SDDSin, input_file)) {
174 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors);
175 return 1;
176 }
177 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
178 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
179 SDDS_Bomb("no columns selected.");
180 }
181 if (columns > 2) {
182 fprintf(stderr, "%s", usage);
183 SDDS_Bomb("Only 2 columns may be accepted.");
184 }
185 if (!SDDS_InitializeOutput(&SDDSout, SDDS_BINARY, 1, NULL, NULL, output_file))
186 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
187 if (!SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, column[0], NULL) ||
188 !SDDS_TransferColumnDefinition(&SDDSout, &SDDSin, column[1], NULL) ||
189 SDDS_DefineColumn(&SDDSout, "PDF", NULL, NULL, NULL, NULL, SDDS_DOUBLE, 0) < 0)
190 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
191 if (!SDDS_WriteLayout(&SDDSout))
192 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
193
194 /* Read all the data in case we need to use fixed scales */
195 n_pages = 0;
196 column_data_x = NULL;
197 column_data_y = NULL;
198 rows = NULL;
199 while (SDDS_ReadPage(&SDDSin) >= 0) {
200 if ((rows0 = SDDS_CountRowsOfInterest(&SDDSin))) {
201 column_data_x = (double **)SDDS_Realloc(column_data_x, sizeof(*column_data_x) * (n_pages + 1));
202 column_data_y = (double **)SDDS_Realloc(column_data_y, sizeof(*column_data_y) * (n_pages + 1));
203 rows = (int64_t *)SDDS_Realloc(rows, sizeof(*rows) * (n_pages + 1));
204 rows[n_pages] = rows0;
205 if (!(column_data_x[n_pages] = SDDS_GetColumnInDoubles(&SDDSin, column[0])))
206 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
207 if (!(column_data_y[n_pages] = SDDS_GetColumnInDoubles(&SDDSin, column[1])))
208 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
209 n_pages++;
210 }
211 }
212 if (n_pages == 0)
213 SDDS_Bomb("No data in file");
214
215 if (same_scales) {
216 double llx, lly, uux, uuy;
217 lowerx = lowery = DBL_MAX;
218 upperx = uppery = -DBL_MAX;
219 for (i_page = 0; i_page < n_pages; i_page++) {
220 find_min_max(&llx, &uux, column_data_x[i_page], rows[i_page]);
221 lowerx = MIN(llx, lowerx);
222 upperx = MAX(uux, upperx);
223 find_min_max(&lly, &uuy, column_data_y[i_page], rows[i_page]);
224 lowery = MIN(lly, lowery);
225 uppery = MAX(uuy, uppery);
226 }
227 }
228
229 pdf = malloc(sizeof(*pdf) * n_total);
230 for (i_page = 0; i_page < n_pages; i_page++) {
231 if (!SDDS_StartPage(&SDDSout, n_total))
232 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
233 if (!same_scales) {
234 find_min_max(&lowerx, &upperx, column_data_x[i_page], rows[i_page]);
235 find_min_max(&lowery, &uppery, column_data_y[i_page], rows[i_page]);
236 }
237 double *x_array = gridX(lowerx, upperx, n_test);
238 double *y_array = gridY(lowery, uppery, n_test);
239 for (j = 0; j < n_total; j++) {
240 pdf[j] = kerneldensityestimate(column_data_x[i_page], column_data_y[i_page], x_array[j], y_array[j], rows[i_page]);
241 }
242 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, x_array, n_total, column[0]))
243 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
244 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, y_array, n_total, column[1]))
245 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
246 if (!SDDS_SetColumnFromDoubles(&SDDSout, SDDS_SET_BY_NAME, pdf, n_total, "PDF"))
247 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
248 free(column_data_x[i_page]);
249 free(column_data_y[i_page]);
250 free(x_array);
251 free(y_array);
252 if (!SDDS_WritePage(&SDDSout))
253 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
254 }
255 free(pdf);
256 free(column_data_x);
257 free(column_data_y);
258 free(rows);
259 if (!SDDS_Terminate(&SDDSin) || !SDDS_Terminate(&SDDSout))
260 SDDS_PrintErrors(stderr, SDDS_VERBOSE_PrintErrors | SDDS_EXIT_PrintErrors);
261 if (tmpfile_used && !replaceFileAndBackUp(input_file, output_file)) {
262 return 1;
263 }
264 free(column);
265 return 0;
266}
267
268double bandwidth(double data[], int64_t M) {
269 double sigma, bwidth, silver_factor;
270 sigma = gsl_stats_sd(data, 1, M);
271 silver_factor = pow(M, -0.16666666);
272 bwidth = pow(silver_factor, 2.0) * pow(sigma, 2.0);
273 return bwidth;
274}
275
276double gaussiankernelfunction(double sample) {
277 double k;
278 k = exp(-sample / 2.0);
279 k = k / (2.0 * M_PI);
280 return k;
281}
282
283double kerneldensityestimate(double *trainingdata_x, double *trainingdata_y,
284 double sample_x, double sample_y, int64_t n) {
285 int64_t i;
286 double pdf, hx, hy, z;
287
288 hx = bandwidth(trainingdata_x, n);
289 hy = bandwidth(trainingdata_y, n);
290 pdf = 0.0;
291 for (i = 0; i < n; i++) {
292 z = (trainingdata_x[i] - sample_x) * (trainingdata_x[i] - sample_x) / hx;
293 z += (trainingdata_y[i] - sample_y) * (trainingdata_y[i] - sample_y) / hy;
294 pdf += gaussiankernelfunction(z);
295 }
296 pdf = pdf / (n * sqrt(hx) * sqrt(hy));
297 return pdf;
298}
299
300double *gridX(double start, double end, int N) {
301 double *x;
302 int i, j, n_grid;
303 double step;
304
305 n_grid = N * N;
306 x = (double *)calloc(n_grid, sizeof(double));
307 step = (end - start) / (double)(N - 1);
308
309 i = 0;
310 for (j = 0; j < N; j++) {
311 for (i = 0; i < N; i++) {
312 x[i + j * N] = start + i * step;
313 }
314 }
315 return x;
316}
317
318double *gridY(double start, double end, int N) {
319 double *x;
320 int i, j, n_grid;
321 double step;
322
323 n_grid = N * N;
324 x = (double *)calloc(n_grid, sizeof(double));
325 step = (end - start) / (double)(N - 1);
326
327 i = 0;
328 for (j = 0; j < N; j++) {
329 for (i = 0; i < N; i++) {
330 x[i + j * N] = start + j * step;
331 }
332 }
333 return x;
334}
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int64_t SDDS_CountRowsOfInterest(SDDS_DATASET *SDDS_dataset)
Counts the number of rows marked as "of interest" in the current data table.
double * SDDS_GetColumnInDoubles(SDDS_DATASET *SDDS_dataset, char *column_name)
Retrieves the data of a specified numerical column as an array of doubles, considering only rows mark...
int32_t SDDS_InitializeInput(SDDS_DATASET *SDDS_dataset, char *filename)
Definition SDDS_input.c:49
int32_t SDDS_Terminate(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_ReadPage(SDDS_DATASET *SDDS_dataset)
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
Definition SDDS_utils.c:432
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
Definition SDDS_utils.c:288
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
Definition SDDS_utils.c:342
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
Definition SDDS_utils.c:677
#define SDDS_DOUBLE
Identifier for the double data type.
Definition SDDStypes.h:37
Utility functions for SDDS dataset manipulation and string array operations.
void * trealloc(void *old_ptr, uint64_t size_of_block)
Reallocates a memory block to a new size.
Definition array.c:181
int get_double(double *dptr, char *s)
Parses a double value from the given string.
Definition data_scan.c:40
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
Definition findMinMax.c:33
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
Definition replacefile.c:75
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
Definition scanargs.c:36
long processPipeOption(char **item, long items, unsigned long *flags)
Definition scanargs.c:356
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
Definition scanargs.c:390