49#include <gsl/gsl_statistics.h>
50#include <gsl/gsl_sort.h>
51#include <gsl/gsl_math.h>
57double bandwidth(
double data[], int64_t M);
58double gaussiankernelfunction(
double sample);
59double kerneldensityestimate(
double *trainingdata_x,
double *trainingdata_y,
60 double sample_x,
double sample_y, int64_t n);
61double *gridX(
double start,
double end,
int N);
62double *gridY(
double start,
double end,
int N);
73char *option[N_OPTIONS] = {
80 "Usage: sddskde2d [<inputfile>] [<outputfile>] \n"
81 " [-column=<column1,column2>] \n"
84 " [-margin=<value>]\n\n"
86 " -column Specify two column names separated by a comma. Wildcards are accepted.\n"
87 " -margin Ratio to extend the original data (default: 0.05).\n"
88 " -samescales Use the same X and Y ranges for all output pages.\n"
89 " -pipe Utilize the standard SDDS Toolkit pipe option.\n\n"
91 " sddskde2d performs kernel density estimation for two-dimensional data.\n\n"
93 " Yipeng Sun (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
")\n";
95int main(
int argc,
char **argv) {
96 int n_test, n_total, same_scales;
97 double lowerx, upperx;
98 double lowery, uppery;
101 char *input_file, *output_file, **column;
102 long tmpfile_used = 0, columns, no_warnings = 1;
103 unsigned long pipe_flags;
106 double **column_data_x, **column_data_y, *pdf;
107 int64_t *rows, rows0;
108 int32_t n_pages, i_page;
110 input_file = output_file = NULL;
117 n_total = n_test * n_test;
120 argc =
scanargs(&s_arg, argc, argv);
124 fprintf(stderr,
"%s", usage);
127 for (i_arg = 1; i_arg < argc; i_arg++) {
128 if (s_arg[i_arg].arg_type == OPTION) {
130 switch (
match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
132 columns = s_arg[i_arg].n_items - 1;
133 column =
trealloc(column,
sizeof(*column) * columns);
134 for (i = 1; i < s_arg[i_arg].n_items; i++)
135 column[i - 1] = s_arg[i_arg].list[i];
138 if (s_arg[i_arg].n_items != 2)
139 SDDS_Bomb(
"Invalid -margin option. Too many qualifiers.");
140 if (!
get_double(&margin, s_arg[i_arg].list[1]))
141 SDDS_Bomb(
"Invalid -margin value provided.");
143 case SET_SAME_SCALES:
144 if (s_arg[i_arg].n_items != 1)
145 SDDS_Bomb(
"Invalid -sameScales option. No qualifiers are accepted.");
149 if (!
processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
150 fprintf(stderr,
"Error (%s): invalid -pipe syntax\n", argv[0]);
157 input_file = s_arg[i_arg].list[0];
158 else if (output_file == NULL)
159 output_file = s_arg[i_arg].list[0];
161 fprintf(stderr,
"Error (%s): too many filenames\n", argv[0]);
166 processFilenames(
"sddskde2d", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
168 fprintf(stderr,
"%s", usage);
175 if ((columns = expandColumnPairNames(&SDDSin, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
180 fprintf(stderr,
"%s", usage);
181 SDDS_Bomb(
"Only 2 columns may be accepted.");
194 column_data_x = NULL;
195 column_data_y = NULL;
199 column_data_x = (
double **)
SDDS_Realloc(column_data_x,
sizeof(*column_data_x) * (n_pages + 1));
200 column_data_y = (
double **)
SDDS_Realloc(column_data_y,
sizeof(*column_data_y) * (n_pages + 1));
201 rows = (int64_t *)
SDDS_Realloc(rows,
sizeof(*rows) * (n_pages + 1));
202 rows[n_pages] = rows0;
214 double llx, lly, uux, uuy;
215 lowerx = lowery = DBL_MAX;
216 upperx = uppery = -DBL_MAX;
217 for (i_page = 0; i_page < n_pages; i_page++) {
218 find_min_max(&llx, &uux, column_data_x[i_page], rows[i_page]);
219 lowerx = MIN(llx, lowerx);
220 upperx = MAX(uux, upperx);
221 find_min_max(&lly, &uuy, column_data_y[i_page], rows[i_page]);
222 lowery = MIN(lly, lowery);
223 uppery = MAX(uuy, uppery);
227 pdf = malloc(
sizeof(*pdf) * n_total);
228 for (i_page = 0; i_page < n_pages; i_page++) {
232 find_min_max(&lowerx, &upperx, column_data_x[i_page], rows[i_page]);
233 find_min_max(&lowery, &uppery, column_data_y[i_page], rows[i_page]);
235 double *x_array = gridX(lowerx, upperx, n_test);
236 double *y_array = gridY(lowery, uppery, n_test);
237 for (j = 0; j < n_total; j++) {
238 pdf[j] = kerneldensityestimate(column_data_x[i_page], column_data_y[i_page], x_array[j], y_array[j], rows[i_page]);
246 free(column_data_x[i_page]);
247 free(column_data_y[i_page]);
266double bandwidth(
double data[], int64_t M) {
267 double sigma, bwidth, silver_factor;
268 sigma = gsl_stats_sd(data, 1, M);
269 silver_factor = pow(M, -0.16666666);
270 bwidth = pow(silver_factor, 2.0) * pow(sigma, 2.0);
274double gaussiankernelfunction(
double sample) {
276 k = exp(-sample / 2.0);
277 k = k / (2.0 * M_PI);
281double kerneldensityestimate(
double *trainingdata_x,
double *trainingdata_y,
282 double sample_x,
double sample_y, int64_t n) {
284 double pdf, hx, hy, z;
286 hx = bandwidth(trainingdata_x, n);
287 hy = bandwidth(trainingdata_y, n);
289 for (i = 0; i < n; i++) {
290 z = (trainingdata_x[i] - sample_x) * (trainingdata_x[i] - sample_x) / hx;
291 z += (trainingdata_y[i] - sample_y) * (trainingdata_y[i] - sample_y) / hy;
292 pdf += gaussiankernelfunction(z);
294 pdf = pdf / (n * sqrt(hx) * sqrt(hy));
298double *gridX(
double start,
double end,
int N) {
304 x = (
double *)calloc(n_grid,
sizeof(
double));
305 step = (end - start) / (
double)(N - 1);
308 for (j = 0; j < N; j++) {
309 for (i = 0; i < N; i++) {
310 x[i + j * N] = start + i * step;
316double *gridY(
double start,
double end,
int N) {
322 x = (
double *)calloc(n_grid,
sizeof(
double));
323 step = (end - start) / (
double)(N - 1);
326 for (j = 0; j < N; j++) {
327 for (i = 0; i < N; i++) {
328 x[i + j * N] = start + j * step;
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_DOUBLE
Identifier for the double data type.
void * trealloc(void *old_ptr, uint64_t size_of_block)
Reallocates a memory block to a new size.
int get_double(double *dptr, char *s)
Parses a double value from the given string.
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)