43#include <gsl/gsl_statistics.h>
44#include <gsl/gsl_sort.h>
45#include <gsl/gsl_math.h>
51double bandwidth(
double data[],
int M);
52double gaussiankernelfunction(
double sample);
53double kerneldensityestimate(
double *trainingdata,
double sample, int64_t n);
54double *linearspace(
double initial,
double final,
int N);
64static char *option[N_OPTIONS] = {
70static const char *usage =
71 "sddskde [<inputfile>] [<outputfile>]\n"
72 " [-pipe=[input][,output]]\n"
73 " -column=<list of columns>\n"
74 " [-margin=<value>]\n"
76 "-column provide column names separated by commas, wild card accepted.\n"
77 "-margin provide the ratio to extend the original data, default 0.3.\n"
78 "-pipe The standard SDDS Toolkit pipe option.\n\n"
79 "sddskde performs kernel density estimation for one-dimensional data.\n"
80 "Program by Yipeng Sun and Hairong Shang (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
").\n";
82int main(
int argc,
char **argv) {
84 double min_temp, max_temp;
85 double gap, lower, upper;
88 char *input_file = NULL, *output_file = NULL, **column = NULL;
89 long tmpfile_used = 0, columns = 0, no_warnings = 1;
90 unsigned long pipe_flags;
93 double *column_data = NULL, *pdf = NULL, *cdf = NULL;
94 char buffer_pdf[1024], buffer_cdf[1024], buffer_pdf_units[1024];
102 fprintf(stderr,
"%s", usage);
106 for (i_arg = 1; i_arg < argc; i_arg++) {
107 if (s_arg[i_arg].arg_type == OPTION) {
109 switch (
match_string(s_arg[i_arg].list[0], option, N_OPTIONS, 0)) {
111 columns = s_arg[i_arg].n_items - 1;
112 column = realloc(column,
sizeof(*column) * columns);
113 for (i = 1; i < s_arg[i_arg].n_items; i++) {
114 column[i - 1] = s_arg[i_arg].list[i];
118 if (s_arg[i_arg].n_items != 2) {
121 if (!
get_double(&margin, s_arg[i_arg].list[1])) {
122 SDDS_Bomb(
"Invalid -margin value provided!");
126 if (!
processPipeOption(s_arg[i_arg].list + 1, s_arg[i_arg].n_items - 1, &pipe_flags)) {
127 fprintf(stderr,
"Error (%s): invalid -pipe syntax\n", argv[0]);
132 fprintf(stderr,
"Unknown option: %s\n", s_arg[i_arg].list[0]);
137 input_file = s_arg[i_arg].list[0];
138 }
else if (!output_file) {
139 output_file = s_arg[i_arg].list[0];
141 fprintf(stderr,
"Error (%s): too many filenames\n", argv[0]);
147 processFilenames(
"sddskde", &input_file, &output_file, pipe_flags, no_warnings, &tmpfile_used);
150 fprintf(stderr,
"%s", usage);
159 if ((columns = expandColumnPairNames(&sdds_in, &column, NULL, columns, NULL, 0, FIND_NUMERIC_TYPE, 0)) <= 0) {
168 for (i = 0; i < columns; i++) {
174 if (units && strlen(units)) {
175 snprintf(buffer_pdf_units,
sizeof(buffer_pdf_units),
"1/(%s)", units);
177 buffer_pdf_units[0] =
'\0';
182 snprintf(buffer_pdf,
sizeof(buffer_pdf),
"%sPDF", column[i]);
183 snprintf(buffer_cdf,
sizeof(buffer_cdf),
"%sCDF", column[i]);
196 pdf = malloc(
sizeof(*pdf) * n_test);
197 cdf = malloc(
sizeof(*cdf) * n_test);
205 for (i = 0; i < columns; i++) {
206 snprintf(buffer_pdf,
sizeof(buffer_pdf),
"%sPDF", column[i]);
207 snprintf(buffer_cdf,
sizeof(buffer_cdf),
"%sCDF", column[i]);
214 gap = max_temp - min_temp;
215 lower = min_temp - gap * margin;
216 upper = max_temp + gap * margin;
218 double *x_array = linearspace(lower, upper, n_test);
220 for (j = 0; j < n_test; j++) {
221 pdf[j] = kerneldensityestimate(column_data, x_array[j], rows);
225 for (j = 1; j < n_test; j++) {
226 cdf[j] += cdf[j - 1];
229 for (j = 0; j < n_test; j++) {
230 cdf[j] = cdf[j] / cdf[n_test - 1];
268double bandwidth(
double data[],
int M) {
269 double sigma, min_val, bwidth, interquartile_range;
270 double hspread_three, hspread_one;
272 gsl_sort(data, 1, M);
273 sigma = gsl_stats_sd(data, 1, M);
274 hspread_three = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.750);
275 hspread_one = gsl_stats_quantile_from_sorted_data(data, 1, M, 0.250);
276 interquartile_range = hspread_three - hspread_one;
277 min_val = GSL_MIN(interquartile_range / 1.339999, sigma);
278 bwidth = 0.90 * min_val * pow(M, -0.20);
283double gaussiankernelfunction(
double sample) {
285 k = exp(-(gsl_pow_2(sample) / 2.0));
286 k = k / (M_SQRT2 * sqrt(M_PI));
291double kerneldensityestimate(
double *trainingdata,
double sample, int64_t n) {
293 double pdf = 0.0, h, bwidth;
295 bwidth = bandwidth(trainingdata, n);
296 h = GSL_MAX(bwidth, 2e-6);
298 for (i = 0; i < n; i++) {
299 pdf += gaussiankernelfunction((trainingdata[i] - sample) / h);
307double *linearspace(
double start,
double end,
int N) {
312 x = calloc(N,
sizeof(
double));
313 step = (end - start) / (
double)(N - 1);
316 for (i = 1; i < N; i++) {
317 x[i] = x[i - 1] + step;
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetColumnFromDoubles(SDDS_DATASET *SDDS_dataset, int32_t mode, double *data, int64_t rows,...)
Sets the values for a single data column using double-precision floating-point numbers.
int32_t SDDS_GetColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Retrieves information about a specified column in the SDDS dataset.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
#define SDDS_STRING
Identifier for the string data type.
#define SDDS_DOUBLE
Identifier for the double data type.
Utility functions for SDDS dataset manipulation and string array operations.
int get_double(double *dptr, char *s)
Parses a double value from the given string.
char * delete_chars(char *s, char *t)
Removes all occurrences of characters found in string t from string s.
int find_min_max(double *min, double *max, double *list, int64_t n)
Finds the minimum and maximum values in a list of doubles.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long replaceFileAndBackUp(char *file, char *replacement)
Replaces a file with a replacement file and creates a backup of the original.
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)