69char *option[N_OPTIONS] = {
70 "bins",
"lowerlimit",
"upperlimit",
"datacolumn",
71 "filter",
"sizeofbins",
"weightcolumn",
72 "normalize",
"statistics",
"sides",
"verbose",
"pipe",
"cdf",
"expand",
73 "majorOrder",
"regions",
"threads"
77 "Usage: sddshist [<inputfile>] [<outputfile>] [options]\n"
79 " -pipe=[input][,output] Use pipe for input and/or output.\n"
80 " -dataColumn=<column-name> Specify the column to histogram.\n"
81 " -bins=<number> Set the number of bins for the histogram.\n"
82 " -sizeOfBins=<value> Set the size of each bin.\n"
83 " -regions=filename=<filename>,position=<columnName>,name=<columnName>\n"
84 " Define region-based histogramming.\n"
85 " -lowerLimit=<value> Set the lower limit of the histogram.\n"
86 " -upperLimit=<value> Set the upper limit of the histogram.\n"
87 " -expand=<factor> Expand the range of the histogram by the given factor.\n"
88 " -filter=<column-name>,<lower>,<upper> Filter data points based on column values.\n"
89 " -weightColumn=<column-name> Weight the histogram with the specified column.\n"
90 " -sides[=<points>] Add sides to the histogram down to zero level.\n"
91 " -normalize[={sum|area|peak}] Normalize the histogram.\n"
92 " -cdf[=only] Include the CDF in the output. Use 'only' to exclude the histogram.\n"
93 " -threads=<number> Specify the number of threads to use.\n"
94 " -statistics Include statistical information in the output.\n"
95 " -verbose Enable informational printouts during processing.\n"
96 " -majorOrder=row|column Set the major order of data.\n";
98static char *additional_help =
"\n\
99bins Number of bins for the histogram.\n\
100sizeOfBins Size of each bin for the histogram.\n\
101regions File defining region-based histogramming with bin boundaries and names.\n\
102lowerLimit Lower limit of the histogram.\n\
103upperLimit Upper limit of the histogram.\n\
104expand Expand the range of the histogram by the given factor.\n\
105dataColumn Name of the column to histogram.\n\
106filter Include only data points where the specified column's value is between the given limits.\n\
107weightColumn Weight the histogram using the specified column's values.\n\
108normalize Normalize the histogram as specified (sum, area, peak).\n\
109statistics Include statistical information (mean, RMS, standard deviation) in the output.\n\
110sides Add sides to the histogram down to the zero level.\n\
111cdf Include the Cumulative Distribution Function (CDF) in the output. Use 'only' to exclude the histogram.\n\
112verbose Enable informational printouts during processing.\n\n\
113Program by Michael Borland. (" __DATE__
" " __TIME__
", SVN revision: " SVN_VERSION
")\n";
115#define NORMALIZE_PEAK 0
116#define NORMALIZE_AREA 1
117#define NORMALIZE_SUM 2
118#define NORMALIZE_NO 3
119#define N_NORMALIZE_OPTIONS 4
120char *normalize_option[N_NORMALIZE_OPTIONS] = {
121 "peak",
"area",
"sum",
"no"
124static int64_t filter(
double *x,
double *y,
double *filterData, int64_t npts,
double lower_filter,
double upper_filter);
126 char *dataColumn,
char *weightColumn,
char *filterColumn,
double lowerFilter,
127 double upperFilter,
SDDS_DATASET *regionTable,
char *regionNameColumn,
long doStats,
128 int64_t bins,
double binSize,
long normalizeMode,
short columnMajorOrder);
131static long iIndep, iFreq, iBins, iBinSize, iLoFilter, iUpFilter, iMean, iRMS, iStDev, iPoints, iCdf;
132static short cdfOnly, freOnly;
134int64_t readRegionFile(
SDDS_DATASET *SDDSin,
char *filename,
char *positionColumn,
char *nameColumn,
double **regionPosition,
char ***regionName);
135void classifyByRegion(
double *data,
double *weight, int64_t points,
double *histogram,
double *regionPosition, int64_t bins);
137int main(
int argc,
char **argv) {
139 long binsGiven, lowerLimitGiven, upperLimitGiven;
144 double *hist, *hist1;
148 double lowerLimit, upperLimit;
149 double givenLowerLimit, givenUpperLimit;
150 double range, binSize;
153 double mean, rms, standDev, mad;
154 char *filterColumn, *dataColumn, *weightColumn;
155 double lowerFilter = 0, upperFilter = 0;
157 SCANNED_ARG *scanned;
158 char *inputfile, *outputfile;
162 long normalizeMode, doSides, verbose, readCode;
164 unsigned long pipeFlags, majorOrderFlag, regionFlags = 0;
166 double expansionFactor = 0;
167 short columnMajorOrder = -1;
168 char *regionFilename = NULL, *regionPositionColumn = NULL, *regionNameColumn = NULL;
169 double *regionPosition = NULL;
170 int64_t nRegions = 0;
171 char **regionName = NULL;
176 argc =
scanargs(&scanned, argc, argv);
178 fprintf(stderr,
"%s\n", USAGE);
179 fputs(additional_help, stderr);
183 binsGiven = lowerLimitGiven = upperLimitGiven = 0;
184 binSize = doSides = 0;
185 inputfile = outputfile = NULL;
186 dataColumn = filterColumn = weightColumn = NULL;
187 doStats = verbose = 0;
188 normalizeMode = NORMALIZE_NO;
194 for (i = 1; i < argc; i++) {
195 if (scanned[i].arg_type == OPTION) {
196 switch (
match_string(scanned[i].list[0], option, N_OPTIONS, 0)) {
197 case SET_MAJOR_ORDER:
199 scanned[i].n_items--;
200 if (scanned[i].n_items > 0 && (!
scanItemList(&majorOrderFlag, scanned[i].list + 1, &scanned[i].n_items, 0,
"row", -1, NULL, 0, SDDS_ROW_MAJOR_ORDER,
"column", -1, NULL, 0, SDDS_COLUMN_MAJOR_ORDER, NULL)))
201 SDDS_Bomb(
"invalid -majorOrder syntax/values");
202 if (majorOrderFlag & SDDS_COLUMN_MAJOR_ORDER)
203 columnMajorOrder = 1;
204 else if (majorOrderFlag & SDDS_ROW_MAJOR_ORDER)
205 columnMajorOrder = 0;
209 SDDS_Bomb(
"-bins specified more than once");
211 if (sscanf(scanned[i].list[1],
"%" SCNd64, &bins) != 1 || bins <= 0)
216 SDDS_Bomb(
"-lowerLimit specified more than once");
218 if (sscanf(scanned[i].list[1],
"%lf", &givenLowerLimit) != 1)
219 SDDS_Bomb(
"invalid value for lowerLimit");
223 SDDS_Bomb(
"-upperLimit specified more than once");
225 if (sscanf(scanned[i].list[1],
"%lf", &givenUpperLimit) != 1)
226 SDDS_Bomb(
"invalid value for upperLimit");
230 if (sscanf(scanned[i].list[1],
"%lf", &expansionFactor) != 1 || expansionFactor <= 0)
235 SDDS_Bomb(
"-dataColumn specified more than once");
236 if (scanned[i].n_items != 2)
237 SDDS_Bomb(
"invalid -dataColumn syntax---supply name");
238 dataColumn = scanned[i].list[1];
242 SDDS_Bomb(
"multiple filter specifications not allowed");
243 if (scanned[i].n_items != 4 || sscanf(scanned[i].list[2],
"%lf", &lowerFilter) != 1 ||
244 sscanf(scanned[i].list[3],
"%lf", &upperFilter) != 1 || lowerFilter > upperFilter)
245 SDDS_Bomb(
"invalid -filter syntax/values");
246 filterColumn = scanned[i].list[1];
248 case SET_WEIGHTCOLUMN:
250 SDDS_Bomb(
"multiple weighting columns not allowed");
251 if (scanned[i].n_items != 2)
252 SDDS_Bomb(
"-weightColumn requires a column name");
253 weightColumn = scanned[i].list[1];
256 if (scanned[i].n_items == 1)
257 normalizeMode = NORMALIZE_SUM;
258 else if (scanned[i].n_items != 2 || (normalizeMode =
match_string(scanned[i].list[1], normalize_option, N_NORMALIZE_OPTIONS, 0)) < 0)
265 if (scanned[i].n_items == 1)
267 else if (scanned[i].n_items > 2 || (sscanf(scanned[i].list[1],
"%ld", &doSides) != 1 || doSides <= 0))
274 if (sscanf(scanned[i].list[1],
"%le", &binSize) != 1 || binSize <= 0)
282 if (scanned[i].n_items == 1)
285 if (scanned[i].n_items != 2)
287 cdf = scanned[i].list[1];
288 if (strcmp(cdf,
"only") != 0)
289 SDDS_Bomb(
"invalid -cdf value, it should be -cdf or -cdf=only");
294 case SET_REGION_FILE:
295 if (scanned[i].n_items != 4)
298 scanned[i].n_items -= 1;
299 if (!
scanItemList(®ionFlags, scanned[i].list + 1, &scanned[i].n_items, 0,
301 "position",
SDDS_STRING, ®ionPositionColumn, 1, 2,
302 "name",
SDDS_STRING, ®ionNameColumn, 1, 4, NULL) ||
303 regionFlags != (1 + 2 + 4) || !regionFilename || !regionPositionColumn || !regionNameColumn)
307 if (scanned[i].n_items != 2 ||
308 !sscanf(scanned[i].list[1],
"%d", &threads) || threads < 1)
312 fprintf(stderr,
"Error: option %s not recognized\n", scanned[i].list[0]);
319 inputfile = scanned[i].list[0];
320 else if (!outputfile)
321 outputfile = scanned[i].list[0];
329 if (binSize && binsGiven && regionFlags)
330 SDDS_Bomb(
"Provide only one of -bins, -sizeOfBins, or -regions");
334 SDDS_Bomb(
"-dataColumn must be specified");
337 if (!(nRegions = readRegionFile(&SDDSregion, regionFilename, regionPositionColumn, regionNameColumn, ®ionPosition, ®ionName)))
338 SDDS_Bomb(
"Problem with region file. Check existence and type of columns");
343 hist =
tmalloc(
sizeof(*hist) * (bins + 2 * doSides));
344 CDF = CDF1 =
tmalloc(
sizeof(*hist) * (bins + 2 * doSides));
345 indep =
tmalloc(
sizeof(*indep) * (bins + 2 * doSides));
353 if (!setupOutputFile(&outTable, outputfile, &inTable, inputfile, dataColumn, weightColumn, filterColumn, lowerFilter, upperFilter, &SDDSregion, regionNameColumn, doStats, bins, binSize, normalizeMode, columnMajorOrder))
356 data = weightData = filterData = NULL;
365 if (rows && filterColumn)
366 points = filter(data, weightData, filterData, rows, lowerFilter, upperFilter);
380 classifyByRegion(data, weightData, points, hist, regionPosition, bins);
383 if (!lowerLimitGiven) {
384 lowerLimit = (points > 0) ? data[0] : 0;
385 for (i = 0; i < points; i++)
386 if (lowerLimit > data[i])
387 lowerLimit = data[i];
389 lowerLimit = givenLowerLimit;
391 if (!upperLimitGiven) {
392 upperLimit = (points > 0) ? data[0] : 0;
393 for (i = 0; i < points; i++)
394 if (upperLimit < data[i])
395 upperLimit = data[i];
397 upperLimit = givenUpperLimit;
400 range = upperLimit - lowerLimit;
401 if (!lowerLimitGiven)
402 lowerLimit -= range * 1e-7;
403 if (!upperLimitGiven)
404 upperLimit += range * 1e-7;
405 if (upperLimit == lowerLimit) {
407 upperLimit += binSize / 2;
408 lowerLimit -= binSize / 2;
409 }
else if (fabs(upperLimit) < sqrt(DBL_MIN)) {
410 upperLimit = sqrt(DBL_MIN);
411 lowerLimit = -sqrt(DBL_MIN);
413 upperLimit += upperLimit * (1 + 2 * DBL_EPSILON);
414 lowerLimit -= upperLimit * (1 - 2 * DBL_EPSILON);
417 if (expansionFactor > 0) {
418 double center = (upperLimit + lowerLimit) / 2;
419 range = expansionFactor * (upperLimit - lowerLimit);
420 lowerLimit = center - range / 2;
421 upperLimit = center + range / 2;
423 dx = (upperLimit - lowerLimit) / bins;
427 range = ((range / binSize) + 1) * binSize;
428 middle = (lowerLimit + upperLimit) / 2;
429 lowerLimit = middle - range / 2;
430 upperLimit = middle + range / 2;
432 bins = range / binSize + 0.5;
433 if (bins < 1 && !doSides)
435 indep =
trealloc(indep,
sizeof(*indep) * (bins + 2 * doSides));
436 hist =
trealloc(hist,
sizeof(*hist) * (bins + 2 * doSides));
437 CDF =
trealloc(CDF,
sizeof(*hist) * (bins + 2 * doSides));
440 for (i = -doSides; i < bins + doSides; i++)
441 indep[i + doSides] = (i + 0.5) * dx + lowerLimit;
442 hist1 = hist + doSides;
443 CDF1 = CDF + doSides;
445 hist[0] = hist[bins + doSides] = 0;
449 pointsBinned =
make_histogram(hist1, bins, lowerLimit, upperLimit, data, points, 1);
455 for (i = 0; i < bins + doSides; i++) {
458 CDF1[0] = hist1[0] / sum;
459 for (i = 1; i < bins + doSides; i++) {
460 CDF1[i] = CDF1[i - 1] + hist1[i] / sum;
464 fprintf(stderr,
"%ld points of %" PRId64
" from page %ld histogrammed in %" PRId64
" bins\n", pointsBinned, rows, readCode, bins);
466 if (normalizeMode != NORMALIZE_NO) {
468 switch (normalizeMode) {
474 for (i = 0; i < bins; i++)
476 if (normalizeMode == NORMALIZE_AREA)
480 SDDS_Bomb(
"invalid normalize mode--consult programmer.");
484 for (i = 0; i < bins; i++)
493 !
SDDS_SetParameters(&outTable, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iBins, bins, iBinSize, dx, iPoints, pointsBinned, -1))
496 if (!
SDDS_SetColumn(&outTable, SDDS_SET_BY_INDEX, regionPosition, bins, iIndep) ||
497 !
SDDS_SetColumn(&outTable, SDDS_SET_BY_NAME, regionName, bins, regionNameColumn))
499 if (!freOnly && !
SDDS_SetColumn(&outTable, SDDS_SET_BY_INDEX, CDF, bins, iCdf))
501 if (!cdfOnly && !
SDDS_SetColumn(&outTable, SDDS_SET_BY_INDEX, hist, bins, iFreq))
507 (points && (!
SDDS_SetColumn(&outTable, SDDS_SET_BY_INDEX, indep, bins + 2 * doSides, iIndep))) ||
508 !
SDDS_SetParameters(&outTable, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iBins, bins, iBinSize, dx, iPoints, pointsBinned, -1))
511 if (points && !
SDDS_SetColumn(&outTable, SDDS_SET_BY_INDEX, CDF, bins + 2 * doSides, iCdf))
515 if (points && !
SDDS_SetColumn(&outTable, SDDS_SET_BY_INDEX, hist, bins + 2 * doSides, iFreq))
520 if (filterColumn && points &&
521 !
SDDS_SetParameters(&outTable, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iLoFilter, lowerFilter, iUpFilter, upperFilter, -1))
523 if (doStats && points &&
524 !
SDDS_SetParameters(&outTable, SDDS_SET_BY_INDEX | SDDS_PASS_BY_VALUE, iMean, mean, iRMS, rms, iStDev, standDev, -1))
535 data = weightData = filterData = NULL;
553static int64_t filter(
double *x,
double *y,
double *filterData, int64_t npts,
double lower_filter,
double upper_filter) {
555 static char *keep = NULL;
556 static int64_t maxPoints = 0;
558 if (maxPoints < npts)
559 keep =
trealloc(keep,
sizeof(*keep) * (maxPoints = npts));
561 for (i = 0; i < npts; i++) {
562 if (filterData[i] < lower_filter || filterData[i] > upper_filter)
568 for (i = j = 0; i < npts; i++)
575 filterData[j] = filterData[i];
584 char *dataColumn,
char *weightColumn,
char *filterColumn,
double lowerFilter,
585 double upperFilter,
SDDS_DATASET *regionTable,
char *regionNameColumn,
long doStats,
586 int64_t bins,
double binSize,
long normalizeMode,
short columnMajorOrder) {
587 char *symbol, *units, *dataUnits, *outputFormat;
593 if (columnMajorOrder != -1)
594 outTable->layout.data_mode.column_major = columnMajorOrder;
596 outTable->layout.data_mode.column_major = inTable->layout.data_mode.column_major;
612 switch (normalizeMode) {
614 symbol =
"RelativeFrequency";
618 symbol =
"NormalizedFrequency";
620 units =
tmalloc(
sizeof(*units) * (strlen(dataUnits) + 5));
621 if (strchr(dataUnits,
' '))
622 sprintf(units,
"1/(%s)", dataUnits);
624 sprintf(units,
"1/%s", dataUnits);
629 symbol =
"FractionalFrequency";
634 char *weightUnits = NULL;
637 symbol =
"WeightedNumberOfOccurrences";
640 symbol =
"NumberOfOccurrences";
652 sprintf(s,
"%sCdf", dataColumn);
680 buffer =
tmalloc(
sizeof(*buffer) * (strlen(dataColumn) + 20));
681 sprintf(buffer,
"%sMean", dataColumn);
684 sprintf(buffer,
"%sRms", dataColumn);
687 sprintf(buffer,
"%sStDev", dataColumn);
699int64_t readRegionFile(
SDDS_DATASET *SDDSin,
char *filename,
char *positionColumn,
char *nameColumn,
double **regionPosition,
char ***regionName) {
702 SDDS_ReadPage(SDDSin) != 1 || (rows = SDDS_RowCount(SDDSin)) < 1 ||
706 for (i = 1; i < rows; i++)
707 if ((*regionPosition)[i] <= (*regionPosition)[i - 1]) {
708 fprintf(stderr,
"sddshist: Error in region position data: row %" PRId64
" is %21.15e while row %" PRId64
" is %21.15e\n",
709 i - 1, (*regionPosition)[i - 1], i, (*regionPosition)[i]);
712 *regionPosition =
SDDS_Realloc(*regionPosition,
sizeof(**regionPosition) * (rows + 1));
713 (*regionPosition)[rows] = DBL_MAX;
714 *regionName =
SDDS_Realloc(*regionName,
sizeof(**regionName) * (rows + 1));
715 cp_str(&(*regionName)[rows],
"Beyond");
719void classifyByRegion(
double *data,
double *weight, int64_t points,
double *histogram,
double *regionPosition, int64_t bins) {
722 for (iBin = 0; iBin < bins; iBin++)
725 for (iData = 0; iData < points; iData++) {
727 for (iBin = 0; iBin < bins - 1; iBin++) {
728 if (data[iData] < regionPosition[iBin])
732 histogram[iBin] += weight[iData];
734 histogram[iBin] += 1;
SDDS (Self Describing Data Set) Data Types Definitions and Function Prototypes.
int32_t SDDS_CopyParameters(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source)
int32_t SDDS_StartPage(SDDS_DATASET *SDDS_dataset, int64_t expected_n_rows)
int32_t SDDS_SetParameters(SDDS_DATASET *SDDS_dataset, int32_t mode,...)
int32_t SDDS_SetColumn(SDDS_DATASET *SDDS_dataset, int32_t mode, void *data, int64_t rows,...)
Sets the values for one data column in the current data table of an SDDS dataset.
int32_t SDDS_ChangeColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Modifies a specific field in a column definition within the SDDS dataset.
int32_t SDDS_GetColumnInformation(SDDS_DATASET *SDDS_dataset, char *field_name, void *memory, int32_t mode,...)
Retrieves information about a specified column in the SDDS dataset.
int32_t SDDS_InitializeOutput(SDDS_DATASET *SDDS_dataset, int32_t data_mode, int32_t lines_per_row, const char *description, const char *contents, const char *filename)
Initializes the SDDS output dataset.
int32_t SDDS_WritePage(SDDS_DATASET *SDDS_dataset)
Writes the current data table to the output file.
int32_t SDDS_DefineColumn(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, int32_t field_length)
Defines a data column within the SDDS dataset.
int32_t SDDS_WriteLayout(SDDS_DATASET *SDDS_dataset)
Writes the SDDS layout header to the output file.
int32_t SDDS_DefineParameter(SDDS_DATASET *SDDS_dataset, const char *name, const char *symbol, const char *units, const char *description, const char *format_string, int32_t type, char *fixed_value)
Defines a data parameter with a fixed string value.
int32_t SDDS_TransferColumnDefinition(SDDS_DATASET *target, SDDS_DATASET *source, char *name, char *newName)
Transfers a column definition from a source dataset to a target dataset.
int32_t SDDS_TransferAllParameterDefinitions(SDDS_DATASET *SDDS_target, SDDS_DATASET *SDDS_source, uint32_t mode)
Transfers all parameter definitions from a source dataset to a target dataset.
int32_t SDDS_GetColumnIndex(SDDS_DATASET *SDDS_dataset, char *name)
Retrieves the index of a named column in the SDDS dataset.
void SDDS_PrintErrors(FILE *fp, int32_t mode)
Prints recorded error messages to a specified file stream.
void SDDS_RegisterProgramName(const char *name)
Registers the executable program name for use in error messages.
int32_t SDDS_StringIsBlank(char *s)
Checks if a string is blank (contains only whitespace characters).
void SDDS_Bomb(char *message)
Terminates the program after printing an error message and recorded errors.
void * SDDS_Realloc(void *old_ptr, size_t new_size)
Reallocates memory to a new size.
#define SDDS_STRING
Identifier for the string data type.
#define SDDS_LONG
Identifier for the signed 32-bit integer data type.
#define SDDS_DOUBLE
Identifier for the double data type.
void * trealloc(void *old_ptr, uint64_t size_of_block)
Reallocates a memory block to a new size.
void * tmalloc(uint64_t size_of_block)
Allocates a memory block of the specified size with zero initialization.
char * cp_str(char **s, char *t)
Copies a string, allocating memory for storage.
double max_in_array(double *array, long n)
Finds the maximum value in an array of doubles.
long make_histogram_weighted(double *hist, long n_bins, double lo, double hi, double *data, long n_pts, long new_start, double *weight)
Compiles a weighted histogram from data points.
long make_histogram(double *hist, long n_bins, double lo, double hi, double *data, int64_t n_pts, long new_start)
Compiles a histogram from data points.
long match_string(char *string, char **option, long n_options, long mode)
Matches a given string against an array of option strings based on specified modes.
long computeWeightedMomentsThreaded(double *mean, double *rms, double *standDev, double *meanAbsoluteDev, double *x, double *w, long n, long numThreads)
Computes weighted statistical moments of an array using multiple threads.
long computeMomentsThreaded(double *mean, double *rms, double *standDev, double *meanAbsoluteDev, double *x, long n, long numThreads)
Computes the mean, RMS, standard deviation, and mean absolute deviation of an array using multiple th...
int scanargs(SCANNED_ARG **scanned, int argc, char **argv)
long processPipeOption(char **item, long items, unsigned long *flags)
void processFilenames(char *programName, char **input, char **output, unsigned long pipeFlags, long noWarnings, long *tmpOutputUsed)
long scanItemList(unsigned long *flags, char **item, long *items, unsigned long mode,...)
Scans a list of items and assigns values based on provided keywords and types.