Intermediate commit. So that those who are interested can read through the code!
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3878e37 --- /dev/null +++ b/.gitignore
@@ -0,0 +1,3 @@ +*.o +*.d +*.main
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..29ea20d --- /dev/null +++ b/Makefile
@@ -0,0 +1,17 @@ +SRCS = median.c +MAIN = median.main.c +OBJS = $(patsubst %.c,%.o,$(SRCS)) +MAINO = $(patsubst %.c,%.o,$(MAIN)) +DEPS = $(patsubst %.c,%.d,$(SRCS) $(MAIN)) +EXECS = $(patsubst %.c,%,$(MAIN)) +all : $(EXECS) +clean : + rm -f $(OBJS) $(DEPS) $(MAINO) +%.o : %.c + gcc -MD -c $< +%.d : %.c + gcc -MD -c $< +%.main : %.main.o $(OBJS) + gcc $< $(OBJS) -o $@ + +-include $(DEPS)
diff --git a/median.c b/median.c new file mode 100644 index 0000000..face1ef --- /dev/null +++ b/median.c
@@ -0,0 +1,48 @@ +#include "median.h" +#include <strings.h> + +typedef struct metadata { + size_t buffer_size; + size_t nLevels; + unsigned char data[0]; +} metadata_t; + +median_error_t median_suggest_buffer_size(double epsilon, size_t maxN, size_t *suggested_size){ + // First check for sane parameters. + if (maxN <= 0) return MEDIAN_ERROR_INVALID_PARAMETERS; + // First compute the size of each buffer. + size_t n = (1.0/epsilon)+ 1; + // If n is too large, you are looking for too fine an approximation. + // This is not the right algorithm. Maybe consider random sampling instead. + if (n > 100000) return MEDIAN_ERROR_INVALID_PARAMETERS; + // Compute the log of maxN using a divide by 2 loop. + // There is no need to make this more complicated at this stage. It's not + // a deep loop function. + size_t nLevels = 1; + while(maxN = (maxN >> 1)) nLevels++; + // What is the size of the metadata structure + size_t extra = sizeof(metadata_t); + // This is the total. + *suggested_size = n * nLevels + extra; + + return MEDIAN_ERROR_OK; +} +median_error_t median_init_buffer(void * buffer, size_t buffer_size, double epsilon, size_t maxN, median_buffer_t *initialized_buffer){ + // Initialize. + size_t expected_size = 0; + median_error_t error = MEDIAN_ERROR_OK;; + // First run median_suggest_buffer_size + if ((error = median_suggest_buffer_size(epsilon,maxN,&expected_size)) != MEDIAN_ERROR_OK) return error; + // Ensure that the buffer is large enough. + if (expected_size > buffer_size) return MEDIAN_ERROR_BUFFER_TOO_SMALL; + metadata_t * m = (metadata_t *)buffer; + + m->buffer_size = buffer_size - sizeof(metadata_t); + m->nLevels = m->buffer_size/(maxN * sizeof(median_data_t)); + bzero(m->data, m->buffer_size); + + return error; +} + + +
diff --git a/median.h b/median.h new file mode 100644 index 0000000..90a3650 --- /dev/null +++ b/median.h
@@ -0,0 +1,113 @@ +#ifndef __MONITORING_MEDIAN_H_INCLUDED__ +#define __MONITORING_MEDIAN_H_INCLUDED__ +#include <stdint.h> +#include <stddef.h> + +// ------------------------------------------------------------------------ +// The core phylosophy of the interface is this. +// The code and data structure are completely separated. +// There are no mallocs in this codebase. +// This is so that when we have a long running process which is using this +// these methods to maintain statistics, there is no memory allocation or +// fragmentation going on. This means that the statistics are always valid +// and we do not have to figure out how report them in panic situations. +// Other than memory corruption issues, the state of the buffer will always +// be good, and we will never be in a state where an update could not occur +// because of memory pressure. +// ------------------------------------------------------------------------ + +// The ErrorCode returned by all +// functions in this file. +typedef int32_t median_error_t; + +// A buffer which contains all the state held +// within an approximate median calculator. +typedef void * median_buffer_t; + +// A single data element. For now this is defined +// to be a 64 bit integer. But in principle this could be +// anything at all. +typedef uint64_t median_data_t; + +// All the error codes. +// No Error. +static median_error_t MEDIAN_ERROR_OK = 0; + +// The parameters are meaningless (for example, asking for neg values +// for epsilon). +static median_error_t MEDIAN_ERROR_INVALID_PARAMETERS = -1; + +// The buffer allocated is too small for the parameters requested. +static median_error_t MEDIAN_ERROR_BUFFER_TOO_SMALL = -2; + +// Overflow because MaxN has been exceeded. +static median_error_t MEDIAN_ERROR_MAX_N_EXCEEDED = -3; + +// Indicates that the size of the largest possible dataset is not known. +static size_t MEDIAN_MAX_N_UNKNOWN = 0; + +// The default size of the largest possible dataset. +static size_t MEDIAN_MAX_N_DEFAULT = 1024*1024*1024; // 1G. + +// This routine returns a suggested size for a buffer to hold all the +// state needed for a median calculation. Here epsilon is the permissible error. +// maxN is the largest dataset that we anticipate. If maxN is set to +// MEDIAN_MAX_N_UNKNOWN (or zero), then we assume MEDIAN_MAX_N_DEFAULT to be the value of +// maxN. The value return parameter is conventionally the last one. In this +// case, it contains the suggested_size of the buffer, assuming that the return +// value is MEDIAN_ERROR_OK. +// In all reasonable calls, the returned value will be +// MEDIAN_ERROR_OK. +median_error_t median_suggest_buffer_size(double epsilon, size_t maxN, size_t *suggested_size); + +// This routine will init the buffer. It will check if the buffer meets the +// size restriction returned by #median_suggest_buffer_size(). If not, it will +// exit with MEDIAN_ERROR_BUFFER_TOO_SMALL. +// The method can also exit with MEDIAN_ERR)R_INVALID_PARAMETERS in case there +// are some meaningless parameters passed in the call. +// We assume that a buffer of length buffer_size (in bytes) has been allocated and is +// available at the pointer specified by buffer. +// The parameters to median_suggest_buffer_size are passed into the data +// initialization routine, and the init routine will store them inside the data +// structure, so that the values can be used during the operation +// of the data structure. +// The value return parameter initialized_buffer is identical to the buffer +// variable (i.e. *initialized_buffer == buffer), but is appropriately cast +// to indicate that it is initialized. +// In all reasonable calls, the returned value will be +// MEDIAN_ERROR_OK. +median_error_t median_init_buffer(void * buffer, size_t buffer_size, double epsilon, size_t maxN, median_buffer_t *initialized_buffer); + +// This routine inserts a data point into the median computation. It's +// implemented to be really cheap. However, it is not always constant time. +// There can be invocations of this function which take a bit longer, but +// in no case can it become onerous. The runtime is ammortizable to a constant +// per invocation. +// The first parameter, buffer, is the pointer to the buffer. +// The second parameter, data, is the data to be inserted. +// If the data inserted exceeds MaxN, an error might be returned. In such a +// case, MEDIAN_ERROR_MAX_N_EXCEEDED will be returned. There is no guarantee +// that the error will be returned as soon as MaxN is exceeded. But no error +// will be returned until it is. +// If the parameters do not make sense, for example buffer is 0x0, +// then MEDIAN_ERROR_INVALID_PARAMETERS will be returned. +// In all reasonable calls, the returned value will be +// MEDIAN_ERROR_OK. +median_error_t median_insert_data(median_buffer_t buffer, median_data_t data); + +// This routine returns the approximate median of the data seen thusfar. +// This is implemented to be really cheap as well, but not as cheap as the +// median_insert_data() function. So it should be used a bit less than the other +// case. +// The first parameter, buffer, is the buffer which contains the inserted data. +// The second parameter, approximate_median, is a value return parameter which +// will, on exit, contain the value of the approximate median. +// The only condition under which this method will return an error is if one of +// the parameters is invalid (for example, approximate_median is null). +// By and large, if there is no bug in the calling code, this method should not +// return an error. +// If it does return an error, the error will be +// MEDIAN_ERROR_INVALID_PARAMETERS +median_error_t median_output_median(median_buffer_t buffer, median_data_t *approximate_median); + +#endif // ._MONITORING_MEDIAN>H_INCLUDED__
diff --git a/median.main.c b/median.main.c new file mode 100644 index 0000000..4879c04 --- /dev/null +++ b/median.main.c
@@ -0,0 +1,9 @@ +#include <stdio.h> +#include "median.h" + +int main(int argc, char * argv[]) { + size_t n = 0; + median_suggest_buffer_size(.001, 1000, &n); + printf("%ld\n", n); + return 0; +}