Intermediate. Basic insert seems to be working.
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3878e37 --- /dev/null +++ b/.gitignore
@@ -0,0 +1,3 @@ +*.o +*.d +*.main
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..27fbc86 --- /dev/null +++ b/Makefile
@@ -0,0 +1,19 @@ +SRCS = median.c \ + hex.c +MAIN = median.main.c +OBJS = $(patsubst %.c,%.o,$(SRCS)) +MAINO = $(patsubst %.c,%.o,$(MAIN)) +DEPS = $(patsubst %.c,%.d,$(SRCS) $(MAIN)) +EXECS = $(patsubst %.c,%,$(MAIN)) +FLAGS = -std=c99 -Wall +all : $(EXECS) +clean : + rm -f $(OBJS) $(DEPS) $(MAINO) $(EXECS) +%.o : %.c + gcc $(FLAGS) -MD -c $< +%.d : %.c + gcc $(FLAGS) -MD -c $< +%.main : %.main.o $(OBJS) + gcc $(FLAGS) $< $(OBJS) -o $@ + +-include $(DEPS)
diff --git a/README b/README new file mode 100644 index 0000000..51f845f --- /dev/null +++ b/README
@@ -0,0 +1,3 @@ +This is the repository for the median finding code. +Loosely based on the Patterson and Manku-Rajagopalan-Lindsay algorithms, but +somehow different (and better) than those.
diff --git a/hex.c b/hex.c new file mode 100644 index 0000000..cc42430 --- /dev/null +++ b/hex.c
@@ -0,0 +1,27 @@ +#include "hex.h" +#include <stdio.h> + +hexdmp_error_t stderr_hexdmp(const char * description, const unsigned char * data_ptr, size_t datasz){ + if(description) { + fprintf(stderr,"%s:\n", description); + } + if(datasz == 0) { + fprintf(stderr,"No Data, Size is 0"); + return HEXDMP_OK; + } + if(!data_ptr) { + fprintf(stderr,"No data pointer"); + return HEXDMP_ERROR; + } + for(size_t i = 0; i < datasz; i += 16) { + // First print the offset. + fprintf(stderr, "%08x", (unsigned) i); + // Now print the next (at most) 16 characters. + for(size_t j = 0; (j < 16) && (i+j) < datasz; j++) { + fprintf(stderr, " %02x", data_ptr[i+j]); + } + // Print a newline. + fprintf(stderr, "\n"); + } + return HEXDMP_OK; +}
diff --git a/hex.h b/hex.h new file mode 100644 index 0000000..d47281e --- /dev/null +++ b/hex.h
@@ -0,0 +1,11 @@ +#ifndef __MEDIAN_HEX_H_INCLUDED__ +#define __MEDIAN_HEX_H_INCLUDED__ +#include <strings.h> // size_t +typedef int hexdmp_error_t; +static const hexdmp_error_t HEXDMP_OK = 0; +static const hexdmp_error_t HEXDMP_ERROR = -1; + +// The function produces an output to stderr similar to the "hexdump" +// application. Useful in debugging things. +hexdmp_error_t stderr_hexdmp(const char * description, const unsigned char * data_ptr, size_t datasz); +#endif // __MEDIAN_HEX_H_INCLUDED__
diff --git a/median.c b/median.c new file mode 100644 index 0000000..9a6f183 --- /dev/null +++ b/median.c
@@ -0,0 +1,196 @@ +#include "median.h" +#include "hex.h" +#include <strings.h> +#include <stdio.h> + +// The structure of the data buffer is this: +// ---------------------------------------------------------------------------- +// | Metadata | BlockData | Data * nMembers | BlockData | Data * nMembers ..... +// ---------------------------------------------------------------------------- +// Where the number of blocks is nLevels and stored in Metadata, and nMembers +// is stored in Metadata as well. +// The buffer is cast into this struct. +typedef struct metadata { + unsigned int gid; // The next available global id + size_t nLevels; // Number of buffers available. + size_t nMembers; // The number of members in each block. + size_t currentBufferIdx; // The index of the currently active buffer. + unsigned char data[0]; // The start of the data. +} metadata_t; + +typedef struct bufferdata { + unsigned int id; // Block identity. + size_t level; // The level of this buffer. + size_t occupancy; // Number of occupants for this buffer. + median_data_t data[0]; // The data for this buffer. +} blockdata_t; + + +// ---------------------------------------------------------------------------- +// Helper functions. +// +// This turns epsilon into a suggested size for each block. +size_t calculate_nMembers(double epsilon) { + return (size_t) ((1.0 / epsilon) + 1); +} + +// This turns maxN into the number of levels needed. It's ceiling(log2(maxN)). +size_t calculate_nLevels(size_t maxN) { + size_t nLevels = 1; + while((maxN = (maxN >> 1))) nLevels++; + return nLevels; +} + +// This returns the blocksize. +size_t blocksize(metadata_t * m) { + return sizeof(blockdata_t) + m->nMembers * sizeof(median_data_t); +} + +// This returns the size of the entire buffer. +size_t buffersize(metadata_t * m) { + return sizeof(metadata_t) + m->nLevels * blocksize(m); +} + +// Returns the first byte address which is out of bounds. +unsigned char * out_of_bounds(metadata_t * m) { + return m->data + buffersize(m) - sizeof(blockdata_t); +} + +// Get the current block. +blockdata_t * get_current_block(metadata_t *m) { + size_t offset = m->currentBufferIdx * blocksize(m); + return (blockdata_t *)(m->data + offset); +} +// Does the current block have space? +unsigned int current_block_has_space(metadata_t* m) { + return (get_current_block(m)->occupancy < m->nMembers); +} + +// This will only be called in a safe situation. It's an internal function. +// Does not bound check! +// Do not call it unless you know what you are doing. +void insert_data_into_current_block(metadata_t *m, median_data_t data) { + blockdata_t * b = get_current_block(m); + b->data[b->occupancy++] = data; +} + +unsigned int occupy_available_empty_block(metadata_t * m) { + size_t idx = 0; + for(unsigned char * ptr = (unsigned char *) m->data; // Start of the data blocks. + ptr <= (m->data + buffersize(m) - blocksize(m)); // The next block will fit into the buffer. + ptr += blocksize(m)) { // Advance by a block + blockdata_t * b = (blockdata_t *)ptr; + if(b->occupancy < m->nMembers) { + // Empty block found! + m->currentBufferIdx = idx; + return 1; + } else { + idx++; + } + } + return 0; +} + +void create_empty_block(metadata_t * m) { + // TODO +} + +// Shift the current block. +void shift_current_block(metadata_t * m) { + if(!occupy_available_empty_block(m)) { + create_empty_block(m); + } +} + + +// ---------------------------------------------------------------------------- +// Implementation of the interface functions (specified in median.h). +median_error_t median_suggest_buffer_size(double epsilon, size_t maxN, size_t *suggested_size){ + // First check for sane parameters. + + // Check that maxN is not zero. + if (maxN <= 0) return MEDIAN_ERROR_INVALID_PARAMETERS; + + // If n is too large, (i.e. epsilon is too small), you are looking for too fine an approximation. + // This is not the right algorithm. Maybe consider random sampling instead. + if (calculate_nMembers(epsilon) > 100000) return MEDIAN_ERROR_INVALID_PARAMETERS; + + // Calculate the required buffer size for the right nLevels and nMembers + // value. + metadata_t m; + m.nLevels = calculate_nLevels(maxN); + m.nMembers = calculate_nMembers(epsilon); + *suggested_size = buffersize(&m); + + // Things worked. Return MEDIAN_OK + return MEDIAN_OK; +} +median_error_t median_init_buffer(void * buffer, size_t buffer_size, double epsilon, size_t maxN, median_buffer_t *initialized_buffer){ + + // Initialize. + size_t expectedSize = 0; + median_error_t error = MEDIAN_OK; + + // First run median_suggest_buffer_size + if ((error = median_suggest_buffer_size(epsilon,maxN,&expectedSize)) != MEDIAN_OK) return error; + // Ensure that the buffer is large enough. + if (expectedSize > buffer_size) return MEDIAN_ERROR_BUFFER_TOO_SMALL; + // Cast the buffer as metadata_t so that we can initialize the metadata block. + metadata_t * m = (metadata_t *)buffer; + // Initialize the metadata block. + m->gid = 1; + m->nMembers = calculate_nMembers(epsilon); + m->nLevels = calculate_nLevels(maxN); + m->currentBufferIdx = 0; + // Zero all the data following the metadata block. + // Strictly this should not be necessary, but I'm a coward! + bzero(m->data, buffersize(m) - sizeof(metadata_t)); + + // cast the data part as an array of blockdata_t fronted blocks. + // and initialize each of the blocks. + for(unsigned char * ptr = (unsigned char *) m->data; // Start of the data blocks. + ptr <= (m->data + buffersize(m) - blocksize(m)); // The next block will fit into the buffer. + ptr += blocksize(m)) { // Advance by a block + blockdata_t * t = (blockdata_t *) ptr; + t->id = (m->gid)++; + t->level = 0; + t->occupancy = 0; + } + + // Populate the return value. + *initialized_buffer = (median_buffer_t)buffer; + return error; +} + + +median_error_t median_insert_data(median_buffer_t buffer, + median_data_t data){ + metadata_t * m = (metadata_t *) buffer; + if(current_block_has_space(m)) { + insert_data_into_current_block(m,data); + } else { + shift_current_block(m); + return median_insert_data(buffer,data); + } + return MEDIAN_OK; +} + +median_error_t median_output_median(const median_buffer_t buffer, + median_data_t *approximate_median){ + return MEDIAN_OK; +} + +median_error_t median_dump_stderr(const median_buffer_t buffer){ + metadata_t * m = (metadata_t *) buffer; + blockdata_t * current = get_current_block(m); + fprintf(stderr, "Metadata:\n Next GID:%d\n nMembers:%d\n nLevels:%d\n",m->gid,(unsigned)m->nMembers,(unsigned)m->nLevels); + fprintf(stderr, "Current Block:%d\n", current->id); + for(unsigned char * ptr = (unsigned char *) m->data; // Start of the data blocks. + ptr <= (m->data + buffersize(m) - blocksize(m)); // The next block will fit into the buffer. + ptr += blocksize(m)) { // Advance by a block + blockdata_t * b = (blockdata_t *)ptr; + fprintf(stderr, "Block:%d\n Level:%d\n Occupancy:%d\n", b->id,(unsigned)b->level,(unsigned)b->occupancy); + stderr_hexdmp("Data:",(unsigned char *)(b->data),256); + } + return MEDIAN_OK; +}
diff --git a/median.h b/median.h new file mode 100644 index 0000000..c62fbbd --- /dev/null +++ b/median.h
@@ -0,0 +1,116 @@ +#ifndef __MONITORING_MEDIAN_H_INCLUDED__ +#define __MONITORING_MEDIAN_H_INCLUDED__ +#include <stdint.h> +#include <stddef.h> + +// ------------------------------------------------------------------------ +// The core phylosophy of the interface is this. +// The code and data structure are completely separated. +// There are no mallocs in this codebase. +// This is so that when we have a long running process which is using this +// these methods to maintain statistics, there is no memory allocation or +// fragmentation going on. This means that the statistics are always valid +// and we do not have to figure out how report them in panic situations. +// Other than memory corruption issues, the state of the buffer will always +// be good, and we will never be in a state where an update could not occur +// because of memory pressure. +// ------------------------------------------------------------------------ + +// The ErrorCode returned by all +// functions in this file. +typedef int32_t median_error_t; + +// A buffer which contains all the state held +// within an approximate median calculator. +typedef void * median_buffer_t; + +// A single data element. For now this is defined +// to be a 64 bit integer. But in principle this could be +// anything at all. +typedef uint64_t median_data_t; + +// All the error codes. +// No Error. +static const median_error_t MEDIAN_OK = 0; + +// The parameters are meaningless (for example, asking for neg values +// for epsilon). +static const median_error_t MEDIAN_ERROR_INVALID_PARAMETERS = -1; + +// The buffer allocated is too small for the parameters requested. +static const median_error_t MEDIAN_ERROR_BUFFER_TOO_SMALL = -2; + +// Overflow because MaxN has been exceeded. +static const median_error_t MEDIAN_ERROR_MAX_N_EXCEEDED = -3; + +// Indicates that the size of the largest possible dataset is not known. +static const size_t MEDIAN_MAX_N_UNKNOWN = 0; + +// The default size of the largest possible dataset. +static const size_t MEDIAN_MAX_N_DEFAULT = 1024*1024*1024; // 1G. + +// This routine returns a suggested size for a buffer to hold all the +// state needed for a median calculation. Here epsilon is the permissible error. +// maxN is the largest dataset that we anticipate. If maxN is set to +// MEDIAN_MAX_N_UNKNOWN (or zero), then we assume MEDIAN_MAX_N_DEFAULT to be the value of +// maxN. The value return parameter is conventionally the last one. In this +// case, it contains the suggested_size of the buffer, assuming that the return +// value is MEDIAN_ERROR_OK. +// In all reasonable calls, the returned value will be +// MEDIAN_ERROR_OK. +median_error_t median_suggest_buffer_size(double epsilon, size_t maxN, size_t *suggested_size); + +// This routine will init the buffer. It will check if the buffer meets the +// size restriction returned by #median_suggest_buffer_size(). If not, it will +// exit with MEDIAN_ERROR_BUFFER_TOO_SMALL. +// The method can also exit with MEDIAN_ERR)R_INVALID_PARAMETERS in case there +// are some meaningless parameters passed in the call. +// We assume that a buffer of length buffer_size (in bytes) has been allocated and is +// available at the pointer specified by buffer. +// The parameters to median_suggest_buffer_size are passed into the data +// initialization routine, and the init routine will store them inside the data +// structure, so that the values can be used during the operation +// of the data structure. +// The value return parameter initialized_buffer is identical to the buffer +// variable (i.e. *initialized_buffer == buffer), but is appropriately cast +// to indicate that it is initialized. +// In all reasonable calls, the returned value will be +// MEDIAN_ERROR_OK. +median_error_t median_init_buffer(void * buffer, size_t buffer_size, double epsilon, size_t maxN, median_buffer_t *initialized_buffer); + +// This routine inserts a data point into the median computation. It's +// implemented to be really cheap. However, it is not always constant time. +// There can be invocations of this function which take a bit longer, but +// in no case can it become onerous. The runtime is ammortizable to a constant +// per invocation. +// The first parameter, buffer, is the pointer to the buffer. +// The second parameter, data, is the data to be inserted. +// If the data inserted exceeds MaxN, an error might be returned. In such a +// case, MEDIAN_ERROR_MAX_N_EXCEEDED will be returned. There is no guarantee +// that the error will be returned as soon as MaxN is exceeded. But no error +// will be returned until it is. +// If the parameters do not make sense, for example buffer is 0x0, +// then MEDIAN_ERROR_INVALID_PARAMETERS will be returned. +// In all reasonable calls, the returned value will be +// MEDIAN_ERROR_OK. +median_error_t median_insert_data(median_buffer_t buffer, median_data_t data); + +// This routine returns the approximate median of the data seen thusfar. +// This is implemented to be really cheap as well, but not as cheap as the +// median_insert_data() function. So it should be used a bit less than the other +// case. +// The first parameter, buffer, is the buffer which contains the inserted data. +// The second parameter, approximate_median, is a value return parameter which +// will, on exit, contain the value of the approximate median. +// The only condition under which this method will return an error is if one of +// the parameters is invalid (for example, approximate_median is null). +// By and large, if there is no bug in the calling code, this method should not +// return an error. +// If it does return an error, the error will be +// MEDIAN_ERROR_INVALID_PARAMETERS +median_error_t median_output_median(const median_buffer_t buffer, median_data_t *approximate_median); + +// This hexdumps the state of the buffer to stderr so that it can help debug if needed. +median_error_t median_dump_stderr(const median_buffer_t buffer); + +#endif // ._MONITORING_MEDIAN>H_INCLUDED__
diff --git a/median.main.c b/median.main.c new file mode 100644 index 0000000..1aef5c7 --- /dev/null +++ b/median.main.c
@@ -0,0 +1,31 @@ +#include <stdlib.h> // size_t +#include <assert.h> // assert +#include "median.h" +#include "hex.h" + +// A sinple main routine meant as an example, +// and also to unit test most of the intersting +// functions of the median module. +int main(int argc, char * argv[]) { + // Call suggest buffer size to determine the size of the buffer + // we need to build the data structure. + size_t n = 0; + assert(median_suggest_buffer_size(.001, 1000, &n) == MEDIAN_OK); + + // Allocate the buffer of the required size. And then + // initialize it (median_init_buffer). + void * b = malloc(n); + median_buffer_t buf; + assert(median_init_buffer(b, n, .001, 1000, &buf) == MEDIAN_OK); + + // Insert some data. + for(median_data_t i = 1; i < 3010; i++) { + assert(median_insert_data(buf,i) == MEDIAN_OK); + } + + // Dump it. + assert(median_dump_stderr(buf) == MEDIAN_OK); + + // All done, return 0 + return 0; +}