squiggle.c

Self-contained Monte Carlo estimation in C99
Log | Files | Refs | README

commit e053a726eeee9e8f90b3290daf0ab4fa1b9eb3da
parent d531d5571ff74b9ee8c7073c82c1b49ac6b4e359
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sun, 23 Jul 2023 19:11:25 +0200

add example of getting confidence interval & misc changes

Diffstat:
MREADME.md | 22+++++++++++++++-------
Mexamples/01_one_sample/example | 0
Mexamples/02_many_samples/example | 0
Mexamples/03_gcc_nested_function/example | 0
Mexamples/04_sample_from_cdf_simple/example | 0
Mexamples/04_sample_from_cdf_simple/example.c | 4++--
Mexamples/05_sample_from_cdf_beta/example | 0
Mexamples/06_gamma_beta/example | 0
Mexamples/06_gamma_beta/example.c | 7-------
Aexamples/07_ci_beta/example | 0
Aexamples/07_ci_beta/example.c | 21+++++++++++++++++++++
Aexamples/07_ci_beta/makefile | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Mmakefile | 1+
Msquiggle.c | 43+++++++++++++++++++++++++++++++++++++++++++
Msquiggle.h | 7+++++++
15 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md @@ -11,7 +11,8 @@ A self-contained C99 library that provides a subset of [Squiggle](https://www.sq - Because it can fit in my head - Because if you can implement something in C, you can implement it anywhere else - Because it can be made faster if need be - - e.g., with a multi-threading library like OpenMP, or by adding more algorithmic complexity + - e.g., with a multi-threading library like OpenMP, + - or by implementing faster but more complex algorithms - or more simply, by inlining the sampling functions (adding an `inline` directive before their function declaration) - **Because there are few abstractions between it and machine code** (C => assembly => machine code with gcc, or C => machine code, with tcc), leading to fewer errors beyond the programmer's control. @@ -184,16 +185,11 @@ int main(){ ## To do list -- [ ] Test summary statistics for each of the distributions. +- [ ] Pontificate about lognormal tests - [ ] Have some more complicated & realistic example - [ ] Add summarization functions: 90% ci (or all c.i.?) - [ ] Systematize references - [ ] Publish online -- [ ] Add efficient sampling from a beta distribution - - https://dl.acm.org/doi/10.1145/358407.358414 - - https://link.springer.com/article/10.1007/bf02293108 - - https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution - - https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c - [ ] Support all distribution functions in <https://www.squiggle-language.com/docs/Api/Dist> - [ ] Support all distribution functions in <https://www.squiggle-language.com/docs/Api/Dist>, and do so efficiently @@ -224,3 +220,15 @@ int main(){ - https://dl.acm.org/doi/pdf/10.1145/358407.358414 - [x] Explain correlated samples - [-] ~~Add tests in Stan?~~ +- [x] Test summary statistics for each of the distributions. + - [x] For uniform + - [x] For normal + - [x] For lognormal + - [x] For lognormal (to syntax) + - [x] For beta distribution +- [x] Clarify gamma/standard gamma +- [x] Add efficient sampling from a beta distribution + - https://dl.acm.org/doi/10.1145/358407.358414 + - https://link.springer.com/article/10.1007/bf02293108 + - https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution + - https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c diff --git a/examples/01_one_sample/example b/examples/01_one_sample/example Binary files differ. diff --git a/examples/02_many_samples/example b/examples/02_many_samples/example Binary files differ. diff --git a/examples/03_gcc_nested_function/example b/examples/03_gcc_nested_function/example Binary files differ. diff --git a/examples/04_sample_from_cdf_simple/example b/examples/04_sample_from_cdf_simple/example Binary files differ. diff --git a/examples/04_sample_from_cdf_simple/example.c b/examples/04_sample_from_cdf_simple/example.c @@ -87,9 +87,9 @@ int main() printf("\nGetting some samples from sample_unit_normal\n"); clock_t begin_2 = clock(); - + double* normal_samples = malloc(NUM_SAMPLES * sizeof(double)); for (int i = 0; i < NUM_SAMPLES; i++) { - double normal_sample = sample_unit_normal(seed); + normal_samples[i] = sample_unit_normal(seed); // printf("%f\n", normal_sample); } diff --git a/examples/05_sample_from_cdf_beta/example b/examples/05_sample_from_cdf_beta/example Binary files differ. diff --git a/examples/06_gamma_beta/example b/examples/06_gamma_beta/example Binary files differ. diff --git a/examples/06_gamma_beta/example.c b/examples/06_gamma_beta/example.c @@ -43,10 +43,3 @@ int main() free(seed); } -/* -Aggregation mechanisms: -- Quantiles (requires a sort) -- Sum -- Average -- Std -*/ diff --git a/examples/07_ci_beta/example b/examples/07_ci_beta/example Binary files differ. diff --git a/examples/07_ci_beta/example.c b/examples/07_ci_beta/example.c @@ -0,0 +1,21 @@ +#include "../../squiggle.h" +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> + +// Estimate functions +double beta_1_2_sampler(uint64_t* seed){ + return sample_beta(1, 2.0, seed); +} + +int main() +{ + // set randomness seed + uint64_t* seed = malloc(sizeof(uint64_t)); + *seed = 1000; // xorshift can't start with 0 + + struct c_i beta_1_2_ci_90 = get_90_confidence_interval(beta_1_2_sampler, seed); + printf("90%% confidence interval of beta(1,2) is [%f, %f]\n", beta_1_2_ci_90.low, beta_1_2_ci_90.high); + + free(seed); +} diff --git a/examples/07_ci_beta/makefile b/examples/07_ci_beta/makefile @@ -0,0 +1,53 @@ +# Interface: +# make +# make build +# make format +# make run + +# Compiler +CC=gcc +# CC=tcc # <= faster compilation + +# Main file +SRC=example.c ../../squiggle.c +OUTPUT=example + +## Dependencies +MATH=-lm + +## Flags +DEBUG= #'-g' +STANDARD=-std=c99 +WARNINGS=-Wall +OPTIMIZED=-O3 #-Ofast +# OPENMP=-fopenmp + +## Formatter +STYLE_BLUEPRINT=webkit +FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT) + +## make build +build: $(SRC) + $(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT) + +format: $(SRC) + $(FORMATTER) $(SRC) + +run: $(SRC) $(OUTPUT) + OMP_NUM_THREADS=1 ./$(OUTPUT) && echo + +time-linux: + @echo "Requires /bin/time, found on GNU/Linux systems" && echo + + @echo "Running 100x and taking avg time $(OUTPUT)" + @t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo + +## Profiling + +profile-linux: + echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar" + echo "Must be run as sudo" + $(CC) $(SRC) $(MATH) -o $(OUTPUT) + sudo perf record ./$(OUTPUT) + sudo perf report + rm perf.data diff --git a/makefile b/makefile @@ -11,6 +11,7 @@ all: cd examples/04_sample_from_cdf_simple && make && echo cd examples/05_sample_from_cdf_beta && make && echo cd examples/06_gamma_beta && make && echo + cd examples/07_ci_beta && make && echo format: squiggle.c squiggle.h $(FORMATTER) squiggle.c diff --git a/squiggle.c b/squiggle.c @@ -94,6 +94,12 @@ double sample_gamma(double alpha, uint64_t* seed) // A Simple Method for Generating Gamma Variables, Marsaglia and Wan Tsang, 2001 // https://dl.acm.org/doi/pdf/10.1145/358407.358414 // see also the references/ folder + // Note that the Wikipedia page for the gamma distribution includes a scaling parameter + // k or beta + // https://en.wikipedia.org/wiki/Gamma_distribution + // such that gamma_k(alpha, k) = k * gamma(alpha) + // or gamma_beta(alpha, beta) = gamma(alpha) / beta + // So far I have not needed to use this, and thus the second parameter is by default 1. if (alpha >= 1) { double d, c, x, v, u; d = alpha - 1.0 / 3.0; @@ -377,6 +383,43 @@ struct box sampler_cdf_double(double cdf(double), uint64_t* seed) return result; } +// Get confidence intervals, given a sampler + +struct c_i { + float low; + float high; +}; +int compare_doubles(const void *p, const void *q) { + // https://wikiless.esmailelbob.xyz/wiki/Qsort?lang=en + double x = *(const double *)p; + double y = *(const double *)q; + + /* Avoid return x - y, which can cause undefined behaviour + because of signed integer overflow. */ + if (x < y) + return -1; // Return -1 if you want ascending, 1 if you want descending order. + else if (x > y) + return 1; // Return 1 if you want ascending, -1 if you want descending order. + + return 0; +} +struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed){ + int n = 100 * 1000; + double* samples_array = malloc(n * sizeof(double)); + for(int i=0; i<n; i++){ + samples_array[i] = sampler(seed); + } + qsort(samples_array, n, sizeof(double), compare_doubles); + + struct c_i result = { + .low = samples_array[5000], + .high =samples_array[94999], + }; + free(samples_array); + + return result; +} + /* Could also define other variations, e.g., double sampler_danger(struct box cdf(double), uint64_t* seed) { diff --git a/squiggle.h b/squiggle.h @@ -50,4 +50,11 @@ struct box inverse_cdf_box(struct box cdf_box(double), double p); struct box sampler_cdf_double(double cdf(double), uint64_t* seed); struct box sampler_cdf_box(struct box cdf(double), uint64_t* seed); +// Get 90% confidence interval +struct c_i { + float low; + float high; +}; +struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed); + #endif