add example of getting confidence interval & misc changes - squiggle.c - Self-contained Monte Carlo estimation in C99

commit e053a726eeee9e8f90b3290daf0ab4fa1b9eb3da
parent d531d5571ff74b9ee8c7073c82c1b49ac6b4e359
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sun, 23 Jul 2023 19:11:25 +0200

add example of getting confidence interval & misc changes

Diffstat:
M README.md  | 22 +++++++++++++++-------
M examples/01_one_sample/example  | 0 
M examples/02_many_samples/example  | 0 
M examples/03_gcc_nested_function/example  | 0 
M examples/04_sample_from_cdf_simple/example  | 0 
M examples/04_sample_from_cdf_simple/example.c  | 4 ++--
M examples/05_sample_from_cdf_beta/example  | 0 
M examples/06_gamma_beta/example  | 0 
M examples/06_gamma_beta/example.c  | 7 -------
A examples/07_ci_beta/example  | 0 
A examples/07_ci_beta/example.c  | 21 +++++++++++++++++++++
A examples/07_ci_beta/makefile  | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
M makefile  | 1 +
M squiggle.c  | 43 +++++++++++++++++++++++++++++++++++++++++++
M squiggle.h  | 7 +++++++

15 files changed, 142 insertions(+), 16 deletions(-)
diff --git a/README.md b/README.md
@@ -11,7 +11,8 @@ A self-contained C99 library that provides a subset of [Squiggle](https://www.sq
 - Because it can fit in my head
 - Because if you can implement something in C, you can implement it anywhere else
 - Because it can be made faster if need be
-  - e.g., with a multi-threading library like OpenMP, or by adding more algorithmic complexity
+  - e.g., with a multi-threading library like OpenMP, 
+  - or by implementing faster but more complex algorithms
   - or more simply, by inlining the sampling functions (adding an `inline` directive before their function declaration)
 - **Because there are few abstractions between it and machine code** (C => assembly => machine code with gcc, or C => machine code, with tcc), leading to fewer errors beyond the programmer's control.
 
@@ -184,16 +185,11 @@ int main(){
 
 ## To do list
 
-- [ ] Test summary statistics for each of the distributions.
+- [ ] Pontificate about lognormal tests
 - [ ] Have some more complicated & realistic example
 - [ ] Add summarization functions: 90% ci (or all c.i.?) 
 - [ ] Systematize references
 - [ ] Publish online
-- [ ] Add efficient sampling from a beta distribution
-  - https://dl.acm.org/doi/10.1145/358407.358414
-  - https://link.springer.com/article/10.1007/bf02293108
-  - https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution
-  - https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c
 - [ ] Support all distribution functions in <https://www.squiggle-language.com/docs/Api/Dist>
 - [ ] Support all distribution functions in <https://www.squiggle-language.com/docs/Api/Dist>, and do so efficiently
 
@@ -224,3 +220,15 @@ int main(){
   - https://dl.acm.org/doi/pdf/10.1145/358407.358414
 - [x] Explain correlated samples
 - [-] ~~Add tests in Stan?~~
+- [x] Test summary statistics for each of the distributions.
+  - [x] For uniform
+  - [x] For normal
+  - [x] For lognormal
+  - [x] For lognormal (to syntax)
+  - [x] For beta distribution
+- [x] Clarify gamma/standard gamma
+- [x] Add efficient sampling from a beta distribution
+  - https://dl.acm.org/doi/10.1145/358407.358414
+  - https://link.springer.com/article/10.1007/bf02293108
+  - https://stats.stackexchange.com/questions/502146/how-does-numpy-generate-samples-from-a-beta-distribution
+  - https://github.com/numpy/numpy/blob/5cae51e794d69dd553104099305e9f92db237c53/numpy/random/src/distributions/distributions.c
diff --git a/examples/01_one_sample/example b/examples/01_one_sample/example
Binary files differ.
diff --git a/examples/02_many_samples/example b/examples/02_many_samples/example
Binary files differ.
diff --git a/examples/03_gcc_nested_function/example b/examples/03_gcc_nested_function/example
Binary files differ.
diff --git a/examples/04_sample_from_cdf_simple/example b/examples/04_sample_from_cdf_simple/example
Binary files differ.
diff --git a/examples/04_sample_from_cdf_simple/example.c b/examples/04_sample_from_cdf_simple/example.c
@@ -87,9 +87,9 @@ int main()
     printf("\nGetting some samples from sample_unit_normal\n");
 
     clock_t begin_2 = clock();
-
+		double* normal_samples = malloc(NUM_SAMPLES * sizeof(double));
     for (int i = 0; i < NUM_SAMPLES; i++) {
-        double normal_sample = sample_unit_normal(seed);
+        normal_samples[i] = sample_unit_normal(seed);
         // printf("%f\n", normal_sample);
     }
 
diff --git a/examples/05_sample_from_cdf_beta/example b/examples/05_sample_from_cdf_beta/example
Binary files differ.
diff --git a/examples/06_gamma_beta/example b/examples/06_gamma_beta/example
Binary files differ.
diff --git a/examples/06_gamma_beta/example.c b/examples/06_gamma_beta/example.c
@@ -43,10 +43,3 @@ int main()
 		free(seed);
 }
 
-/* 
-Aggregation mechanisms:
-- Quantiles (requires a sort)
-- Sum 
-- Average
-- Std
-*/
diff --git a/examples/07_ci_beta/example b/examples/07_ci_beta/example
Binary files differ.
diff --git a/examples/07_ci_beta/example.c b/examples/07_ci_beta/example.c
@@ -0,0 +1,21 @@
+#include "../../squiggle.h"
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+// Estimate functions
+double beta_1_2_sampler(uint64_t* seed){
+	return sample_beta(1, 2.0, seed);
+}
+
+int main()
+{
+    // set randomness seed
+    uint64_t* seed = malloc(sizeof(uint64_t));
+    *seed = 1000; // xorshift can't start with 0
+
+		struct c_i beta_1_2_ci_90 = get_90_confidence_interval(beta_1_2_sampler, seed);
+		printf("90%% confidence interval of beta(1,2) is [%f, %f]\n", beta_1_2_ci_90.low, beta_1_2_ci_90.high);
+		
+		free(seed);
+}
diff --git a/examples/07_ci_beta/makefile b/examples/07_ci_beta/makefile
@@ -0,0 +1,53 @@
+# Interface: 
+#   make
+#   make build
+#   make format
+#   make run
+
+# Compiler
+CC=gcc
+# CC=tcc # <= faster compilation
+
+# Main file
+SRC=example.c ../../squiggle.c
+OUTPUT=example
+
+## Dependencies
+MATH=-lm
+
+## Flags
+DEBUG= #'-g'
+STANDARD=-std=c99
+WARNINGS=-Wall
+OPTIMIZED=-O3  #-Ofast
+# OPENMP=-fopenmp
+
+## Formatter
+STYLE_BLUEPRINT=webkit
+FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
+
+## make build
+build: $(SRC)
+	$(CC) $(OPTIMIZED) $(DEBUG) $(SRC) $(MATH) -o $(OUTPUT)
+
+format: $(SRC)
+	$(FORMATTER) $(SRC)
+
+run: $(SRC) $(OUTPUT)
+	OMP_NUM_THREADS=1 ./$(OUTPUT) && echo
+
+time-linux: 
+	@echo "Requires /bin/time, found on GNU/Linux systems" && echo
+	
+	@echo "Running 100x and taking avg time $(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 1 thread: |" | sed 's|$$|ms|' && echo
+
+## Profiling
+
+profile-linux: 
+	echo "Requires perf, which depends on the kernel version, and might be in linux-tools package or similar"
+	echo "Must be run as sudo"
+	$(CC) $(SRC) $(MATH) -o $(OUTPUT)
+	sudo perf record ./$(OUTPUT)
+	sudo perf report
+	rm perf.data
diff --git a/makefile b/makefile
@@ -11,6 +11,7 @@ all:
 	cd examples/04_sample_from_cdf_simple && make && echo
 	cd examples/05_sample_from_cdf_beta && make && echo
 	cd examples/06_gamma_beta && make && echo
+	cd examples/07_ci_beta && make && echo
 
 format: squiggle.c squiggle.h
 	$(FORMATTER) squiggle.c
diff --git a/squiggle.c b/squiggle.c
@@ -94,6 +94,12 @@ double sample_gamma(double alpha, uint64_t* seed)
     // A Simple Method for Generating Gamma Variables, Marsaglia and Wan Tsang, 2001
     // https://dl.acm.org/doi/pdf/10.1145/358407.358414
     // see also the references/ folder
+		// Note that the Wikipedia page for the gamma distribution includes a scaling parameter
+		// k or beta
+		// https://en.wikipedia.org/wiki/Gamma_distribution
+		// such that gamma_k(alpha, k) = k * gamma(alpha)
+		// or gamma_beta(alpha, beta) = gamma(alpha) / beta
+		// So far I have not needed to use this, and thus the second parameter is by default 1.
     if (alpha >= 1) {
         double d, c, x, v, u;
         d = alpha - 1.0 / 3.0;
@@ -377,6 +383,43 @@ struct box sampler_cdf_double(double cdf(double), uint64_t* seed)
     return result;
 }
 
+// Get confidence intervals, given a sampler
+
+struct c_i {
+	float low;
+	float high;
+};
+int compare_doubles(const void *p, const void *q) {
+		// https://wikiless.esmailelbob.xyz/wiki/Qsort?lang=en
+    double x = *(const double *)p;
+    double y = *(const double *)q;
+
+    /* Avoid return x - y, which can cause undefined behaviour
+       because of signed integer overflow. */
+    if (x < y)
+        return -1;  // Return -1 if you want ascending, 1 if you want descending order. 
+    else if (x > y)
+        return 1;   // Return 1 if you want ascending, -1 if you want descending order.
+
+    return 0;
+}
+struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed){
+	int n = 100 * 1000;
+	double* samples_array = malloc(n * sizeof(double));
+	for(int i=0; i<n; i++){
+		samples_array[i] = sampler(seed);
+	}
+	qsort(samples_array, n, sizeof(double), compare_doubles);
+
+	struct c_i result = {
+		.low = samples_array[5000], 
+		.high =samples_array[94999],
+	};
+	free(samples_array);
+
+	return result;
+}
+
 /* Could also define other variations, e.g.,
 double sampler_danger(struct box cdf(double), uint64_t* seed)
 {
diff --git a/squiggle.h b/squiggle.h
@@ -50,4 +50,11 @@ struct box inverse_cdf_box(struct box cdf_box(double), double p);
 struct box sampler_cdf_double(double cdf(double), uint64_t* seed);
 struct box sampler_cdf_box(struct box cdf(double), uint64_t* seed);
 
+// Get 90% confidence interval
+struct c_i {
+	float low;
+	float high;
+};
+struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed);
+
 #endif

	squiggle.c Self-contained Monte Carlo estimation in C99
	Log \| Files \| Refs \| README

M	README.md	\|	22	+++++++++++++++-------
M	examples/01_one_sample/example	\|	0
M	examples/02_many_samples/example	\|	0
M	examples/03_gcc_nested_function/example	\|	0
M	examples/04_sample_from_cdf_simple/example	\|	0
M	examples/04_sample_from_cdf_simple/example.c	\|	4	++--
M	examples/05_sample_from_cdf_beta/example	\|	0
M	examples/06_gamma_beta/example	\|	0
M	examples/06_gamma_beta/example.c	\|	7	-------
A	examples/07_ci_beta/example	\|	0
A	examples/07_ci_beta/example.c	\|	21	+++++++++++++++++++++
A	examples/07_ci_beta/makefile	\|	53	+++++++++++++++++++++++++++++++++++++++++++++++++++++
M	makefile	\|	1	+
M	squiggle.c	\|	43	+++++++++++++++++++++++++++++++++++++++++++
M	squiggle.h	\|	7	+++++++