move some functions from scratchpad => squiggle.c, reorg - squiggle.c - Self-contained Monte Carlo estimation in C99

commit ee9ed342877423bcce2040d181a52674c297cf56
parent 68e7730f24c33268a151dd197534f8c28ed376e0
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sun, 16 Jul 2023 21:26:33 +0200

move some functions from scratchpad => squiggle.c, reorg

Diffstat:
M scratchpad/makefile  | 2 +-
M scratchpad/scratchpad  | 0 
M scratchpad/scratchpad.c  | 242 ++++---------------------------------------------------------------------------
M squiggle.c  | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M squiggle.h  | 34 +++++++++++++++++++++++++++++++++-

5 files changed, 232 insertions(+), 233 deletions(-)
diff --git a/scratchpad/makefile b/scratchpad/makefile
@@ -9,7 +9,7 @@ CC=gcc # required for nested functions
 # CC=tcc # <= faster compilation
 
 # Main file
-SRC=scratchpad.c
+SRC=scratchpad.c ../squiggle.c
 OUTPUT=./scratchpad
 
 ## Dependencies
diff --git a/scratchpad/scratchpad b/scratchpad/scratchpad
Binary files differ.
diff --git a/scratchpad/scratchpad.c b/scratchpad/scratchpad.c
@@ -1,33 +1,13 @@
-#include <float.h> // FLT_MAX, FLT_MIN
-#include <limits.h> // INT_MAX
 #include <math.h> // erf, sqrt
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
-// #include <sys/types.h>
 #include <time.h>
+#include "../squiggle.h"
 
-#define EXIT_ON_ERROR 0
-#define MAX_ERROR_LENGTH 500
-#define PROCESS_ERROR(...)                                                              \
-    do {                                                                                \
-        if (EXIT_ON_ERROR) {                                                            \
-            printf("@, in %s (%d)", __FILE__, __LINE__);                                \
-            exit(1);                                                                    \
-        } else {                                                                        \
-            char error_msg[MAX_ERROR_LENGTH];                                           \
-            snprintf(error_msg, MAX_ERROR_LENGTH, "@, in %s (%d)", __FILE__, __LINE__); \
-            struct box error = { .empty = 1, .error_msg = error_msg };                  \
-            return error;                                                               \
-        }                                                                               \
-    } while (0)
 #define NUM_SAMPLES 1000000
-
-struct box {
-    int empty;
-    float content;
-    char* error_msg;
-};
+#define STOP_BETA 1.0e-8
+#define TINY_BETA 1.0e-30
 
 // Example cdf
 float cdf_uniform_0_1(float x)
@@ -59,16 +39,10 @@ float cdf_normal_0_1(float x)
     return 0.5 * (1 + erf((x - mean) / (std * sqrt(2)))); // erf from math.h
 }
 
-// [x] to do: add beta.
-// [x] for the cdf, use this incomplete beta function implementation, based on continuous fractions:
-// <https://codeplea.com/incomplete-beta-function-c>
-// <https://github.com/codeplea/incbeta>
-
-#define STOP_BETA 1.0e-8
-#define TINY_BETA 1.0e-30
 struct box incbeta(float a, float b, float x)
 {
     // Descended from <https://github.com/codeplea/incbeta/blob/master/incbeta.c>,
+    // <https://codeplea.com/incomplete-beta-function-c>
     // but modified to return a box struct and floats instead of doubles.
     // [ ] to do: add attribution in README
     // Original code under this license:
@@ -174,200 +148,6 @@ struct box cdf_beta(float x)
     }
 }
 
-// Inverse cdf at point
-// Two versions of this function:
-//   - raw, dealing with cdfs that return floats
-//   - box, dealing with cdfs that return a box.
-
-// Inverse cdf
-struct box inverse_cdf_float(float cdf(float), float p)
-{
-    // given a cdf: [-Inf, Inf] => [0,1]
-    // returns a box with either
-    // x such that cdf(x) = p
-    // or an error
-    // if EXIT_ON_ERROR is set to 1, it exits instead of providing an error
-
-    float low = -1.0;
-    float high = 1.0;
-
-    // 1. Make sure that cdf(low) < p < cdf(high)
-    int interval_found = 0;
-    while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) {
-        // ^ Using FLT_MIN and FLT_MAX is overkill
-        // but it's also the *correct* thing to do.
-
-        int low_condition = (cdf(low) < p);
-        int high_condition = (p < cdf(high));
-        if (low_condition && high_condition) {
-            interval_found = 1;
-        } else if (!low_condition) {
-            low = low * 2;
-        } else if (!high_condition) {
-            high = high * 2;
-        }
-    }
-
-    if (!interval_found) {
-        PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
-    } else {
-
-        int convergence_condition = 0;
-        int count = 0;
-        while (!convergence_condition && (count < (INT_MAX / 2))) {
-            float mid = (high + low) / 2;
-            int mid_not_new = (mid == low) || (mid == high);
-            // float width = high - low;
-            // if ((width < 1e-8) || mid_not_new){
-            if (mid_not_new) {
-                convergence_condition = 1;
-            } else {
-                float mid_sign = cdf(mid) - p;
-                if (mid_sign < 0) {
-                    low = mid;
-                } else if (mid_sign > 0) {
-                    high = mid;
-                } else if (mid_sign == 0) {
-                    low = mid;
-                    high = mid;
-                }
-            }
-        }
-
-        if (convergence_condition) {
-            struct box result = { .empty = 0, .content = low };
-            return result;
-        } else {
-            PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
-        }
-    }
-}
-
-struct box inverse_cdf_box(struct box cdf_box(float), float p)
-{
-    // given a cdf: [-Inf, Inf] => Box([0,1])
-    // returns a box with either
-    // x such that cdf(x) = p
-    // or an error
-    // if EXIT_ON_ERROR is set to 1, it exits instead of providing an error
-
-    float low = -1.0;
-    float high = 1.0;
-
-    // 1. Make sure that cdf(low) < p < cdf(high)
-    int interval_found = 0;
-    while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) {
-        // ^ Using FLT_MIN and FLT_MAX is overkill
-        // but it's also the *correct* thing to do.
-        struct box cdf_low = cdf_box(low);
-        if (cdf_low.empty) {
-            PROCESS_ERROR(cdf_low.error_msg);
-        }
-
-        struct box cdf_high = cdf_box(high);
-        if (cdf_high.empty) {
-            PROCESS_ERROR(cdf_low.error_msg);
-        }
-
-        int low_condition = (cdf_low.content < p);
-        int high_condition = (p < cdf_high.content);
-        if (low_condition && high_condition) {
-            interval_found = 1;
-        } else if (!low_condition) {
-            low = low * 2;
-        } else if (!high_condition) {
-            high = high * 2;
-        }
-    }
-
-    if (!interval_found) {
-        PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
-    } else {
-
-        int convergence_condition = 0;
-        int count = 0;
-        while (!convergence_condition && (count < (INT_MAX / 2))) {
-            float mid = (high + low) / 2;
-            int mid_not_new = (mid == low) || (mid == high);
-            // float width = high - low;
-            if (mid_not_new) {
-                // if ((width < 1e-8) || mid_not_new){
-                convergence_condition = 1;
-            } else {
-                struct box cdf_mid = cdf_box(mid);
-                if (cdf_mid.empty) {
-                    PROCESS_ERROR(cdf_mid.error_msg);
-                }
-                float mid_sign = cdf_mid.content - p;
-                if (mid_sign < 0) {
-                    low = mid;
-                } else if (mid_sign > 0) {
-                    high = mid;
-                } else if (mid_sign == 0) {
-                    low = mid;
-                    high = mid;
-                }
-            }
-        }
-
-        if (convergence_condition) {
-            struct box result = { .empty = 0, .content = low };
-            return result;
-        } else {
-            PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
-        }
-    }
-}
-
-// Some randomness functions for:
-//   - Sampling from a cdf
-//   - Benchmarking against a previous approach, which will be faster, but less general
-
-// Get random number between 0 and 1
-uint32_t xorshift32(uint32_t* seed)
-{
-    // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
-    // See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
-    // https://en.wikipedia.org/wiki/Xorshift
-    // Also some drama: <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, <https://prng.di.unimi.it/>
-
-    uint32_t x = *seed;
-    x ^= x << 13;
-    x ^= x >> 17;
-    x ^= x << 5;
-    return *seed = x;
-}
-
-// Distribution & sampling functions
-float rand_0_to_1(uint32_t* seed)
-{
-    return ((float)xorshift32(seed)) / ((float)UINT32_MAX);
-}
-
-// Sampler based on inverse cdf and randomness function
-struct box sampler_box_cdf(struct box cdf(float), uint32_t* seed)
-{
-    float p = rand_0_to_1(seed);
-    struct box result = inverse_cdf_box(cdf, p);
-    return result;
-}
-struct box sampler_float_cdf(float cdf(float), uint32_t* seed)
-{
-    float p = rand_0_to_1(seed);
-    struct box result = inverse_cdf_float(cdf, p);
-    return result;
-}
-
-// Comparison point with raw normal sampler
-const float PI = 3.14159265358979323846;
-float sampler_normal_0_1(uint32_t* seed)
-{
-    float u1 = rand_0_to_1(seed);
-    float u2 = rand_0_to_1(seed);
-    float z = sqrtf(-2.0 * log(u1)) * sin(2 * PI * u2);
-    return z;
-}
-
 // Some testers
 void test_inverse_cdf_float(char* cdf_name, float cdf_float(float))
 {
@@ -445,12 +225,12 @@ int main()
     test_and_time_sampler_float("cdf_normal_0_1", cdf_normal_0_1, seed);
 
     // Get some normal samples using a previous approach
-    printf("\nGetting some samples from sampler_normal_0_1\n");
+    printf("\nGetting some samples from unit_normal\n");
 
     clock_t begin_2 = clock();
 
     for (int i = 0; i < NUM_SAMPLES; i++) {
-        float normal_sample = sampler_normal_0_1(seed);
+        float normal_sample = unit_normal(seed);
         // printf("%f\n", normal_sample);
     }
 
@@ -460,11 +240,11 @@ int main()
 
     // Test box sampler
     test_and_time_sampler_box("cdf_beta", cdf_beta, seed);
-		// Ok, this is slower than python!!
-		// Partly this is because I am using a more general algorithm, 
-		//   which applies to any cdf
-		// But I am also using really anal convergence conditions.
-		// This could be optimized.
+    // Ok, this is slower than python!!
+    // Partly this is because I am using a more general algorithm,
+    //   which applies to any cdf
+    // But I am also using really anal convergence conditions.
+    // This could be optimized.
 
     free(seed);
     return 0;
diff --git a/squiggle.c b/squiggle.c
@@ -1,6 +1,25 @@
 #include <math.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <float.h>
+#include <limits.h>
+#include <time.h>
+// #include <sys/types.h>
+#define EXIT_ON_ERROR 0 // 0: PROCESS_ERROR returns an error box; non-zero: it prints and calls exit(1)
+#define MAX_ERROR_LENGTH 500 // size in bytes of the buffer PROCESS_ERROR formats its message into
+#define PROCESS_ERROR(...) /* NOTE(review): the arguments are never expanded; only "@", file and line get reported */ \
+    do {                                                                                \
+        if (EXIT_ON_ERROR) {                                                            \
+            printf("@, in %s (%d)", __FILE__, __LINE__);                                \
+            exit(1);                                                                    \
+        } else {                                                                        \
+            char error_msg[MAX_ERROR_LENGTH]; /* buffer local to this else-block */     \
+            snprintf(error_msg, MAX_ERROR_LENGTH, "@, in %s (%d)", __FILE__, __LINE__); \
+            struct box error = { .empty = 1, .error_msg = error_msg }; /* stores a pointer to that local buffer */ \
+            return error; /* NOTE(review): error_msg dangles once the enclosing function has returned */ \
+        }                                                                               \
+    } while (0)
 
 // PI constant
 const float PI = M_PI; // 3.14159265358979323846;
@@ -112,3 +131,171 @@ float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint3
     free(cumsummed_normalized_weights);
     return result;
 }
+
+// Sample from an arbitrary cdf. struct box is the value-or-error result type used by the functions below.
+struct box {
+    int empty; // 1 when the box carries an error (set by PROCESS_ERROR), 0 when content holds a result
+    float content; // the computed value; filled in on the success paths, where empty is 0
+    char* error_msg; // filled in by PROCESS_ERROR when empty is 1; not named by the success-path initializers
+};
+
+// Inverse cdf at point
+// Two versions of this function:
+//   - raw, dealing with cdfs that return floats
+//     - input: cdf: float => float, p
+//     - output: Box(number|error)
+//   - box, dealing with cdfs that return a box.
+//     - input: cdf: float => Box(number|error), p
+//     - output: Box(number|error)
+struct box inverse_cdf_float(float cdf(float), float p)
+{
+    // given a cdf: [-Inf, Inf] => [0,1]
+    // returns a box with either
+    // x such that cdf(x) = p (in practice: the lower end of the final bisection bracket)
+    // or an error
+    // if EXIT_ON_ERROR is set to 1, it exits instead of providing an error
+
+    float low = -1.0; // initial bracket is [-1, 1]
+    float high = 1.0;
+
+    // 1. Make sure that cdf(low) < p < cdf(high)
+    int interval_found = 0;
+    while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) {
+        // ^ Bounding the search by +/- FLT_MAX / 4 is overkill
+        // but it's also the *correct* thing to do.
+
+        int low_condition = (cdf(low) < p);
+        int high_condition = (p < cdf(high));
+        if (low_condition && high_condition) {
+            interval_found = 1;
+        } else if (!low_condition) {
+            low = low * 2; // only the low end moves this round, even if the high end also fails
+        } else if (!high_condition) {
+            high = high * 2;
+        }
+    }
+
+    if (!interval_found) {
+        PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
+    } else {
+        // 2. Bisect [low, high] until the midpoint can no longer be distinguished from an endpoint
+        int convergence_condition = 0;
+        int count = 0; // NOTE(review): never incremented, so the count limit below never triggers
+        while (!convergence_condition && (count < (INT_MAX / 2))) {
+            float mid = (high + low) / 2;
+            int mid_not_new = (mid == low) || (mid == high); // midpoint equals an endpoint: bracket cannot shrink further
+            // float width = high - low;
+            // if ((width < 1e-8) || mid_not_new){
+            if (mid_not_new) {
+                convergence_condition = 1;
+            } else {
+                float mid_sign = cdf(mid) - p;
+                if (mid_sign < 0) { // cdf(mid) < p: raise the lower bound
+                    low = mid;
+                } else if (mid_sign > 0) { // cdf(mid) > p: lower the upper bound
+                    high = mid;
+                } else if (mid_sign == 0) { // exact hit: collapse the bracket, the next pass then converges
+                    low = mid;
+                    high = mid;
+                } // NOTE(review): a NaN from cdf(mid) matches no branch and leaves the bracket unchanged
+            }
+        }
+
+        if (convergence_condition) {
+            struct box result = { .empty = 0, .content = low };
+            return result;
+        } else { // NOTE(review): not reachable while count stays 0; the loop only exits once convergence_condition is set
+            PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
+        }
+    }
+}
+
+struct box inverse_cdf_box(struct box cdf_box(float), float p)
+{
+    // given a cdf: [-Inf, Inf] => Box([0,1])
+    // returns a box with either
+    // x such that cdf(x) = p (in practice: the lower end of the final bisection bracket)
+    // or an error (also when cdf_box itself returns an empty box)
+    // if EXIT_ON_ERROR is set to 1, it exits instead of providing an error
+
+    float low = -1.0; // initial bracket is [-1, 1]
+    float high = 1.0;
+
+    // 1. Make sure that cdf(low) < p < cdf(high)
+    int interval_found = 0;
+    while ((!interval_found) && (low > -FLT_MAX / 4) && (high < FLT_MAX / 4)) {
+        // ^ Bounding the search by +/- FLT_MAX / 4 is overkill
+        // but it's also the *correct* thing to do.
+        struct box cdf_low = cdf_box(low);
+        if (cdf_low.empty) {
+            PROCESS_ERROR(cdf_low.error_msg); // the cdf's own message is not forwarded: PROCESS_ERROR ignores its arguments
+        }
+
+        struct box cdf_high = cdf_box(high);
+        if (cdf_high.empty) {
+            PROCESS_ERROR(cdf_low.error_msg); // NOTE(review): probably meant cdf_high.error_msg; no runtime effect while PROCESS_ERROR ignores its arguments
+        }
+
+        int low_condition = (cdf_low.content < p);
+        int high_condition = (p < cdf_high.content);
+        if (low_condition && high_condition) {
+            interval_found = 1;
+        } else if (!low_condition) {
+            low = low * 2; // only the low end moves this round, even if the high end also fails
+        } else if (!high_condition) {
+            high = high * 2;
+        }
+    }
+
+    if (!interval_found) {
+        PROCESS_ERROR("Interval containing the target value not found, in function inverse_cdf");
+    } else {
+        // 2. Bisect [low, high] until the midpoint can no longer be distinguished from an endpoint
+        int convergence_condition = 0;
+        int count = 0; // NOTE(review): never incremented, so the count limit below never triggers
+        while (!convergence_condition && (count < (INT_MAX / 2))) {
+            float mid = (high + low) / 2;
+            int mid_not_new = (mid == low) || (mid == high); // midpoint equals an endpoint: bracket cannot shrink further
+            // float width = high - low;
+            if (mid_not_new) {
+                // if ((width < 1e-8) || mid_not_new){
+                convergence_condition = 1;
+            } else {
+                struct box cdf_mid = cdf_box(mid);
+                if (cdf_mid.empty) { // a failing cdf evaluation aborts the search with an error box
+                    PROCESS_ERROR(cdf_mid.error_msg);
+                }
+                float mid_sign = cdf_mid.content - p;
+                if (mid_sign < 0) { // cdf(mid) < p: raise the lower bound
+                    low = mid;
+                } else if (mid_sign > 0) { // cdf(mid) > p: lower the upper bound
+                    high = mid;
+                } else if (mid_sign == 0) { // exact hit: collapse the bracket, the next pass then converges
+                    low = mid;
+                    high = mid;
+                } // NOTE(review): a NaN in cdf_mid.content matches no branch and leaves the bracket unchanged
+            }
+        }
+
+        if (convergence_condition) {
+            struct box result = { .empty = 0, .content = low };
+            return result;
+        } else { // NOTE(review): not reachable while count stays 0; the loop only exits once convergence_condition is set
+            PROCESS_ERROR("Search process did not converge, in function inverse_cdf");
+        }
+    }
+}
+
+// Sampler based on inverse cdf and randomness function: draw p, then invert the box-returning cdf at p
+struct box sampler_box_cdf(struct box cdf(float), uint32_t* seed)
+{
+    float p = rand_0_to_1(seed); // presumably uniform on [0, 1]; rand_0_to_1 is defined outside this hunk
+    struct box result = inverse_cdf_box(cdf, p); // either the sample or the error from the inversion
+    return result;
+}
+struct box sampler_float_cdf(float cdf(float), uint32_t* seed) // same as sampler_box_cdf, for a cdf returning a plain float
+{
+    float p = rand_0_to_1(seed); // presumably uniform on [0, 1]; rand_0_to_1 is defined outside this hunk
+    struct box result = inverse_cdf_float(cdf, p); // either the sample or the error from the inversion
+    return result;
+}
diff --git a/squiggle.h b/squiggle.h
@@ -4,13 +4,29 @@
 // uint32_t header
 #include <stdint.h>
 
+// Macros for error handling. NOTE(review): squiggle.c defines the same three macros in this commit; confirm both copies are intended.
+#define EXIT_ON_ERROR 0 // 0: PROCESS_ERROR returns an error box; non-zero: it prints and calls exit(1)
+#define MAX_ERROR_LENGTH 500 // size in bytes of the buffer PROCESS_ERROR formats its message into
+#define PROCESS_ERROR(...) /* NOTE(review): arguments are never expanded; expansion needs printf, snprintf and exit declared at the use site */ \
+    do {                                                                                \
+        if (EXIT_ON_ERROR) {                                                            \
+            printf("@, in %s (%d)", __FILE__, __LINE__);                                \
+            exit(1);                                                                    \
+        } else {                                                                        \
+            char error_msg[MAX_ERROR_LENGTH]; /* buffer local to this else-block */     \
+            snprintf(error_msg, MAX_ERROR_LENGTH, "@, in %s (%d)", __FILE__, __LINE__); \
+            struct box error = { .empty = 1, .error_msg = error_msg }; /* stores a pointer to that local buffer */ \
+            return error; /* NOTE(review): error_msg dangles once the enclosing function has returned */ \
+        }                                                                               \
+    } while (0)
+
 // Pseudo Random number generator
 uint32_t xorshift32(uint32_t* seed);
 
 // Distribution & sampling functions
 float rand_0_to_1(uint32_t* seed);
 float rand_float(float max, uint32_t* seed);
-float ur_normal(uint32_t* seed);
+float unit_normal(uint32_t* seed);
 float random_uniform(float from, float to, uint32_t* seed);
 float random_normal(float mean, float sigma, uint32_t* seed);
 float random_lognormal(float logmean, float logsigma, uint32_t* seed);
@@ -23,4 +39,20 @@ void array_cumsum(float* array_to_sum, float* array_cumsummed, int length);
 // Mixture function
 float mixture(float (*samplers[])(uint32_t*), float* weights, int n_dists, uint32_t* seed);
 
+// Box: value-or-error result returned by the inverse cdf and cdf-based sampler functions declared below
+struct box {
+    int empty; // 1 when the box carries an error (set by PROCESS_ERROR), 0 when content holds a result
+    float content; // the computed value; meaningful when empty is 0
+    char* error_msg; // filled in by PROCESS_ERROR when empty is 1
+};
+
+// Inverse cdf: search for x with cdf(x) = p; the result or an error comes back in a box
+struct box inverse_cdf_float(float cdf(float), float p); // for a cdf that returns a plain float
+struct box inverse_cdf_box(struct box cdf_box(float), float p); // for a cdf that itself returns a box
+
+// Samplers from cdf: draw p with rand_0_to_1(seed), then invert the cdf at p
+struct box sampler_box_cdf(struct box cdf(float), uint32_t* seed); // uses inverse_cdf_box
+struct box sampler_float_cdf(float cdf(float), uint32_t* seed); // uses inverse_cdf_float
+
 #endif
+

	squiggle.c Self-contained Monte Carlo estimation in C99
	Log \| Files \| Refs \| README

M	scratchpad/makefile	\|	2	+-
M	scratchpad/scratchpad	\|	0
M	scratchpad/scratchpad.c	\|	242	++++---------------------------------------------------------------------------
M	squiggle.c	\|	187	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	squiggle.h	\|	34	+++++++++++++++++++++++++++++++++-