squiggle.c (7942B)
1 #include <limits.h> 2 #include <math.h> 3 #include <stdint.h> 4 #include <stdlib.h> 5 6 // Defs 7 #define PI 3.14159265358979323846 // M_PI in gcc gnu99 8 #define NORMAL90CONFIDENCE 1.6448536269514727 9 #define UNUSED(x) (void)(x) 10 // ^ https://stackoverflow.com/questions/3599160/how-can-i-suppress-unused-parameter-warnings-in-c 11 12 // Pseudo Random number generators 13 static uint64_t xorshift64(uint64_t* seed) 14 { 15 // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" 16 // See: 17 // <https://en.wikipedia.org/wiki/Xorshift> 18 // <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>, 19 // Also some drama: 20 // <https://www.pcg-random.org/posts/on-vignas-pcg-critique.html>, 21 // <https://prng.di.unimi.it/> 22 uint64_t x = *seed; 23 x ^= x << 13; 24 x ^= x >> 7; 25 x ^= x << 17; 26 return *seed = x; 27 28 /* 29 // if one wanted to generate 32 bit ints, 30 // from which to generate floats, 31 // one could do the following: 32 uint32_t x = *seed; 33 x ^= x << 13; 34 x ^= x >> 17; 35 x ^= x << 5; 36 return *seed = x; 37 */ 38 } 39 40 // Distribution & sampling functions 41 // Unit distributions 42 double sample_unit_uniform(uint64_t* seed) 43 { 44 // samples uniform from [0,1] interval. 45 return ((double)xorshift64(seed)) / ((double)UINT64_MAX); 46 } 47 48 double sample_unit_normal(uint64_t* seed) 49 { 50 // // See: <https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform> 51 double u1 = sample_unit_uniform(seed); 52 double u2 = sample_unit_uniform(seed); 53 double z = sqrt(-2.0 * log(u1)) * sin(2.0 * PI * u2); 54 return z; 55 } 56 57 // Composite distributions 58 double sample_uniform(double start, double end, uint64_t* seed) 59 { 60 return sample_unit_uniform(seed) * (end - start) + start; 61 } 62 63 double sample_normal(double mean, double sigma, uint64_t* seed) 64 { 65 return (mean + sigma * sample_unit_normal(seed)); 66 } 67 68 double sample_lognormal(double logmean, double logstd, uint64_t* seed) 69 { 70 return exp(sample_normal(logmean, logstd, seed)); 71 } 72 73 double sample_normal_from_90_ci(double low, double high, uint64_t* seed) 74 { 75 // Explanation of key idea: 76 // 1. We know that the 90% confidence interval of the unit normal is 77 // [-1.6448536269514722, 1.6448536269514722] 78 // see e.g.: https://stackoverflow.com/questions/20626994/how-to-calculate-the-inverse-of-the-normal-cumulative-distribution-function-in-p 79 // or https://www.wolframalpha.com/input?i=N%5BInverseCDF%28normal%280%2C1%29%2C+0.05%29%2C%7B%E2%88%9E%2C100%7D%5D 80 // 2. So if we take a unit normal and multiply it by 81 // L / 1.6448536269514722, its new 90% confidence interval will be 82 // [-L, L], i.e., length 2 * L 83 // 3. Instead, if we want to get a confidence interval of length L, 84 // we should multiply the unit normal by 85 // L / (2 * 1.6448536269514722) 86 // Meaning that its standard deviation should be multiplied by that amount 87 // see: https://en.wikipedia.org/wiki/Normal_distribution?lang=en#Operations_on_a_single_normal_variable 88 // 4. So we have learnt that Normal(0, L / (2 * 1.6448536269514722)) 89 // has a 90% confidence interval of length L 90 // 5. If we want a 90% confidence interval from high to low, 91 // we can set mean = (high + low)/2; the midpoint, and L = high-low, 92 // Normal([high + low]/2, [high - low]/(2 * 1.6448536269514722)) 93 double mean = (high + low) * 0.5; 94 double std = (high - low) / (2.0 * NORMAL90CONFIDENCE); 95 return sample_normal(mean, std, seed); 96 } 97 98 double sample_to(double low, double high, uint64_t* seed) 99 { 100 // Given a (positive) 90% confidence interval, 101 // returns a sample from a lognorma with a matching 90% c.i. 102 // Key idea: If we want a lognormal with 90% confidence interval [a, b] 103 // we need but get a normal with 90% confidence interval [log(a), log(b)]. 104 // Then see code for sample_normal_from_90_ci 105 double loglow = log(low); 106 double loghigh = log(high); 107 return exp(sample_normal_from_90_ci(loglow, loghigh, seed)); 108 } 109 110 double sample_gamma(double alpha, uint64_t* seed) 111 { 112 113 // A Simple Method for Generating Gamma Variables, Marsaglia and Wan Tsang, 2001 114 // https://dl.acm.org/doi/pdf/10.1145/358407.358414 115 // see also the references/ folder 116 // Note that the Wikipedia page for the gamma distribution includes a scaling parameter 117 // k or beta 118 // https://en.wikipedia.org/wiki/Gamma_distribution 119 // such that gamma_k(alpha, k) = k * gamma(alpha) 120 // or gamma_beta(alpha, beta) = gamma(alpha) / beta 121 // So far I have not needed to use this, and thus the second parameter is by default 1. 122 if (alpha >= 1) { 123 double d, c, x, v, u; 124 d = alpha - 1.0 / 3.0; 125 c = 1.0 / sqrt(9.0 * d); 126 while (1) { 127 128 do { 129 x = sample_unit_normal(seed); 130 v = 1.0 + c * x; 131 } while (v <= 0.0); 132 133 v = v * v * v; 134 u = sample_unit_uniform(seed); 135 if (u < 1.0 - 0.0331 * (x * x * x * x)) { // Condition 1 136 // the 0.0331 doesn't inspire much confidence 137 // however, this isn't the whole story 138 // by knowing that Condition 1 implies condition 2 139 // we realize that this is just a way of making the algorithm faster 140 // i.e., of not using the logarithms 141 return d * v; 142 } 143 if (log(u) < 0.5 * (x * x) + d * (1.0 - v + log(v))) { // Condition 2 144 return d * v; 145 } 146 } 147 } else { 148 return sample_gamma(1 + alpha, seed) * pow(sample_unit_uniform(seed), 1 / alpha); 149 // see note in p. 371 of https://dl.acm.org/doi/pdf/10.1145/358407.358414 150 } 151 } 152 153 double sample_beta(double a, double b, uint64_t* seed) 154 { 155 // See: https://en.wikipedia.org/wiki/Gamma_distribution#Related_distributions 156 double gamma_a = sample_gamma(a, seed); 157 double gamma_b = sample_gamma(b, seed); 158 return gamma_a / (gamma_a + gamma_b); 159 } 160 161 double sample_laplace(double successes, double failures, uint64_t* seed) 162 { 163 // see <https://en.wikipedia.org/wiki/Beta_distribution?lang=en#Rule_of_succession> 164 return sample_beta(successes + 1, failures + 1, seed); 165 } 166 167 // Array helpers 168 double array_sum(double* array, int length) 169 { 170 double sum = 0.0; 171 for (int i = 0; i < length; i++) { 172 sum += array[i]; 173 } 174 return sum; 175 } 176 177 void array_cumsum(double* array_to_sum, double* array_cumsummed, int length) 178 { 179 array_cumsummed[0] = array_to_sum[0]; 180 for (int i = 1; i < length; i++) { 181 array_cumsummed[i] = array_cumsummed[i - 1] + array_to_sum[i]; 182 } 183 } 184 185 double array_mean(double* array, int length) 186 { 187 double sum = array_sum(array, length); 188 return sum / length; 189 } 190 191 double array_std(double* array, int length) 192 { 193 double mean = array_mean(array, length); 194 double std = 0.0; 195 for (int i = 0; i < length; i++) { 196 std += (array[i] - mean) * (array[i] - mean); 197 } 198 std = sqrt(std / length); 199 return std; 200 } 201 202 // Mixture function 203 double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_dists, uint64_t* seed) 204 { 205 // Sample from samples with frequency proportional to their weights. 206 double sum_weights = array_sum(weights, n_dists); 207 double* cumsummed_normalized_weights = (double*)malloc((size_t)n_dists * sizeof(double)); 208 cumsummed_normalized_weights[0] = weights[0] / sum_weights; 209 for (int i = 1; i < n_dists; i++) { 210 cumsummed_normalized_weights[i] = cumsummed_normalized_weights[i - 1] + weights[i] / sum_weights; 211 } 212 213 double result; 214 int result_set_flag = 0; 215 double p = sample_uniform(0, 1, seed); 216 for (int k = 0; k < n_dists; k++) { 217 if (p < cumsummed_normalized_weights[k]) { 218 result = samplers[k](seed); 219 result_set_flag = 1; 220 break; 221 } 222 } 223 if (result_set_flag == 0) 224 result = samplers[n_dists - 1](seed); 225 226 free(cumsummed_normalized_weights); 227 return result; 228 }