commit 56ab018469ab962ee0161b8dee4670589c2fe7c0
parent 79dfcf79db6734e3a672f8a8c39650b7a8ecc5ee
Author: NunoSempere <nuno.sempere@protonmail.com>
Date: Sat, 23 Sep 2023 21:53:45 +0100
add many more comments, start adding to header file.
Diffstat:
| M | README.md | | | 6 | ++++-- |
| M | squiggle.c | | | 85 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------- |
| M | squiggle.h | | | 20 | ++++++++++++++++++++ |
3 files changed, 87 insertions(+), 24 deletions(-)
diff --git a/README.md b/README.md
@@ -344,7 +344,9 @@ It emits one warning about something I already took care of, so by default I've
- [x] Add a few functions for doing simple algebra on normals, and lognormals
- [x] Add prototypes
- [x] Use named structs
+ - [x] Add to header file
- [ ] Test results
- [ ] Provide example
- - [ ] Add to headers
- - [ ] Move to own file
+ - [ ] Add conversion between 90%ci and parameters.
+ - [ ] Move to own file? Or signpost in file?
+- [ ] Disambiguate sample_laplace--successes vs failures || successes vs total trials as two distinct and differently named functions
diff --git a/squiggle.c b/squiggle.c
@@ -7,15 +7,22 @@
#include <sys/types.h>
#include <time.h>
+// Some error niceties; these won't be used until later
#define MAX_ERROR_LENGTH 500
#define EXIT_ON_ERROR 0
#define PROCESS_ERROR(error_msg) process_error(error_msg, EXIT_ON_ERROR, __FILE__, __LINE__)
const double PI = 3.14159265358979323846; // M_PI in gcc gnu99
+// # Key functionality
+// Define the minimum number of functions needed to do simple estimation
+// Starts here, ends until the end of the mixture function
+
// Pseudo Random number generator
uint64_t xorshift32(uint32_t* seed)
{
+ // The reader isn't expected to understand this code immediately;
+ // read the linked Wikipedia page!
// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
// See <https://stackoverflow.com/questions/53886131/how-does-xorshift64-works>
// https://en.wikipedia.org/wiki/Xorshift
@@ -71,17 +78,40 @@ double sample_lognormal(double logmean, double logstd, uint64_t* seed)
return exp(sample_normal(logmean, logstd, seed));
}
+inline double sample_normal_from_95_confidence_interval(double low, double high, uint64_t* seed){
+ // Explanation of key idea:
+ // 1. We know that the 90% confidence interval of the unit normal is
+ // [-1.6448536269514722, 1.6448536269514722]
+ // see e.g.: https://stackoverflow.com/questions/20626994/how-to-calculate-the-inverse-of-the-normal-cumulative-distribution-function-in-p
+ // 2. So if we take a unit normal and multiply it by
+ // L / 1.6448536269514722, its new 90% confidence interval will be
+ // [-L, L], i.e., length 2 * L
+ // 3. Instead, if we want to get a confidence interval of length L,
+ // we should multiply the unit normal by
+ // L / (2 * 1.6448536269514722)
+ // Meaning that its standard deviation should be multiplied by that amount
+ // see: https://en.wikipedia.org/wiki/Normal_distribution?lang=en#Operations_on_a_single_normal_variable
+ // 4. So we have learnt that Normal(0, L / (2 * 1.6448536269514722))
+ // has a 90% confidence interval of length L
+ // 5. If we want a 90% confidence interval from high to low,
+ // we can set mean = (high + low)/2; the midpoint, and L = high-low,
+ // Normal([high + low]/2, [high - low]/(2 * 1.6448536269514722))
+ const double NORMAL95CONFIDENCE = 1.6448536269514722;
+ double mean = (high + low) / 2.0;
+ double std = (high - low) / (2.0 * NORMAL95CONFIDENCE );
+ return sample_normal(mean, std);
+}
+
double sample_to(double low, double high, uint64_t* seed)
{
// Given a (positive) 90% confidence interval,
- // returns a sample from a lognormal
- // with a matching 90% c.i.
- const double NORMAL95CONFIDENCE = 1.6448536269514722;
+ // returns a sample from a lognorma with a matching 90% c.i.
+ // Key idea: If we want a lognormal with 90% confidence interval [a, b]
+ // we need but get a normal with 90% confidence interval [log(a), log(b)].
+ // Then see code for sample_normal_from_95_confidence_interval
double loglow = logf(low);
double loghigh = logf(high);
- double logmean = (loglow + loghigh) / 2;
- double logstd = (loghigh - loglow) / (2.0 * NORMAL95CONFIDENCE);
- return sample_lognormal(logmean, logstd, seed);
+ return exp(sample_normal_from_95_confidence_interval(loglow, loghigh));
}
double sample_gamma(double alpha, uint64_t* seed)
@@ -129,13 +159,14 @@ double sample_gamma(double alpha, uint64_t* seed)
double sample_beta(double a, double b, uint64_t* seed)
{
+ // See: https://en.wikipedia.org/wiki/Gamma_distribution#Related_distributions
double gamma_a = sample_gamma(a, seed);
double gamma_b = sample_gamma(b, seed);
return gamma_a / (gamma_a + gamma_b);
}
double sample_laplace(double successes, double failures, uint64_t* seed){
- // see <https://wikiless.esmailelbob.xyz/wiki/Beta_distribution?lang=en#Rule_of_succession>
+ // see <https://en.wikipedia.org/wiki/Beta_distribution?lang=en#Rule_of_succession>
return sample_beta(successes + 1, failures + 1, seed);
}
@@ -177,8 +208,7 @@ double array_std(double* array, int length)
// Mixture function
double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_dists, uint64_t* seed)
{
- // You can see a simpler version of this function in the git history
- // or in C-02-better-algorithm-one-thread/
+ // Sample from samples with frequency proportional to their weights.
double sum_weights = array_sum(weights, n_dists);
double* cumsummed_normalized_weights = (double*)malloc(n_dists * sizeof(double));
cumsummed_normalized_weights[0] = weights[0] / sum_weights;
@@ -203,7 +233,11 @@ double sample_mixture(double (*samplers[])(uint64_t*), double* weights, int n_di
return result;
}
-// Sample from an arbitrary cdf
+// # More cool stuff
+// This is no longer necessary to do basic estimation,
+// but is still cool
+
+// ## Sample from an arbitrary cdf
struct box {
int empty;
double content;
@@ -436,45 +470,52 @@ struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* se
return result;
}
+// # Small algebra system
+
// Do algebra over lognormals and normals
-typedef struct {
+// here I discover named structs,
+// which mean that I don't have to be typing
+// struct blah all the time.
+typedef struct normal_params_t {
double mean;
double std;
-} normal_parameters;
+} normal_params;
-struct {
+typedef struct lognormal_params_t {
double logmean;
double logstd;
-} lognormal_parameters;
+} lognormal_params;
-normal_parameters algebra_sum_normals(normal_parameters a, normal_parameters b)
+normal_params algebra_sum_normals(normal_params a, normal_params b)
{
- normal_parameters result = {
+ normal_params result = {
.mean = a.mean + b.mean,
.std = sqrt((a.std * a.std) + (b.std * b.std)),
};
return result;
}
-normal_parameters algebra_shift_normal(normal_parameters a, double shift)
+normal_params algebra_shift_normal(normal_params a, double shift)
{
- normal_parameters result = {
+ normal_params result = {
.mean = a.mean + shift,
.std = a.std,
};
return result;
}
-lognormal_parameters algebra_product_lognormals(lognormal_parameters a, lognormal_parameters b)
+// Also add stretching
+
+lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b)
{
- lognormal_parameters result = {
+ lognormal_params result = {
.logmean = a.logmean + b.logmean,
.logstd = sqrt((a.logstd * a.logstd) + (b.logstd * b.logstd)),
};
return result;
}
-lognormal_parameters algebra_scale_lognormal(lognormal_parameters a, double k)
+lognormal_params algebra_scale_lognormal(lognormal_params a, double k)
{
- lognormal_parameters result = {
+ lognormal_params result = {
.logmean = a.logmean + k,
.logstd = a.logstd,
};
diff --git a/squiggle.h b/squiggle.h
@@ -58,4 +58,24 @@ struct c_i {
};
struct c_i get_90_confidence_interval(double (*sampler)(uint64_t*), uint64_t* seed);
+// small algebra system
+
+typedef struct normal_params_t {
+ double mean;
+ double std;
+} normal_params;
+
+typedef struct lognormal_params_t {
+ double logmean;
+ double logstd;
+} lognormal_params;
+
+
+normal_params algebra_sum_normals(normal_params a, normal_params b);
+normal_params algebra_shift_normal(normal_params a, double shift);
+
+
+lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b);
+lognormal_params algebra_scale_lognormal(lognormal_params a, double k);
+
#endif