squiggle_more.c - time-to-botec - Benchmark sampling in different programming languages

squiggle_more.c (15606B)
      1 #include "squiggle.h"
      2 #include <float.h>
      3 #include <limits.h>
      4 #include <math.h>
      5 #include <omp.h>
      6 #include <stdint.h>
      7 #include <stdio.h>
      8 #include <stdlib.h>
      9 #include <string.h> // memcpy
     10 
     11 /* Cache optimizations */
     12 #define CACHE_LINE_SIZE 64
     13 // getconf LEVEL1_DCACHE_LINESIZE
     14 // <https://stackoverflow.com/questions/794632/programmatically-get-the-cache-line-size>
     15 typedef struct seed_cache_box_t {
     16     uint64_t seed;
     17     char padding[CACHE_LINE_SIZE - sizeof(uint64_t)];
     18     // Cache line size is 64 *bytes*, uint64_t is 64 *bits* (8 bytes). Different units!
     19 } seed_cache_box;
     20 // This avoids "false sharing", i.e., different threads competing for the same cache line
     21 // Dealing with this shaves 4ms from a 12ms process, or a third of runtime
     22 // <http://www.nic.uoregon.edu/~khuck/ts/acumem-report/manual_html/ch06s07.html>
     23 
     24 /* Parallel sampler */
     25 void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_threads, int n_samples)
     26 {
     27 
     28     // Terms of the division:
     29     // a = b * quotient + reminder
     30     // a = b * (a/b)    + (a%b)
     31     // dividend: a
     32     // divisor: b
     33     // quotient = a/b
     34     // reminder = a%b
     35     // "divisor's multiple" := b*(a/b)
     36 
     37     // now, we have n_samples and n_threads
     38     // to make our life easy, each thread will have a number of samples of: a/b (quotient)
     39     // and we'll compute the remainder of samples separately
     40     // to possibly do by Jorge: improve so that the remainder is included in the threads
     41 
     42     int quotient = n_samples / n_threads;
     43     int divisor_multiple = quotient * n_threads;
     44 
     45     // uint64_t** seeds = malloc((size_t)n_threads * sizeof(uint64_t*));
     46     seed_cache_box* cache_box = (seed_cache_box*)malloc(sizeof(seed_cache_box) * (size_t)n_threads);
     47     // seed_cache_box cache_box[n_threads]; // we could use the C stack. On normal linux machines, it's 8MB ($ ulimit -s). However, it doesn't quite feel right.
     48     srand(1);
     49     for (int i = 0; i < n_threads; i++) {
     50         // Constraints:
     51         // - xorshift can't start with 0
     52         // - the seeds should be reasonably separated and not correlated
     53         cache_box[i].seed = (uint64_t)rand() * (UINT64_MAX / RAND_MAX);
     54 
     55         // Other initializations tried:
     56         // *seeds[i] = 1 + i;
     57         // *seeds[i] = (i + 0.5)*(UINT64_MAX/n_threads);
     58         // *seeds[i] = (i + 0.5)*(UINT64_MAX/n_threads) + constant * i;
     59     }
     60 
     61     int i;
     62 #pragma omp parallel private(i)
     63     {
     64 #pragma omp for
     65         for (i = 0; i < n_threads; i++) {
     66             // It's possible I don't need the for, and could instead call omp
     67             // in some different way and get  the thread number with omp_get_thread_num()
     68             int lower_bound_inclusive = i * quotient;
     69             int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
     70 
     71             for (int j = lower_bound_inclusive; j < upper_bound_not_inclusive; j++) {
     72                 results[j] = sampler(&(cache_box[i].seed));
     73                 /* 
     74                 t starts at 0 and ends at T
     75                 at t=0, 
     76                   thread i accesses: results[i*quotient +0], 
     77                   thread i+1 acccesses: results[(i+1)*quotient +0]
     78                 at t=T
     79                   thread i accesses: results[(i+1)*quotient -1]
     80                   thread i+1 acccesses: results[(i+2)*quotient -1]
     81                 The results[j] that are directly adjacent are 
     82                   results[(i+1)*quotient -1] (accessed by thread i at time T)
     83                   results[(i+1)*quotient +0] (accessed by thread i+1 at time 0)
     84                 and these are themselves adjacent to
     85                   results[(i+1)*quotient -2] (accessed by thread i at time T-1)
     86                   results[(i+1)*quotient +1] (accessed by thread i+1 at time 2)
     87                 If T is large enough, which it is, two threads won't access the same
     88                 cache line at the same time.
     89                 Pictorially:
     90                 at t=0 ....i.........I.........
     91                 at t=T .............i.........I
     92                 and the two never overlap
     93                 Note that results[j] is a double, a double has 8 bytes (64 bits)
     94                 8 doubles fill a cache line of 64 bytes.
     95                 So we specifically won't get problems as long as n_samples/n_threads > 8
     96                 n_threads is normally 16, so n_samples > 128 
     97                 Note also that this is only a problem in terms of speed, if n_samples<128
     98                 the results are still computed, it'll just be slower
     99                 */
    100             }
    101         }
    102     }
    103     for (int j = divisor_multiple; j < n_samples; j++) {
    104         results[j] = sampler(&(cache_box[0].seed));
    105         // we can just reuse a seed,
    106         // this isn't problematic because we;ve now stopped doing multithreading
    107     }
    108 
    109     free(cache_box);
    110 }
    111 
    112 /* Get confidence intervals, given a sampler */
    113 
    114 typedef struct ci_t {
    115     double low;
    116     double high;
    117 } ci;
    118 
    119 inline static void swp(int i, int j, double xs[])
    120 {
    121     double tmp = xs[i];
    122     xs[i] = xs[j];
    123     xs[j] = tmp;
    124 }
    125 
    126 static int partition(int low, int high, double xs[], int length)
    127 {
    128     if (low > high || high >= length) {
    129         printf("Invariant violated for function partition in %s (%d)", __FILE__, __LINE__);
    130         exit(1);
    131     }
    132     // Note: the scratchpad/ folder in commit 578bfa27 has printfs sprinkled throughout
    133     int pivot = low + (int)floor((high - low) / 2);
    134     double pivot_value = xs[pivot];
    135     swp(pivot, high, xs);
    136     int gt = low; /* This pointer will iterate until finding an element which is greater than the pivot. Then it will move elements that are smaller before it--more specifically, it will move elements to its position and then increment. As a result all elements between gt and i will be greater than the pivot. */
    137     for (int i = low; i < high; i++) {
    138         if (xs[i] < pivot_value) {
    139             swp(gt, i, xs);
    140             gt++;
    141         }
    142     }
    143     swp(high, gt, xs);
    144     return gt;
    145 }
    146 
    147 static double quickselect(int k, double xs[], int n)
    148 {
    149     // https://en.wikipedia.org/wiki/Quickselect
    150 
    151     double* ys = malloc((size_t)n * sizeof(double));
    152     memcpy(ys, xs, (size_t)n * sizeof(double));
    153     // ^: don't rearrange item order in the original array
    154 
    155     int low = 0;
    156     int high = n - 1;
    157     for (;;) {
    158         if (low == high) {
    159             double result = ys[low];
    160             free(ys);
    161             return result;
    162         }
    163         int pivot = partition(low, high, ys, n);
    164         if (pivot == k) {
    165             double result = ys[pivot];
    166             free(ys);
    167             return result;
    168         } else if (k < pivot) {
    169             high = pivot - 1;
    170         } else {
    171             low = pivot + 1;
    172         }
    173     }
    174 }
    175 
    176 ci array_get_ci(ci interval, double* xs, int n)
    177 {
    178 
    179     int low_k = (int)floor(interval.low * n);
    180     int high_k = (int)ceil(interval.high * n);
    181     ci result = {
    182         .low = quickselect(low_k, xs, n),
    183         .high = quickselect(high_k, xs, n),
    184     };
    185     return result;
    186 }
    187 ci array_get_90_ci(double xs[], int n)
    188 {
    189     return array_get_ci((ci) { .low = 0.05, .high = 0.95 }, xs, n);
    190 }
    191 
    192 double array_get_median(double xs[], int n)
    193 {
    194     int median_k = (int)floor(0.5 * n);
    195     return quickselect(median_k, xs, n);
    196 }
    197 
    198 /* array print: potentially useful for debugging */
    199 void array_print(double xs[], int n)
    200 {
    201     printf("[");
    202     for (int i = 0; i < n - 1; i++) {
    203         printf("%f, ", xs[i]);
    204     }
    205     printf("%f", xs[n - 1]);
    206     printf("]\n");
    207 }
    208 
    209 void array_print_stats(double xs[], int n)
    210 {
    211     ci ci_90 = array_get_ci((ci) { .low = 0.05, .high = 0.95 }, xs, n);
    212     ci ci_80 = array_get_ci((ci) { .low = 0.1, .high = 0.9 }, xs, n);
    213     ci ci_50 = array_get_ci((ci) { .low = 0.25, .high = 0.75 }, xs, n);
    214     double median = array_get_median(xs, n);
    215     double mean = array_mean(xs, n);
    216     double std = array_std(xs, n);
    217     printf("| Statistic | Value |\n"
    218            "| --- | --- |\n"
    219            "| Mean   | %lf |\n"
    220            "| Median | %lf |\n"
    221            "| Std    | %lf |\n"
    222            "| 90%% confidence interval | %lf to %lf |\n"
    223            "| 80%% confidence interval | %lf to %lf |\n"
    224            "| 50%% confidence interval | %lf to %lf |\n",
    225            mean, median, std, ci_90.low, ci_90.high, ci_80.low, ci_80.high, ci_50.low, ci_50.high);
    226 }
    227 
    228 void array_print_histogram(double* xs, int n_samples, int n_bins)
    229 {
    230     // Interface inspired by <https://github.com/red-data-tools/YouPlot>
    231     if (n_bins <= 1) {
    232         fprintf(stderr, "Number of bins must be greater than 1.\n");
    233         return;
    234     } else if (n_samples <= 1) {
    235         fprintf(stderr, "Number of samples must be higher than 1.\n");
    236         return;
    237     }
    238 
    239     int* bins = (int*)calloc((size_t)n_bins, sizeof(int));
    240     if (bins == NULL) {
    241         fprintf(stderr, "Memory allocation for bins failed.\n");
    242         return;
    243     }
    244 
    245     // Find the minimum and maximum values from the samples
    246     double min_value = xs[0], max_value = xs[0];
    247     for (int i = 0; i < n_samples; i++) {
    248         if (xs[i] < min_value) {
    249             min_value = xs[i];
    250         }
    251         if (xs[i] > max_value) {
    252             max_value = xs[i];
    253         }
    254     }
    255 
    256     // Avoid division by zero for a single unique value
    257     if (min_value == max_value) {
    258         max_value++;
    259     }
    260 
    261     // Calculate bin width
    262     double bin_width = (max_value - min_value) / n_bins;
    263 
    264     // Fill the bins with sample counts
    265     for (int i = 0; i < n_samples; i++) {
    266         int bin_index = (int)((xs[i] - min_value) / bin_width);
    267         if (bin_index == n_bins) {
    268             bin_index--; // Last bin includes max_value
    269         }
    270         bins[bin_index]++;
    271     }
    272 
    273     // Calculate the scaling factor based on the maximum bin count
    274     int max_bin_count = 0;
    275     for (int i = 0; i < n_bins; i++) {
    276         if (bins[i] > max_bin_count) {
    277             max_bin_count = bins[i];
    278         }
    279     }
    280     const int MAX_WIDTH = 50; // Adjust this to your terminal width
    281     double scale = max_bin_count > MAX_WIDTH ? (double)MAX_WIDTH / max_bin_count : 1.0;
    282 
    283     // Print the histogram
    284     for (int i = 0; i < n_bins; i++) {
    285         double bin_start = min_value + i * bin_width;
    286         double bin_end = bin_start + bin_width;
    287 
    288         int decimalPlaces = 1;
    289         if ((0 < bin_width) && (bin_width < 1)) {
    290             int magnitude = (int)floor(log10(bin_width));
    291             decimalPlaces = -magnitude;
    292             decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
    293         }
    294         printf("[%*.*f, %*.*f", 4 + decimalPlaces, decimalPlaces, bin_start, 4 + decimalPlaces, decimalPlaces, bin_end);
    295         char interval_delimiter = ')';
    296         if (i == (n_bins - 1)) {
    297             interval_delimiter = ']'; // last bucket is inclusive
    298         }
    299         printf("%c: ", interval_delimiter);
    300 
    301         int marks = (int)(bins[i] * scale);
    302         for (int j = 0; j < marks; j++) {
    303             printf("█");
    304         }
    305         printf(" %d\n", bins[i]);
    306     }
    307 
    308     // Free the allocated memory for bins
    309     free(bins);
    310 }
    311 
    312 void array_print_90_ci_histogram(double* xs, int n_samples, int n_bins)
    313 {
    314     // Code duplicated from previous function
    315     // I'll consider simplifying it at some future point
    316     // Possible ideas:
    317     // - having only one function that takes any confidence interval?
    318     // - having a utility function that is called by both functions?
    319     ci ci_90 = array_get_90_ci(xs, n_samples);
    320 
    321     if (n_bins <= 1) {
    322         fprintf(stderr, "Number of bins must be greater than 1.\n");
    323         return;
    324     } else if (n_samples <= 10) {
    325         fprintf(stderr, "Number of samples must be higher than 10.\n");
    326         return;
    327     }
    328 
    329     int* bins = (int*)calloc((size_t)n_bins, sizeof(int));
    330     if (bins == NULL) {
    331         fprintf(stderr, "Memory allocation for bins failed.\n");
    332         return;
    333     }
    334 
    335     double min_value = ci_90.low, max_value = ci_90.high;
    336 
    337     // Avoid division by zero for a single unique value
    338     if (min_value == max_value) {
    339         max_value++;
    340     }
    341     double bin_width = (max_value - min_value) / n_bins;
    342 
    343     // Fill the bins with sample counts
    344     int below_min = 0, above_max = 0;
    345     for (int i = 0; i < n_samples; i++) {
    346         if (xs[i] < min_value) {
    347             below_min++;
    348         } else if (xs[i] > max_value) {
    349             above_max++;
    350         } else {
    351             int bin_index = (int)((xs[i] - min_value) / bin_width);
    352             if (bin_index == n_bins) {
    353                 bin_index--; // Last bin includes max_value
    354             }
    355             bins[bin_index]++;
    356         }
    357     }
    358 
    359     // Calculate the scaling factor based on the maximum bin count
    360     int max_bin_count = 0;
    361     for (int i = 0; i < n_bins; i++) {
    362         if (bins[i] > max_bin_count) {
    363             max_bin_count = bins[i];
    364         }
    365     }
    366     const int MAX_WIDTH = 40; // Adjust this to your terminal width
    367     double scale = max_bin_count > MAX_WIDTH ? (double)MAX_WIDTH / max_bin_count : 1.0;
    368 
    369     // Print the histogram
    370     int decimalPlaces = 1;
    371     if ((0 < bin_width) && (bin_width < 1)) {
    372         int magnitude = (int)floor(log10(bin_width));
    373         decimalPlaces = -magnitude;
    374         decimalPlaces = decimalPlaces > 10 ? 10 : decimalPlaces;
    375     }
    376     printf("(%*s, %*.*f): ", 6 + decimalPlaces, "-∞", 4 + decimalPlaces, decimalPlaces, min_value);
    377     int marks_below_min = (int)(below_min * scale);
    378     for (int j = 0; j < marks_below_min; j++) {
    379         printf("█");
    380     }
    381     printf(" %d\n", below_min);
    382     for (int i = 0; i < n_bins; i++) {
    383         double bin_start = min_value + i * bin_width;
    384         double bin_end = bin_start + bin_width;
    385 
    386         printf("[%*.*f, %*.*f", 4 + decimalPlaces, decimalPlaces, bin_start, 4 + decimalPlaces, decimalPlaces, bin_end);
    387         char interval_delimiter = ')';
    388         if (i == (n_bins - 1)) {
    389             interval_delimiter = ']'; // last bucket is inclusive
    390         }
    391         printf("%c: ", interval_delimiter);
    392 
    393         int marks = (int)(bins[i] * scale);
    394         for (int j = 0; j < marks; j++) {
    395             printf("█");
    396         }
    397         printf(" %d\n", bins[i]);
    398     }
    399     printf("(%*.*f, %*s): ", 4 + decimalPlaces, decimalPlaces, max_value, 6 + decimalPlaces, "+∞");
    400     int marks_above_max = (int)(above_max * scale);
    401     for (int j = 0; j < marks_above_max; j++) {
    402         printf("█");
    403     }
    404     printf(" %d\n", above_max);
    405 
    406     // Free the allocated memory for bins
    407     free(bins);
    408 }
    409 
    410 /* Algebra manipulations */
    411 
    412 #define NORMAL90CONFIDENCE 1.6448536269514727
    413 
    414 typedef struct normal_params_t {
    415     double mean;
    416     double std;
    417 } normal_params;
    418 
    419 normal_params algebra_sum_normals(normal_params a, normal_params b)
    420 {
    421     normal_params result = {
    422         .mean = a.mean + b.mean,
    423         .std = sqrt((a.std * a.std) + (b.std * b.std)),
    424     };
    425     return result;
    426 }
    427 
    428 typedef struct lognormal_params_t {
    429     double logmean;
    430     double logstd;
    431 } lognormal_params;
    432 
    433 lognormal_params algebra_product_lognormals(lognormal_params a, lognormal_params b)
    434 {
    435     lognormal_params result = {
    436         .logmean = a.logmean + b.logmean,
    437         .logstd = sqrt((a.logstd * a.logstd) + (b.logstd * b.logstd)),
    438     };
    439     return result;
    440 }
    441 
    442 lognormal_params convert_ci_to_lognormal_params(ci x)
    443 {
    444     double loghigh = log(x.high);
    445     double loglow = log(x.low);
    446     double logmean = (loghigh + loglow) / 2.0;
    447     double logstd = (loghigh - loglow) / (2.0 * NORMAL90CONFIDENCE);
    448     lognormal_params result = { .logmean = logmean, .logstd = logstd };
    449     return result;
    450 }
    451 
    452 ci convert_lognormal_params_to_ci(lognormal_params y)
    453 {
    454     double h = y.logstd * NORMAL90CONFIDENCE;
    455     double loghigh = y.logmean + h;
    456     double loglow = y.logmean - h;
    457     ci result = { .low = exp(loglow), .high = exp(loghigh) };
    458     return result;
    459 }
	time-to-botec Benchmark sampling in different programming languages
	Log \| Files \| Refs \| README