add failed example of loop unrolling - squiggle.c - Self-contained Monte Carlo estimation in C99

commit c25e9f916f001ef84681eda43f356628d55f29aa
parent a50d776d2cbddcc0f960cdc879f47d3aab91327b
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Fri, 12 Jan 2024 20:41:06 +0100

add failed example of loop unrolling

Diffstat:
M examples/more/00_example_template/example  | 0 
M examples/more/01_sample_from_cdf/example  | 0 
M examples/more/02_sample_from_cdf_beta/example  | 0 
M examples/more/03_ci_beta/example  | 0 
M examples/more/04_nuclear_war/example  | 0 
M examples/more/05_burn_10kg_fat/example  | 0 
M examples/more/06_nuclear_recovery/example  | 0 
M examples/more/07_algebra/example  | 0 
M examples/more/08_algebra_and_conversion/example  | 0 
M examples/more/09_ergonomic_algebra/example  | 0 
M examples/more/10_twitter_thread_example/example  | 0 
M examples/more/11_billion_lognormals_paralell/example  | 0 
M examples/more/12_time_to_botec_parallel/example  | 0 
M examples/more/13_parallelize_min/example  | 0 
M examples/more/14_check_confidence_interval/example  | 0 
M makefile  | 6 ++++++
M squiggle_more.c  | 30 ++++++++++++++++++++++++++++++

17 files changed, 36 insertions(+), 0 deletions(-)
diff --git a/examples/more/00_example_template/example b/examples/more/00_example_template/example
Binary files differ.
diff --git a/examples/more/01_sample_from_cdf/example b/examples/more/01_sample_from_cdf/example
Binary files differ.
diff --git a/examples/more/02_sample_from_cdf_beta/example b/examples/more/02_sample_from_cdf_beta/example
Binary files differ.
diff --git a/examples/more/03_ci_beta/example b/examples/more/03_ci_beta/example
Binary files differ.
diff --git a/examples/more/04_nuclear_war/example b/examples/more/04_nuclear_war/example
Binary files differ.
diff --git a/examples/more/05_burn_10kg_fat/example b/examples/more/05_burn_10kg_fat/example
Binary files differ.
diff --git a/examples/more/06_nuclear_recovery/example b/examples/more/06_nuclear_recovery/example
Binary files differ.
diff --git a/examples/more/07_algebra/example b/examples/more/07_algebra/example
Binary files differ.
diff --git a/examples/more/08_algebra_and_conversion/example b/examples/more/08_algebra_and_conversion/example
Binary files differ.
diff --git a/examples/more/09_ergonomic_algebra/example b/examples/more/09_ergonomic_algebra/example
Binary files differ.
diff --git a/examples/more/10_twitter_thread_example/example b/examples/more/10_twitter_thread_example/example
Binary files differ.
diff --git a/examples/more/11_billion_lognormals_paralell/example b/examples/more/11_billion_lognormals_paralell/example
Binary files differ.
diff --git a/examples/more/12_time_to_botec_parallel/example b/examples/more/12_time_to_botec_parallel/example
Binary files differ.
diff --git a/examples/more/13_parallelize_min/example b/examples/more/13_parallelize_min/example
Binary files differ.
diff --git a/examples/more/14_check_confidence_interval/example b/examples/more/14_check_confidence_interval/example
Binary files differ.
diff --git a/makefile b/makefile
@@ -4,6 +4,9 @@ MAKEFLAGS += --no-print-directory
 STYLE_BLUEPRINT=webkit
 FORMATTER=clang-format -i -style=$(STYLE_BLUEPRINT)
 
+## Time to botec
+TTB=./examples/more/12_time_to_botec_parallel/example
+
 build-examples:
 	cd examples/core && make all
 	cd examples/more && make all
@@ -25,3 +28,6 @@ profile:
 	sudo perf report
 	rm perf.data
 
+time-linux: 
+	@echo "Running 100x and taking avg time: $(TTB)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do $(TTB); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
diff --git a/squiggle_more.c b/squiggle_more.c
@@ -64,6 +64,8 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
     {
 #pragma omp for
         for (i = 0; i < n_threads; i++) {
+            // Simple version 
+            /*
             int quotient = n_samples / n_threads;
             int lower_bound_inclusive = i * quotient;
             int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
@@ -71,6 +73,34 @@ void sampler_parallel(double (*sampler)(uint64_t* seed), double* results, int n_
                 results[j] = sampler(&(cache_box[i].seed));
                 // Could also result in inefficient cache stuff, but hopefully not too often
             }
+            */
+            
+            // Version with loop unrolling
+            int quotient = n_samples / n_threads;
+            int lower_bound_inclusive = i * quotient;
+            int upper_bound_not_inclusive = ((i + 1) * quotient); // note the < in the for loop below,
+            int delta = quotient;
+            int eighth_of_deltas = delta/8; // why 8? a double is 8 bytes, so 8 doubles span a typical 64-byte cache line
+            int k;
+            // to do: simplify these variables. Maybe divide by n_threads * 8 directly
+            for(int j=0; j<eighth_of_deltas; j++){
+                k = lower_bound_inclusive + j*8;
+                results[k+0] = sampler(&(cache_box[i].seed));
+                results[k+1] = sampler(&(cache_box[i].seed));
+                results[k+2] = sampler(&(cache_box[i].seed));
+                results[k+3] = sampler(&(cache_box[i].seed));
+                results[k+4] = sampler(&(cache_box[i].seed));
+                results[k+5] = sampler(&(cache_box[i].seed));
+                results[k+6] = sampler(&(cache_box[i].seed));
+                results[k+7] = sampler(&(cache_box[i].seed));
+                // these all fit one single cache line
+                // name of the technique: loop unrolling.
+            }
+            for(int k=eighth_of_deltas*8; k<upper_bound_not_inclusive; k++){
+                results[k] = sampler(&(cache_box[i].seed));
+
+            }
+            
         }
     }
     for (int j = divisor_multiple; j < n_samples; j++) {

	squiggle.c Self-contained Monte Carlo estimation in C99
	Log \| Files \| Refs \| README

M	examples/more/00_example_template/example	\|	0
M	examples/more/01_sample_from_cdf/example	\|	0
M	examples/more/02_sample_from_cdf_beta/example	\|	0
M	examples/more/03_ci_beta/example	\|	0
M	examples/more/04_nuclear_war/example	\|	0
M	examples/more/05_burn_10kg_fat/example	\|	0
M	examples/more/06_nuclear_recovery/example	\|	0
M	examples/more/07_algebra/example	\|	0
M	examples/more/08_algebra_and_conversion/example	\|	0
M	examples/more/09_ergonomic_algebra/example	\|	0
M	examples/more/10_twitter_thread_example/example	\|	0
M	examples/more/11_billion_lognormals_paralell/example	\|	0
M	examples/more/12_time_to_botec_parallel/example	\|	0
M	examples/more/13_parallelize_min/example	\|	0
M	examples/more/14_check_confidence_interval/example	\|	0
M	makefile	\|	6	++++++
M	squiggle_more.c	\|	30	++++++++++++++++++++++++++++++