finish xorshift updating. - time-to-botec - Benchmark sampling in different programming languages

commit c73476e5aa2db8ab7a5312d2379af6da73b0046a
parent eaee16f205a3528768276bf94f8b902eb6d6f5ea
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sat,  3 Jun 2023 04:21:59 -0600

finish xorshift updating.

Diffstat:
M C/README.md  | 2 ++
M C/out/samples  | 0 
M C/perf.txt  | 47 ++++++++++++++++++++++++-----------------------
M C/samples.c  | 11 ++++++++++-
M README.md  | 4 +++-
M time.txt  | 14 ++++++++------

6 files changed, 47 insertions(+), 31 deletions(-)
diff --git a/C/README.md b/C/README.md
@@ -11,6 +11,8 @@ This repository contains a few implementations of a simple botec (back-of-the-en
 - [ ] Add Windows/Powershell time-measuring commands
 - [ ] Add CUDA?
 - [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
+  - [x] used xorshift instead
+  - [ ] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason?
 - [x] Update repository with correct timing
 - [x] Use better profiling approach to capture timing with 1M samples.
 - [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used
diff --git a/C/out/samples b/C/out/samples
Binary files differ.
diff --git a/C/perf.txt b/C/perf.txt
@@ -1,25 +1,26 @@
+Samples: 884  of event 'cycles', Event count (approx.): 551167949
 Overhead  Command  Shared Object      Symbol
-  23.94%  samples  libc-2.31.so       [.] rand_r
-  18.14%  samples  libgomp.so.1.0.0   [.] 0x000000000001d132
-  15.43%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2ea
-  12.16%  samples  samples            [.] mixture._omp_fn.0
-   4.36%  samples  libm-2.31.so       [.] __sin_fma
-   3.49%  samples  libm-2.31.so       [.] __ieee754_log_fma
-   3.34%  samples  samples            [.] random_to
-   3.13%  samples  samples            [.] random_uniform
-   2.77%  samples  samples            [.] split_array_sum._omp_fn.0
-   2.01%  samples  samples            [.] rand_float
-   1.65%  samples  libm-2.31.so       [.] __logf_fma
-   0.88%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2f5
-   0.86%  samples  samples            [.] ur_normal
-   0.75%  samples  libm-2.31.so       [.] __expf_fma
-   0.70%  samples  libgomp.so.1.0.0   [.] 0x000000000001d13d
-   0.69%  samples  libgomp.so.1.0.0   [.] 0x000000000001d139
-   0.57%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2f1
-   0.57%  samples  samples            [.] sample_1
-   0.55%  samples  samples            [.] random_lognormal
-   0.50%  samples  [kernel.kallsyms]  [k] asm_exc_page_fault
-   0.49%  samples  [kernel.kallsyms]  [k] clear_page_rep
-   0.47%  samples  samples            [.] random_normal
-   0.38%  samples  [kernel.kallsyms]  [k] default_send_IPI_single_phys
+  35.32%  samples  samples            [.] xorshift32
+  14.09%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2ea
+  12.04%  samples  libgomp.so.1.0.0   [.] 0x000000000001d132
+  11.53%  samples  samples            [.] mixture._omp_fn.0
+   4.55%  samples  libm-2.31.so       [.] __sin_fma
+   4.24%  samples  samples            [.] rand_0_to_1
+   3.77%  samples  samples            [.] random_to
+   3.03%  samples  libm-2.31.so       [.] __logf_fma
+   1.61%  samples  libm-2.31.so       [.] __expf_fma
+   1.54%  samples  samples            [.] split_array_sum._omp_fn.0
+   1.38%  samples  samples            [.] random_uniform
+   0.94%  samples  samples            [.] ur_normal
+   0.91%  samples  libm-2.31.so       [.] __ieee754_log_fma
+   0.74%  samples  libgomp.so.1.0.0   [.] 0x000000000001d13d
+   0.52%  samples  samples            [.] sample_0
+   0.41%  samples  libm-2.31.so       [.] __sqrtf_finite@GLIBC_2.15
+   0.38%  samples  samples            [.] sample_1
+   0.36%  samples  libgomp.so.1.0.0   [.] 0x000000000001d139
+   0.36%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2f5
+   0.22%  samples  [kernel.kallsyms]  [k] native_queued_spin_lock_slowpath
+   0.18%  samples  [kernel.kallsyms]  [k] _raw_spin_lock_irq
+   0.18%  samples  samples            [.] random_lognormal
+   0.17%  samples  libgomp.so.1.0.0   [.] 0x000000000001d2f1
 
diff --git a/C/samples.c b/C/samples.c
@@ -81,7 +81,9 @@ float split_array_sum(float** meta_array, int length, int divided_into)
 
 uint32_t xorshift32(uint32_t* seed)
 {
-	/* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
+	// Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
+  // See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
+  // https://en.wikipedia.org/wiki/Xorshift
 	uint32_t x = *seed;
 	x ^= x << 13;
 	x ^= x >> 17;
@@ -93,6 +95,13 @@ uint32_t xorshift32(uint32_t* seed)
 
 float rand_0_to_1(uint32_t* seed){
 	return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
+	/* 
+	uint32_t x = *seed;
+	x ^= x << 13;
+	x ^= x >> 17;
+	x ^= x << 5;
+	return ((float)(*seed = x))/((float) UINT32_MAX);
+	*/
 	// previously:
 	// ((float)rand_r(seed) / (float)RAND_MAX) 
 	// and before that: rand, but it wasn't thread-safe.
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ The title of this repository is a pun on two meanings of "time to": "how much ti
 
 | Language                    | Time      | Lines of code |
 |-----------------------------|-----------|---------------|
-| C (optimized, 16 threads)   | 6ms       | 222 |
+| C (optimized, 16 threads)   | 5ms       | 249 |
 | Nim                         | 68ms      | 84  |
 | C (naïve implementation)    | 292ms     | 149 |
 | Javascript (NodeJS)         | 732ms     | 69  |
@@ -74,6 +74,8 @@ In fact, the C code ended up being so fast that I had to measure its time by run
 
 And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
 
+Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain by using a xorshift algorithm instead of rand_r from stdlib.
+
 Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
 
 ### NodeJS and Squiggle
diff --git a/time.txt b/time.txt
@@ -1,22 +1,24 @@
 # Optimized C
 
-$ make time-linux
+
+$ make && make time-linux
+gcc -O3   samples.c -fopenmp -lm -o out/samples
 Requires /bin/time, found on GNU/Linux systems
 
 Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
-Time using 1 thread: 24.00ms
+Time using 1 thread: 20.20ms
 
 Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
-Time using 2 threads: 21.80ms
+Time using 2 threads: 17.50ms
 
 Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
-Time for 4 threads: 24.40ms
+Time for 4 threads: 17.00ms
 
 Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
-Time using 8 threads: 10.40ms
+Time using 8 threads: 8.40ms
 
 Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
-Time using 16 threads: 6.60ms
+Time using 16 threads: 5.00ms
 
 # C

	time-to-botec Benchmark sampling in different programming languages
	Log \| Files \| Refs \| README

M	C/README.md	\|	2	++
M	C/out/samples	\|	0
M	C/perf.txt	\|	47	++++++++++++++++++++++++-----------------------
M	C/samples.c	\|	11	++++++++++-
M	README.md	\|	4	+++-
M	time.txt	\|	14	++++++++------