commit c73476e5aa2db8ab7a5312d2379af6da73b0046a
parent eaee16f205a3528768276bf94f8b902eb6d6f5ea
Author: NunoSempere <nuno.sempere@protonmail.com>
Date: Sat, 3 Jun 2023 04:21:59 -0600
finish xorshift updating.
Diffstat:
6 files changed, 47 insertions(+), 31 deletions(-)
diff --git a/C/README.md b/C/README.md
@@ -11,6 +11,8 @@ This repository contains a few implementations of a simple botec (back-of-the-en
- [ ] Add Windows/Powershell time-measuring commands
- [ ] Add CUDA?
- [x] Added results of perf. `rand_r` seems like a big chunk of it, but I'm hesitant to use lower-quality random numbers
+ - [x] used xorshift instead
+ - [ ] Use xorshift with a struct instead of a pointer? idk, could be faster for some reason?
- [x] Update repository with correct timing
- [x] Use better profiling approach to capture timing with 1M samples.
- [x] See if program can be reworded so as to use multithreading effectively, e.g., so that you see speed gains proportional to the number of threads used
diff --git a/C/out/samples b/C/out/samples
Binary files differ.
diff --git a/C/perf.txt b/C/perf.txt
@@ -1,25 +1,26 @@
+Samples: 884 of event 'cycles', Event count (approx.): 551167949
Overhead Command Shared Object Symbol
- 23.94% samples libc-2.31.so [.] rand_r
- 18.14% samples libgomp.so.1.0.0 [.] 0x000000000001d132
- 15.43% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
- 12.16% samples samples [.] mixture._omp_fn.0
- 4.36% samples libm-2.31.so [.] __sin_fma
- 3.49% samples libm-2.31.so [.] __ieee754_log_fma
- 3.34% samples samples [.] random_to
- 3.13% samples samples [.] random_uniform
- 2.77% samples samples [.] split_array_sum._omp_fn.0
- 2.01% samples samples [.] rand_float
- 1.65% samples libm-2.31.so [.] __logf_fma
- 0.88% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
- 0.86% samples samples [.] ur_normal
- 0.75% samples libm-2.31.so [.] __expf_fma
- 0.70% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
- 0.69% samples libgomp.so.1.0.0 [.] 0x000000000001d139
- 0.57% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1
- 0.57% samples samples [.] sample_1
- 0.55% samples samples [.] random_lognormal
- 0.50% samples [kernel.kallsyms] [k] asm_exc_page_fault
- 0.49% samples [kernel.kallsyms] [k] clear_page_rep
- 0.47% samples samples [.] random_normal
- 0.38% samples [kernel.kallsyms] [k] default_send_IPI_single_phys
+ 35.32% samples samples [.] xorshift32
+ 14.09% samples libgomp.so.1.0.0 [.] 0x000000000001d2ea
+ 12.04% samples libgomp.so.1.0.0 [.] 0x000000000001d132
+ 11.53% samples samples [.] mixture._omp_fn.0
+ 4.55% samples libm-2.31.so [.] __sin_fma
+ 4.24% samples samples [.] rand_0_to_1
+ 3.77% samples samples [.] random_to
+ 3.03% samples libm-2.31.so [.] __logf_fma
+ 1.61% samples libm-2.31.so [.] __expf_fma
+ 1.54% samples samples [.] split_array_sum._omp_fn.0
+ 1.38% samples samples [.] random_uniform
+ 0.94% samples samples [.] ur_normal
+ 0.91% samples libm-2.31.so [.] __ieee754_log_fma
+ 0.74% samples libgomp.so.1.0.0 [.] 0x000000000001d13d
+ 0.52% samples samples [.] sample_0
+ 0.41% samples libm-2.31.so [.] __sqrtf_finite@GLIBC_2.15
+ 0.38% samples samples [.] sample_1
+ 0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d139
+ 0.36% samples libgomp.so.1.0.0 [.] 0x000000000001d2f5
+ 0.22% samples [kernel.kallsyms] [k] native_queued_spin_lock_slowpath
+ 0.18% samples [kernel.kallsyms] [k] _raw_spin_lock_irq
+ 0.18% samples samples [.] random_lognormal
+ 0.17% samples libgomp.so.1.0.0 [.] 0x000000000001d2f1
diff --git a/C/samples.c b/C/samples.c
@@ -81,7 +81,9 @@ float split_array_sum(float** meta_array, int length, int divided_into)
uint32_t xorshift32(uint32_t* seed)
{
- /* Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs" */
+ // Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs"
+ // See <https://stackoverflow.com/questions/53886131/how-does-xorshift32-works>
+ // https://en.wikipedia.org/wiki/Xorshift
uint32_t x = *seed;
x ^= x << 13;
x ^= x >> 17;
@@ -93,6 +95,13 @@ uint32_t xorshift32(uint32_t* seed)
float rand_0_to_1(uint32_t* seed){
return ((float) xorshift32(seed)) / ((float) UINT32_MAX);
+ /*
+ uint32_t x = *seed;
+ x ^= x << 13;
+ x ^= x >> 17;
+ x ^= x << 5;
+ return ((float)(*seed = x))/((float) UINT32_MAX);
+ */
// previously:
// ((float)rand_r(seed) / (float)RAND_MAX)
// and before that: rand, but it wasn't thread-safe.
diff --git a/README.md b/README.md
@@ -33,7 +33,7 @@ The title of this repository is a pun on two meanings of "time to": "how much ti
| Language | Time | Lines of code |
|-----------------------------|-----------|---------------|
-| C (optimized, 16 threads) | 6ms | 222 |
+| C (optimized, 16 threads) | 5ms | 249 |
| Nim | 68ms | 84 |
| C (naïve implementation) | 292ms | 149 |
| Javascript (NodeJS) | 732ms | 69 |
@@ -74,6 +74,8 @@ In fact, the C code ended up being so fast that I had to measure its time by run
And still, there are some missing optimizations, like tweaking the code to take into account cache misses. I'm not exactly sure how that would go, though.
+Once the code was at 6.6ms, there was a 0.6ms gain possible by using OMP better, and a 1ms gain by using a xorshift algorithm instead of rand_r from stdlib.
+
Although the above paragraphs were written in the first person, the C code was written together with Jorge Sierra, who translated the algorithmic improvements from nim to it and added the initial multithreading support.
### NodeJS and Squiggle
diff --git a/time.txt b/time.txt
@@ -1,22 +1,24 @@
# Optimized C
-$ make time-linux
+
+$ make && make time-linux
+gcc -O3 samples.c -fopenmp -lm -o out/samples
Requires /bin/time, found on GNU/Linux systems
Running 100x and taking avg time: OMP_NUM_THREADS=1 out/samples
-Time using 1 thread: 24.00ms
+Time using 1 thread: 20.20ms
Running 100x and taking avg time: OMP_NUM_THREADS=2 out/samples
-Time using 2 threads: 21.80ms
+Time using 2 threads: 17.50ms
Running 100x and taking avg time: OMP_NUM_THREADS=4 out/samples
-Time for 4 threads: 24.40ms
+Time for 4 threads: 17.00ms
Running 100x and taking avg time: OMP_NUM_THREADS=8 out/samples
-Time using 8 threads: 10.40ms
+Time using 8 threads: 8.40ms
Running 100x and taking avg time: OMP_NUM_THREADS=16 out/samples
-Time using 16 threads: 6.60ms
+Time using 16 threads: 5.00ms
# C