time-to-botec

Benchmark sampling in different programming languages

commit 6f4c975bd4324e5c77e9ac1c83123c44bce8623f
parent 829781b8a756fe62569172fec7bd530812abe42b
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sat, 24 Feb 2024 14:51:19 -0300

add fast python, other tweaks

Diffstat:
M C/makefile            |  2 +-
M C/out/samples         |  0
M README.md             |  5 +++++
M go/makefile           |  5 +++++
M go/squiggle           |  0
M go/squiggle.go        | 12 +++---------
M makefile              |  3 +++
A python/samples-fast.py| 48 ++++++++++++++++++++++++++++++++++++++++++++++++
M squiggle.c/makefile   | 16 +++++++++++++---
M squiggle.c/samples    |  0
M time.txt              |  4 ++++
11 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/C/makefile b/C/makefile
@@ -24,7 +24,7 @@
 MATH=-lm
 DEBUG= #'-g'
 STANDARD=-std=c99
 WARNINGS=-Wall
-OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
+OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version. Could also add -march=native
 LOCAL=-march=native
 OPENMP=-fopenmp
diff --git a/C/out/samples b/C/out/samples
Binary files differ.
diff --git a/README.md b/README.md
@@ -26,6 +26,7 @@ The name of this repository is a pun on two meanings of "time to": "how much tim
 |-----------------------------|-----------|---------------|
 | C                           | 6.20ms    | 252           |
 | squiggle.c                  | 7.20ms    | 29*           |
+| go                          | 32.70ms   | 150           |
 | Nim                         | 41.10ms   | 84            |
 | Lua (LuaJIT)                | 68.80ms   | 82            |
 | OCaml (flambda)             | 185.50ms  | 123           |
@@ -83,6 +84,10 @@ I like the [operator](http://duskos.org/#operator) section of [Dusk OS](http://d
 
 > Dusk OS doesn't have users, but operators. What's the difference? Control. You use a phone, you use a coffee machine, hell you even use a car these days. But you operate a bulldozer, you operate a crane, you operate a plane.
 
+### Go
+
+Go is reasonably fast, though not as fast as C. This is partly because it uses a different, slightly more robust random number generator. I have high hopes for go; hopefully it will fill the role of a C with fewer warts for me.
+
 ### NodeJS and Squiggle
 
 Using [bun](https://bun.sh/) instead of node is actually a bit slower for the raw js code. Also, both the NodeJS and the Squiggle code use [stdlib](https://stdlib.io/) in their innards, which has a bunch of interleaved functions that make the code slower. It's possible that not using that external library could make the code faster. But at the same time, the js approach does seem to be to use external libraries whenever possible.
diff --git a/go/makefile b/go/makefile
@@ -8,6 +8,11 @@
 build-complex:
 	go build -ldflags="-s -w" squiggle.go
 	# https://stackoverflow.com/questions/45003259/passing-an-optimization-flag-to-a-go-compiler
 
+build-show:
+	go build -gcflags="-m" squiggle.go
+	# https://pkg.go.dev/cmd/compile
+	# consider pgo: <https://go.dev/doc/pgo>
+
 run:
 	./squiggle
diff --git a/go/squiggle b/go/squiggle
Binary files differ.
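
A note on the README paragraph above: the "different, slightly more robust random number generator" is math/rand/v2's PCG, which the Go port seeds explicitly (the diff to go/squiggle.go below shows rand.New(rand.NewPCG(...))). A minimal, self-contained sketch of that API; the seed constants here are arbitrary, not anything the benchmark prescribes:

package main

import (
	"fmt"
	rand "math/rand/v2" // explicitly versioned import, as in go/squiggle.go
)

func main() {
	// NewPCG takes two uint64 seeds; deterministic seeding makes runs reproducible.
	r := rand.New(rand.NewPCG(1, 2))
	fmt.Println(r.Float64()) // one uniform draw in [0, 1)
}
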
diff --git a/go/squiggle.go b/go/squiggle.go
@@ -5,11 +5,11 @@
 import "math"
 import "sync"
 import rand "math/rand/v2"
 
+// https://pkg.go.dev/math/rand/v2
+
 type src = *rand.Rand
 type func64 = func(src) float64
 
-// https://pkg.go.dev/math/rand/v2
-
 func sample_unit_uniform(r src) float64 {
 	return r.Float64()
 }
@@ -75,7 +75,6 @@ func sample_mixture(fs []func64, weights []float64, r src) float64 {
 			break
 		}
 	}
-	// fmt.Println(cumsummed_normalized_weights)
 	if flag == 0 {
 		result = fs[len(fs)-1](r)
 	}
@@ -84,12 +83,6 @@
 }
 
-func slice_fill(xs []float64, fs func64, r src) {
-	for i := range xs {
-		xs[i] = fs(r)
-	}
-}
-
 func sample_parallel(f func64, n_samples int) []float64 {
 	var num_threads = 16
 	var xs = make([]float64, n_samples)
@@ -135,6 +128,7 @@ func main() {
 	avg = avg / float64(n_samples)
 	fmt.Printf("Average: %v\n", avg)
 	/*
+		// Without concurrency:
 		n_samples := 1_000_000
 		var r = rand.New(rand.NewPCG(uint64(1), uint64(2)))
 		var avg float64 = 0
diff --git a/makefile b/makefile
@@ -20,6 +20,9 @@ time-all:
 	@echo "# Squiggle (0.8.6)" && cd squiggle && make time-linux && echo && echo
 	@echo "# SquigglePy (0.27)" && cd squigglepy && make time && echo && echo
 	@echo "# squiggle.c" && cd squiggle.c && make time-linux && echo && echo
+	@echo "# squiggle.go" && cd go && make time-linux && echo && echo
+
+
 record:
 	make time-all > time.txt 2>&1
diff --git a/python/samples-fast.py b/python/samples-fast.py
@@ -0,0 +1,48 @@
+import numpy as np
+rng = np.random.default_rng(123)
+DEFAULT_N = 1000000
+
+
+def normal(mean, std, n=DEFAULT_N):
+    return rng.normal(mean, std, n)
+
+
+def lognormal(mean, std, n=DEFAULT_N):
+    return rng.lognormal(mean, std, n)
+
+
+def to(low, high, n=DEFAULT_N):
+    normal95confidencePoint = 1.6448536269514722
+    logLow = np.log(low)
+    logHigh = np.log(high)
+    meanlog = (logLow + logHigh) / 2
+    sdlog = (logHigh - logLow) / (2 * normal95confidencePoint)
+    return lognormal(meanlog, sdlog, n)
+
+
+def optimized_mixture(samples_funcs, weights_array, n=DEFAULT_N):
+    normalized_weights = weights_array / sum(weights_array)
+    cummulative_sums = np.cumsum(normalized_weights)
+    helper_probs = rng.random(n)
+    results = np.empty(n)
+    for i, (start, end) in enumerate(zip([0]+list(cummulative_sums[:-1]), cummulative_sums)):
+        idx = np.where((helper_probs >= start) & (helper_probs < end))[0]
+        # Generate only as many samples as needed for each distribution
+        samples_func = samples_funcs[i]
+        results[idx] = samples_func(n=len(idx))
+    return results
+
+
+p_a = 0.8
+p_b = 0.5
+p_c = p_a * p_b
+dists = [
+    lambda n=1: np.zeros(n),  # Distribution returning 0
+    lambda n=1: np.ones(n),  # Distribution returning 1
+    lambda n=1: to(1, 3, n),
+    lambda n=1: to(2, 10, n)
+]
+weights = np.array([1 - p_c, p_c/2, p_c/4, p_c/4])
+result = optimized_mixture(dists, weights)
+mean_result = np.mean(result)
+print(f'Mean result: {mean_result}')
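
The hunks from go/squiggle.go above only show fragments of sample_parallel (its signature, num_threads = 16, and the output slice). A rough sketch of what that shape implies: each goroutine fills its own disjoint segment of the shared slice with an independently seeded PCG, so no locking is needed. The per-goroutine seeding scheme and chunking details here are my assumptions, not a copy of the file:

package main

import (
	"fmt"
	rand "math/rand/v2"
	"sync"
)

// sampleParallel: hypothetical reconstruction of the pattern, not the repo's code.
func sampleParallel(f func(*rand.Rand) float64, nSamples int) []float64 {
	numThreads := 16
	xs := make([]float64, nSamples)
	chunk := nSamples / numThreads
	var wg sync.WaitGroup
	for t := 0; t < numThreads; t++ {
		wg.Add(1)
		go func(t int) {
			defer wg.Done()
			// Assumed seeding: one independent PCG stream per goroutine.
			r := rand.New(rand.NewPCG(uint64(t+1), uint64(t+2)))
			lo, hi := t*chunk, (t+1)*chunk
			if t == numThreads-1 {
				hi = nSamples // last goroutine picks up the remainder
			}
			for i := lo; i < hi; i++ {
				xs[i] = f(r)
			}
		}(t)
	}
	wg.Wait()
	return xs
}

func main() {
	xs := sampleParallel(func(r *rand.Rand) float64 { return r.Float64() }, 1_000_000)
	var avg float64
	for _, x := range xs {
		avg += x
	}
	fmt.Printf("Average: %v\n", avg/float64(len(xs)))
}
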
diff --git a/squiggle.c/makefile b/squiggle.c/makefile
@@ -1,8 +1,18 @@
 OUTPUT=./samples
 CC=gcc
+OPTIMIZATIONS=-funit-at-a-time -march=native -fno-math-errno -ffast-math -std=gnu99 -fno-unroll-loops -flto
 
 build:
-	$(CC) -O3 -march=native samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+	$(CC) -O3 samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+
+experimental:
+	# https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
+	rm -f *.gcda
+	$(CC) -Ofast -fprofile-generate $(OPTIMIZATIONS) samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+	./$(OUTPUT)
+	$(CC) -Ofast -fprofile-use $(OPTIMIZATIONS) samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+	rm *.gcda
+	# Using -Ofast increases speed a bit, but I don't trust it. <https://stackoverflow.com/questions/61232427/gcc-differences-between-o3-vs-ofast-optimizations>
 
 install:
 	rm -r squiggle_c
@@ -25,8 +35,8 @@ install-git:
 	sed -i 's|../../..|squiggle_c|' samples.c
 
 time-linux:
-	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
-	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
+	@echo "Running 1000x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..1000}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 1000" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
 
 install-small:
 	rm -r squiggle_c
diff --git a/squiggle.c/samples b/squiggle.c/samples
Binary files differ.
diff --git a/time.txt b/time.txt
@@ -92,4 +92,8 @@ sys	0m2.226s
 
 Running 100x and taking avg time: OMP_NUM_THREADS=16 ./samples
 Time using 16 threads: 7.20ms
+
+# go
+make time-linux
+Running 100x and taking avg time: ./squiggle
+Time using 16 threads: 32.70ms
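
As a cross-check of the confidence-interval arithmetic in python/samples-fast.py's to() above: log(low) and log(high) are treated as the 5th and 95th percentiles of the underlying normal, which sit 1.6449 standard deviations from its mean, hence meanlog = (logLow + logHigh)/2 and sdlog = (logHigh - logLow)/(2 * 1.6449). A Go port of the same formula; the function and variable names are mine, not from the repo:

package main

import (
	"fmt"
	"math"
	rand "math/rand/v2"
)

// to samples a lognormal whose 90% confidence interval is (low, high).
// This mirrors to() in python/samples-fast.py; the Go translation is a sketch.
func to(low, high float64, r *rand.Rand) float64 {
	const normal95 = 1.6448536269514722 // z-score of the 95th percentile
	logLow := math.Log(low)
	logHigh := math.Log(high)
	meanlog := (logLow + logHigh) / 2
	sdlog := (logHigh - logLow) / (2 * normal95)
	return math.Exp(meanlog + sdlog*r.NormFloat64())
}

func main() {
	r := rand.New(rand.NewPCG(1, 2)) // arbitrary seeds
	n := 1_000_000
	var avg float64
	for i := 0; i < n; i++ {
		avg += to(1, 3, r)
	}
	fmt.Printf("Average of to(1, 3): %v\n", avg/float64(n))
}
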