time-to-botec

Benchmark sampling in different programming languages

commit 6f4c975bd4324e5c77e9ac1c83123c44bce8623f
parent 829781b8a756fe62569172fec7bd530812abe42b
Author: NunoSempere <nuno.sempere@protonmail.com>
Date:   Sat, 24 Feb 2024 14:51:19 -0300

add fast python, other tweaks

Diffstat:
M C/makefile            |  2 +-
M C/out/samples         |  0
M README.md             |  5 +++++
M go/makefile           |  5 +++++
M go/squiggle           |  0
M go/squiggle.go        | 12 +++---------
M makefile              |  3 +++
A python/samples-fast.py| 48 ++++++++++++++++++++++++++++++++++++++++++++++++
M squiggle.c/makefile   | 16 +++++++++++++---
M squiggle.c/samples    |  0
M time.txt              |  4 ++++
11 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/C/makefile b/C/makefile
@@ -24,7 +24,7 @@
 MATH=-lm
 DEBUG= #'-g'
 STANDARD=-std=c99
 WARNINGS=-Wall
-OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version
+OPTIMIZED=-O3 #-O3 actually gives better performance than -Ofast, at least for this version. Could also add -march=native
 LOCAL=-march=native
 OPENMP=-fopenmp
diff --git a/C/out/samples b/C/out/samples
Binary files differ.
diff --git a/README.md b/README.md
@@ -26,6 +26,7 @@ The name of this repository is a pun on two meanings of "time to": "how much tim
 |-----------------------------|-----------|---------------|
 | C                           | 6.20ms    | 252           |
 | squiggle.c                  | 7.20ms    | 29*           |
+| go                          | 32.70ms   | 150           |
 | Nim                         | 41.10ms   | 84            |
 | Lua (LuaJIT)                | 68.80ms   | 82            |
 | OCaml (flambda)             | 185.50ms  | 123           |
@@ -83,6 +84,10 @@ I like the [operator](http://duskos.org/#operator) section of [Dusk OS](http://d
 
 > Dusk OS doesn't have users, but operators. What's the difference? Control. You use a phone, you use a coffee machine, hell you even use a car these days. But you operate a bulldozer, you operate a crane, you operate a plane.
 
+### Go
+
+Go is reasonably fast, though not as fast as C. This is partly because it uses a different, slightly more robust random number generator. I have high hopes for go; hopefully it will fill the role of a C with fewer warts for me.
+
 ### NodeJS and Squiggle
 
 Using [bun](https://bun.sh/) instead of node is actually a bit slower for the raw js code. Also, both the NodeJS and the Squiggle code use [stdlib](https://stdlib.io/) in their innards, which has a bunch of interleaved functions that make the code slower. It's possible that not using that external library could make the code faster. But at the same time, the js approach does seem to be to use external libraries whenever possible.
diff --git a/go/makefile b/go/makefile
@@ -8,6 +8,11 @@
 build-complex:
 	go build -ldflags="-s -w" squiggle.go
 	# https://stackoverflow.com/questions/45003259/passing-an-optimization-flag-to-a-go-compiler
 
+build-show:
+	go build -gcflags="-m" squiggle.go
+	# https://pkg.go.dev/cmd/compile
+	# consider pgo: <https://go.dev/doc/pgo>
+
 run:
 	./squiggle
diff --git a/go/squiggle b/go/squiggle
Binary files differ.
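
A note on the README paragraph above: the "different, slightly more robust random number generator" is math/rand/v2's PCG, which the Go port seeds explicitly (the diff to go/squiggle.go below shows rand.New(rand.NewPCG(...))). A minimal, self-contained sketch of that API; the seed constants here are arbitrary, not anything the benchmark prescribes:

package main

import (
	"fmt"
	rand "math/rand/v2" // explicitly versioned import, as in go/squiggle.go
)

func main() {
	// NewPCG takes two uint64 seeds; deterministic seeding makes runs reproducible.
	r := rand.New(rand.NewPCG(1, 2))
	fmt.Println(r.Float64()) // one uniform draw in [0, 1)
}
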
diff --git a/go/squiggle.go b/go/squiggle.go
@@ -5,11 +5,11 @@
 import "math"
 import "sync"
 import rand "math/rand/v2"
 
+// https://pkg.go.dev/math/rand/v2
+
 type src = *rand.Rand
 type func64 = func(src) float64
 
-// https://pkg.go.dev/math/rand/v2
-
 func sample_unit_uniform(r src) float64 {
 	return r.Float64()
 }
@@ -75,7 +75,6 @@ func sample_mixture(fs []func64, weights []float64, r src) float64 {
 			break
 		}
 	}
-	// fmt.Println(cumsummed_normalized_weights)
 	if flag == 0 {
 		result = fs[len(fs)-1](r)
 	}
@@ -84,12 +83,6 @@
 }
 
-func slice_fill(xs []float64, fs func64, r src) {
-	for i := range xs {
-		xs[i] = fs(r)
-	}
-}
-
 func sample_parallel(f func64, n_samples int) []float64 {
 	var num_threads = 16
 	var xs = make([]float64, n_samples)
@@ -135,6 +128,7 @@ func main() {
 	avg = avg / float64(n_samples)
 	fmt.Printf("Average: %v\n", avg)
 	/*
+		// Without concurrency:
 		n_samples := 1_000_000
 		var r = rand.New(rand.NewPCG(uint64(1), uint64(2)))
 		var avg float64 = 0
diff --git a/makefile b/makefile
@@ -20,6 +20,9 @@ time-all:
 	@echo "# Squiggle (0.8.6)" && cd squiggle && make time-linux && echo && echo
 	@echo "# SquigglePy (0.27)" && cd squigglepy && make time && echo && echo
 	@echo "# squiggle.c" && cd squiggle.c && make time-linux && echo && echo
+	@echo "# squiggle.go" && cd go && make time-linux && echo && echo
+
+
 record:
 	make time-all > time.txt 2>&1
diff --git a/python/samples-fast.py b/python/samples-fast.py
@@ -0,0 +1,48 @@
+import numpy as np
+rng = np.random.default_rng(123)
+DEFAULT_N = 1000000
+
+
+def normal(mean, std, n=DEFAULT_N):
+    return rng.normal(mean, std, n)
+
+
+def lognormal(mean, std, n=DEFAULT_N):
+    return rng.lognormal(mean, std, n)
+
+
+def to(low, high, n=DEFAULT_N):
+    normal95confidencePoint = 1.6448536269514722
+    logLow = np.log(low)
+    logHigh = np.log(high)
+    meanlog = (logLow + logHigh) / 2
+    sdlog = (logHigh - logLow) / (2 * normal95confidencePoint)
+    return lognormal(meanlog, sdlog, n)
+
+
+def optimized_mixture(samples_funcs, weights_array, n=DEFAULT_N):
+    normalized_weights = weights_array / sum(weights_array)
+    cummulative_sums = np.cumsum(normalized_weights)
+    helper_probs = rng.random(n)
+    results = np.empty(n)
+    for i, (start, end) in enumerate(zip([0]+list(cummulative_sums[:-1]), cummulative_sums)):
+        idx = np.where((helper_probs >= start) & (helper_probs < end))[0]
+        # Generate only as many samples as needed for each distribution
+        samples_func = samples_funcs[i]
+        results[idx] = samples_func(n=len(idx))
+    return results
+
+
+p_a = 0.8
+p_b = 0.5
+p_c = p_a * p_b
+dists = [
+    lambda n=1: np.zeros(n),  # Distribution returning 0
+    lambda n=1: np.ones(n),  # Distribution returning 1
+    lambda n=1: to(1, 3, n),
+    lambda n=1: to(2, 10, n)
+]
+weights = np.array([1 - p_c, p_c/2, p_c/4, p_c/4])
+result = optimized_mixture(dists, weights)
+mean_result = np.mean(result)
+print(f'Mean result: {mean_result}')
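
The hunks from go/squiggle.go above only show fragments of sample_parallel (its signature, num_threads = 16, and the output slice). A rough sketch of what that shape implies: each goroutine fills its own disjoint segment of the shared slice with an independently seeded PCG, so no locking is needed. The per-goroutine seeding scheme and chunking details here are my assumptions, not a copy of the file:

package main

import (
	"fmt"
	rand "math/rand/v2"
	"sync"
)

// sampleParallel: hypothetical reconstruction of the pattern, not the repo's code.
func sampleParallel(f func(*rand.Rand) float64, nSamples int) []float64 {
	numThreads := 16
	xs := make([]float64, nSamples)
	chunk := nSamples / numThreads
	var wg sync.WaitGroup
	for t := 0; t < numThreads; t++ {
		wg.Add(1)
		go func(t int) {
			defer wg.Done()
			// Assumed seeding: one independent PCG stream per goroutine.
			r := rand.New(rand.NewPCG(uint64(t+1), uint64(t+2)))
			lo, hi := t*chunk, (t+1)*chunk
			if t == numThreads-1 {
				hi = nSamples // last goroutine picks up the remainder
			}
			for i := lo; i < hi; i++ {
				xs[i] = f(r)
			}
		}(t)
	}
	wg.Wait()
	return xs
}

func main() {
	xs := sampleParallel(func(r *rand.Rand) float64 { return r.Float64() }, 1_000_000)
	var avg float64
	for _, x := range xs {
		avg += x
	}
	fmt.Printf("Average: %v\n", avg/float64(len(xs)))
}
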
diff --git a/squiggle.c/makefile b/squiggle.c/makefile
@@ -1,8 +1,18 @@
 OUTPUT=./samples
 CC=gcc
+OPTIMIZATIONS=-funit-at-a-time -march=native -fno-math-errno -ffast-math -std=gnu99 -fno-unroll-loops -flto
 
 build:
-	$(CC) -O3 -march=native samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+	$(CC) -O3 samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+
+experimental:
+	# https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html
+	rm -f *.gcda
+	$(CC) -Ofast -fprofile-generate $(OPTIMIZATIONS) samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+	./$(OUTPUT)
+	$(CC) -Ofast -fprofile-use $(OPTIMIZATIONS) samples.c ./squiggle_c/squiggle.c ./squiggle_c/squiggle_more.c -lm -fopenmp -o $(OUTPUT)
+	rm *.gcda
+	# Using -Ofast increases speed a bit, but I don't trust it. <https://stackoverflow.com/questions/61232427/gcc-differences-between-o3-vs-ofast-optimizations>
 
 install:
 	rm -r squiggle_c
@@ -25,8 +35,8 @@ install-git:
 	sed -i 's|../../..|squiggle_c|' samples.c
 
 time-linux:
-	@echo "Running 100x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
-	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..100}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 100" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
+	@echo "Running 1000x and taking avg time: OMP_NUM_THREADS=16 $(OUTPUT)"
+	@t=$$(/usr/bin/time -f "%e" -p bash -c 'for i in {1..1000}; do OMP_NUM_THREADS=16 $(OUTPUT); done' 2>&1 >/dev/null | grep real | awk '{print $$2}' ); echo "scale=2; 1000 * $$t / 1000" | bc | sed "s|^|Time using 16 threads: |" | sed 's|$$|ms|' && echo
 
 install-small:
 	rm -r squiggle_c
diff --git a/squiggle.c/samples b/squiggle.c/samples
Binary files differ.
diff --git a/time.txt b/time.txt
@@ -92,4 +92,8 @@ sys	0m2.226s
 
 Running 100x and taking avg time: OMP_NUM_THREADS=16 ./samples
 Time using 16 threads: 7.20ms
+
+# go
+make time-linux
+Running 100x and taking avg time: ./squiggle
+Time using 16 threads: 32.70ms
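
As a cross-check of the confidence-interval arithmetic in python/samples-fast.py's to() above: log(low) and log(high) are treated as the 5th and 95th percentiles of the underlying normal, which sit 1.6449 standard deviations from its mean, hence meanlog = (logLow + logHigh)/2 and sdlog = (logHigh - logLow)/(2 * 1.6449). A Go port of the same formula; the function and variable names are mine, not from the repo:

package main

import (
	"fmt"
	"math"
	rand "math/rand/v2"
)

// to samples a lognormal whose 90% confidence interval is (low, high).
// This mirrors to() in python/samples-fast.py; the Go translation is a sketch.
func to(low, high float64, r *rand.Rand) float64 {
	const normal95 = 1.6448536269514722 // z-score of the 95th percentile
	logLow := math.Log(low)
	logHigh := math.Log(high)
	meanlog := (logLow + logHigh) / 2
	sdlog := (logHigh - logLow) / (2 * normal95)
	return math.Exp(meanlog + sdlog*r.NormFloat64())
}

func main() {
	r := rand.New(rand.NewPCG(1, 2)) // arbitrary seeds
	n := 1_000_000
	var avg float64
	for i := 0; i < n; i++ {
		avg += to(1, 3, r)
	}
	fmt.Printf("Average of to(1, 3): %v\n", avg/float64(n))
}
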