Avatar for the OpenMathLib user
OpenMathLib
OpenBLAS
BlogDocsChangelog

Optimize gemv_n_sve_v1x3 kernel

#5292
Comparing
isharif168:optimized_gemv_n_1x3
(
1ed7eb6
) with
develop
(
02267d8
)
CodSpeed Performance Gauge
+11%
Improvements
1
Regressions
0
Untouched
61
New
0
Dropped
0
Ignored
0

Benchmarks

Improved

test_dgemv[1000-s]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[1000-s]
CodSpeed Performance Gauge
+11%
7.7 ms
7 ms

Passed

test_dgemv[100-s]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[100-s]
CodSpeed Performance Gauge
+6%
110.3 µs
103.7 µs
test_dgemv[1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[1000-d]
CodSpeed Performance Gauge
+5%
14.7 ms
13.9 ms
test_dgemv[100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[100-d]
CodSpeed Performance Gauge
+5%
147.6 µs
141 µs
test_syrk[100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[100-d]
CodSpeed Performance Gauge
0%
340.1 µs
339.3 µs
test_nrm2[100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_nrm2[100-d]
CodSpeed Performance Gauge
0%
35.5 µs
35.4 µs
test_syrk[100-c]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[100-c]
CodSpeed Performance Gauge
0%
472.7 µs
472 µs
test_gesv[100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[100-d]
CodSpeed Performance Gauge
0%
395.9 µs
395.4 µs
test_gesv[100-c]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[100-c]
CodSpeed Performance Gauge
0%
694.3 µs
693.6 µs
test_gesv[1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[1000-d]
CodSpeed Performance Gauge
0%
93.3 ms
93.3 ms
test_nrm2[1000-dz]
benchmark/pybench/benchmarks/bench_blas.py::test_nrm2[1000-dz]
CodSpeed Performance Gauge
0%
35 µs
35 µs
test_dgemv[100-c]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[100-c]
CodSpeed Performance Gauge
0%
149.4 µs
149.4 µs
test_dgemv[1000-c]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[1000-c]
CodSpeed Performance Gauge
0%
14.8 ms
14.8 ms
test_gesv[1000-c]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[1000-c]
CodSpeed Performance Gauge
0%
188.6 ms
188.6 ms
test_gesdd[mn1-d]
benchmark/pybench/benchmarks/bench_blas.py::test_gesdd[mn1-d]
CodSpeed Performance Gauge
0%
93.8 ms
93.8 ms
test_gesv[1000-s]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[1000-s]
CodSpeed Performance Gauge
0%
52.6 ms
52.6 ms
test_syrk[1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[1000-d]
CodSpeed Performance Gauge
0%
130.4 ms
130.4 ms
test_dgemv[1000-z]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[1000-z]
CodSpeed Performance Gauge
0%
26.3 ms
26.3 ms
test_syrk[1000-z]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[1000-z]
CodSpeed Performance Gauge
0%
476.4 ms
476.4 ms
test_gemm[1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[1000-d]
CodSpeed Performance Gauge
0%
239.4 ms
239.4 ms
test_gemm[1000-c]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[1000-c]
CodSpeed Performance Gauge
0%
426 ms
426 ms
test_gemm[1000-z]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[1000-z]
CodSpeed Performance Gauge
0%
875.6 ms
875.6 ms
test_syrk[1000-c]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[1000-c]
CodSpeed Performance Gauge
0%
227.5 ms
227.5 ms
test_gemm[1000-s]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[1000-s]
CodSpeed Performance Gauge
0%
117.4 ms
117.4 ms
test_syrk[1000-s]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[1000-s]
CodSpeed Performance Gauge
0%
65.4 ms
65.4 ms
test_gesv[1000-z]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[1000-z]
CodSpeed Performance Gauge
0%
353.3 ms
353.4 ms
test_gesdd[mn1-s]
benchmark/pybench/benchmarks/bench_blas.py::test_gesdd[mn1-s]
CodSpeed Performance Gauge
0%
65.2 ms
65.2 ms
test_syev[50-d]
benchmark/pybench/benchmarks/bench_blas.py::test_syev[50-d]
CodSpeed Performance Gauge
0%
1.4 ms
1.4 ms
test_gemm[100-z]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[100-z]
CodSpeed Performance Gauge
0%
1.2 ms
1.2 ms
test_syev[200-s]
benchmark/pybench/benchmarks/bench_blas.py::test_syev[200-s]
CodSpeed Performance Gauge
0%
49.1 ms
49.1 ms
test_syev[50-s]
benchmark/pybench/benchmarks/bench_blas.py::test_syev[50-s]
CodSpeed Performance Gauge
0%
1.3 ms
1.3 ms
test_syrk[100-z]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[100-z]
CodSpeed Performance Gauge
0%
855.3 µs
855.5 µs
test_gemm[100-c]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[100-c]
CodSpeed Performance Gauge
0%
659 µs
659.2 µs
test_syev[200-d]
benchmark/pybench/benchmarks/bench_blas.py::test_syev[200-d]
CodSpeed Performance Gauge
0%
58.6 ms
58.6 ms
test_gemm[100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[100-d]
CodSpeed Performance Gauge
0%
470.6 µs
470.8 µs
test_dgemv[100-z]
benchmark/pybench/benchmarks/bench_blas.py::test_dgemv[100-z]
CodSpeed Performance Gauge
0%
230.5 µs
230.6 µs
test_syrk[100-s]
benchmark/pybench/benchmarks/bench_blas.py::test_syrk[100-s]
CodSpeed Performance Gauge
0%
212.9 µs
213.1 µs
test_gesdd[mn0-d]
benchmark/pybench/benchmarks/bench_blas.py::test_gesdd[mn0-d]
CodSpeed Performance Gauge
0%
120.1 µs
120.3 µs
test_gesv[100-z]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[100-z]
CodSpeed Performance Gauge
0%
934.1 µs
935.5 µs
test_gemm[100-s]
benchmark/pybench/benchmarks/bench_blas.py::test_gemm[100-s]
CodSpeed Performance Gauge
0%
272.6 µs
273 µs
test_dgbmv[1-1000-z]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-1000-z]
CodSpeed Performance Gauge
0%
118.1 µs
118.4 µs
test_nrm2[100-dz]
benchmark/pybench/benchmarks/bench_blas.py::test_nrm2[100-dz]
CodSpeed Performance Gauge
0%
28.1 µs
28.1 µs
test_nrm2[1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_nrm2[1000-d]
CodSpeed Performance Gauge
0%
29.9 µs
30 µs
test_dgbmv[1-1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-1000-d]
CodSpeed Performance Gauge
0%
82.8 µs
83.1 µs
test_gesv[100-s]
benchmark/pybench/benchmarks/bench_blas.py::test_gesv[100-s]
CodSpeed Performance Gauge
0%
256.6 µs
257.5 µs
test_daxpy[1000-c]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[1000-c]
CodSpeed Performance Gauge
0%
32.1 µs
32.3 µs
test_gesdd[mn0-s]
benchmark/pybench/benchmarks/bench_blas.py::test_gesdd[mn0-s]
CodSpeed Performance Gauge
0%
108.4 µs
109 µs
test_daxpy[100-z]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[100-z]
CodSpeed Performance Gauge
0%
25.2 µs
25.3 µs
test_dgbmv[1-1000-c]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-1000-c]
CodSpeed Performance Gauge
0%
98.6 µs
99.1 µs
test_dgbmv[1-100-z]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-100-z]
CodSpeed Performance Gauge
-1%
41.4 µs
41.6 µs
test_daxpy[100-c]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[100-c]
CodSpeed Performance Gauge
-1%
24.6 µs
24.7 µs
test_dgbmv[1-1000-s]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-1000-s]
CodSpeed Performance Gauge
-1%
74.2 µs
74.7 µs
test_dgbmv[1-100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-100-d]
CodSpeed Performance Gauge
-1%
37.2 µs
37.5 µs
test_daxpy[1000-z]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[1000-z]
CodSpeed Performance Gauge
-1%
39.9 µs
40.2 µs
test_daxpy[100-s]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[100-s]
CodSpeed Performance Gauge
-1%
23.4 µs
23.6 µs
test_dgbmv[1-100-s]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-100-s]
CodSpeed Performance Gauge
-1%
36.5 µs
36.8 µs
test_daxpy[1000-d]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[1000-d]
CodSpeed Performance Gauge
-1%
31.7 µs
32 µs
test_dgbmv[1-100-c]
benchmark/pybench/benchmarks/bench_blas.py::test_dgbmv[1-100-c]
CodSpeed Performance Gauge
-1%
39.4 µs
39.8 µs
test_daxpy[1000-s]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[1000-s]
CodSpeed Performance Gauge
-1%
26.8 µs
27.1 µs
test_dot[100]
benchmark/pybench/benchmarks/bench_blas.py::test_dot[100]
CodSpeed Performance Gauge
-1%
21.6 µs
21.9 µs
test_dot[1000]
benchmark/pybench/benchmarks/bench_blas.py::test_dot[1000]
CodSpeed Performance Gauge
-1%
27.6 µs
28 µs
test_daxpy[100-d]
benchmark/pybench/benchmarks/bench_blas.py::test_daxpy[100-d]
CodSpeed Performance Gauge
-1%
23.5 µs
23.8 µs

Commits

Click on a commit to change the comparison range
Base
develop
02267d8
+11%
Optimize gemv_n_sve_v1x3 kernel - Calculate predicate outside the loop - Divide matrix in blocks of 3
1ed7eb6
5 days ago
by isharif168
© 2025 CodSpeed Technology
Home Terms Privacy Docs