Speeding up Python on the Elbrus architecture
Configuring with `--with-computed-gotos=no` is essential: `configure` detects
that computed gotos are available, but enabling them slows down the Python
interpreter by a factor of 4!
Use the patch below to speed up the interpreter by about 10-15%.
Use `--enable-optimizations` to get a speedup of around 10%.
Patching with `sed` and `awk` makes the patch more resilient to changes in the Python sources.
Tested with Python 2.7.18 using LCC 1.25.15:
# Run from the top of the Python 2.7 source tree (requires GNU sed for -i and \n).

# unsupported profiling option: strip it from the configure script(s)
sed -i "s| -fprofile-correction||" configure*
# need this option for LCC because tests use threads
sed -i "s|-fprofile-generate|-fprofile-generate-parallel|" configure*
# LCC profiling bug workaround: on the line that follows the "Modules/_math.o:"
# rule header, filter the profile-generation flag out of the compile flags
sed -i "/^Modules\\/_math.o:/{n;s|\$(CCSHARED) \$(PY_CFLAGS)|\$(filter-out -fprofile-generate-parallel,\$(CCSHARED) \$(PY_CFLAGS))|}" Makefile.pre.in

# faster interpreter on Elbrus
# 1. At the first "#if USE_COMPUTED_GOTOS": drop that line and everything after
#    it up to (not including) the first line containing DISPATCH, then insert
#    definitions that force USE_COMPUTED_GOTOS to 0 and provide switch/case
#    versions of the TARGET* macros. The trailing ":a;n;ba" copies the rest of
#    the file through untouched, so only the first match is edited.
#    NOTE(review): the inserted "#if 1" presumably pairs with the #else/#endif
#    left over from the removed conditional -- confirm against ceval.c.
sed -i "/#if USE_COMPUTED_GOTOS/{:b;g;N;/DISPATCH/!bb;s|^|#undef USE_COMPUTED_GOTOS\n#define USE_COMPUTED_GOTOS 0\n#if 1\n#define TARGET(op) case op:\n#define TARGET_WITH_IMPL(op, impl) if(0)goto impl;case op:\n#define TARGET_NOARG TARGET\n#define TARGET_WITH_IMPL_NOARG TARGET_WITH_IMPL|;:a;n;ba}" Python/ceval.c
# 2. Replace the computed jump "goto *opcode_targets[*next_instr++]" with an
#    explicit opcode/oparg fetch followed by a jump to the switch_loop label.
sed -i "s|goto \\*opcode_targets\\[\\*next_instr++\\]|opcode=NEXTOP();oparg=0;if(HAS_ARG(opcode))oparg=NEXTARG();goto switch_loop|" Python/ceval.c
# 3. Put the switch_loop label in front of the first "switch (opcode) {".
sed -i "/switch (opcode) {/{s|^|switch_loop:|;:a;n;ba}" Python/ceval.c
# 4. Two lines below each "_unknown_opcode:" label, append an unreachable
#    marker and pull in the generated list of case labels.
sed -i "/_unknown_opcode:/{n;n;s|$|__builtin_unreachable();\n#include \"opcode_unknown.h\"|}" Python/ceval.c
# 5. Generate that list: one "case N:" per opcode_targets.h line that mentions
#    _unknown_opcode, with N = line number - 2.
#    NOTE(review): assumes the line number minus 2 equals the opcode value --
#    verify against the layout of opcode_targets.h.
awk '/_unknown_opcode/{print "case " NR-2 ":"}' Python/opcode_targets.h > Python/opcode_unknown.h
Tested with Python 3.9.5 using LCC 1.25.15:
# Run from the top of the Python 3.9 source tree (requires GNU sed for -i and \n).

# add e2k arch: insert an __e2k__ branch (yielding e2k-linux-gnu) just before
# the existing __hppa__ branch in the configure script(s)
sed -i "/elif defined(__hppa__)/i\\\n# elif defined (__e2k__)\n        e2k-linux-gnu" configure*
# unsupported profiling option: strip it from the configure script(s)
sed -i "s| -fprofile-correction||" configure*
# LCC profiling bug workaround: on the line that follows the "Modules/_math.o:"
# rule header, filter the profile-generation flag out of the compile flags
sed -i "/^Modules\\/_math.o:/{n;s|\$(CCSHARED) \$(PY_CORE_CFLAGS)|\$(filter-out -fprofile-generate,\$(CCSHARED) \$(PY_CORE_CFLAGS))|}" Makefile.pre.in

# faster interpreter on Elbrus
# 1. At the first "#if USE_COMPUTED_GOTOS": drop that line and everything after
#    it up to (not including) the first line containing LLTRACE, then insert
#    definitions that force USE_COMPUTED_GOTOS to 0 and make TARGET(op) expand
#    to plain "op". The trailing ":a;n;ba" copies the rest of the file through
#    untouched, so only the first match is edited.
#    NOTE(review): the inserted "#if 1" presumably pairs with the #else/#endif
#    left over from the removed conditional -- confirm against ceval.c.
sed -i "/#if USE_COMPUTED_GOTOS/{:b;g;N;/LLTRACE/!bb;s|^|#undef USE_COMPUTED_GOTOS\n#define USE_COMPUTED_GOTOS 0\n#if 1\n#define TARGET(op) op|;:a;n;ba}" Python/ceval.c
# 2. Turn the computed-jump operand "*opcode_targets[opcode]" into the
#    switch_loop label.
sed -i "s|\\*opcode_targets\\[opcode\\]|switch_loop|" Python/ceval.c
# 3. Put the switch_loop label in front of the first "switch (opcode) {".
sed -i "/switch (opcode) {/{s|^|switch_loop:|;:a;n;ba}" Python/ceval.c
# 4. Two lines below each "_unknown_opcode:" label, append Py_UNREACHABLE()
#    and pull in the generated list of case labels.
sed -i "/_unknown_opcode:/{n;n;s|$|Py_UNREACHABLE();\n#include \"opcode_unknown.h\"|}" Python/ceval.c
# 5. Generate that list: one "case N:" per opcode_targets.h line that mentions
#    _unknown_opcode, with N = line number - 2.
#    NOTE(review): assumes the line number minus 2 equals the opcode value --
#    verify against the layout of opcode_targets.h.
awk '/_unknown_opcode/{print "case " NR-2 ":"}' Python/opcode_targets.h > Python/opcode_unknown.h
Tested with Python 3.10.0 using LCC 1.25.20:
# Run from the top of the Python 3.10 source tree (requires GNU sed for -i and \n).

# add e2k arch: insert an __e2k__ branch (yielding e2k-linux-gnu) just before
# the existing __hppa__ branch in the configure script(s)
sed -i "/elif defined(__hppa__)/i\\\n# elif defined (__e2k__)\n        e2k-linux-gnu" configure*
# unsupported profiling option: strip it from the configure script(s)
sed -i "s| -fprofile-correction||" configure*
# LCC profiling bug workaround: on the line that follows the "Modules/_math.o:"
# rule header, filter the profile-generation flag out of the compile flags
sed -i "/^Modules\\/_math.o:/{n;s|\$(CCSHARED) \$(PY_CORE_CFLAGS)|\$(filter-out -fprofile-generate-parallel,\$(CCSHARED) \$(PY_CORE_CFLAGS))|}" Makefile.pre.in
# use the parallel variant of the profile-generation flag in configure
sed -i "s|-fprofile-generate|-fprofile-generate-parallel|" configure*
# run "eprof -s eprof.sum" right after the profile-clean-stamp file is removed
sed -i "s/rm -f profile-clean-stamp/&; eprof -s eprof.sum/" Makefile.pre.in
# exclude hanging tests: put an unconditional skip decorator above
# test_stack_overflow, reusing the indentation of the "def" line so the
# decorator lines up with the method it decorates
sed -i "s/^\\([[:space:]]*\\)def test_stack_overflow(/\\1@unittest.skipIf(True, 'hangs')\n&/" Lib/test/test_faulthandler.py

# faster interpreter on Elbrus
# 1. Force USE_COMPUTED_GOTOS to 0 immediately before its first "#if" test;
#    the trailing ":a;n;ba" copies the rest of the file through untouched, so
#    only the first match is edited.
sed -i "/#if USE_COMPUTED_GOTOS/{s|^|#undef USE_COMPUTED_GOTOS\n#define USE_COMPUTED_GOTOS 0\n|;:a;n;ba}" Python/ceval.c
# 2. Two lines below each "_unknown_opcode:" label, append Py_UNREACHABLE()
#    and pull in the generated list of case labels.
sed -i "/_unknown_opcode:/{n;n;s|$|Py_UNREACHABLE();\n#include \"opcode_unknown.h\"|}" Python/ceval.c
# 3. Generate that list: one "case N:" per opcode_targets.h line that mentions
#    _unknown_opcode, with N = line number - 2.
#    NOTE(review): assumes the line number minus 2 equals the opcode value --
#    verify against the layout of opcode_targets.h.
awk '/_unknown_opcode/{print "case " NR-2 ":"}' Python/opcode_targets.h > Python/opcode_unknown.h