-
Notifications
You must be signed in to change notification settings - Fork 6.8k
CI slowdown on CentOS CPU related to OpenMP and intgemm #19502
Description
intgemm is a 3rd-party library written by me and included as a submodule. After it was added, unrelated continuous integration tests became slower on CentOS 7 CPU. This has been discussed in the comments on 1393602#commitcomment-43224930.
After losing several hairs, the issue appears to be the OpenMP support in intgemm's CMakeLists.txt:
https://github.com/kpu/intgemm/blob/8f28282c3bd854922da638024d2659be52e892e9/CMakeLists.txt#L47-L56
I think this is causing MXNet to use the slow CentOS OpenMP instead of the bundled support.
Question
What's the best practice for a standalone library that has its own OpenMP support to not step on MXNet's internal support?
To Reproduce
Started with a c5.18xlarge instance running the latest AL2 machine learning image.
To build:
# Always delete the build directory. This is sneaky and appears to survive.
rm -rf build; ci/build.py --docker-registry mxnetci --platform centos7_cpu --docker-build-retries 3 --shm-size 500m /work/runtime_functions.sh build_centos7_cpu
To run:
#Running
docker run --cap-add SYS_PTRACE --rm --shm-size=500m -v $HOME/incubator-mxnet:/work/mxnet -v $HOME/incubator-mxnet/build:/work/build -v $HOME/.ccache:/work/ccache -u 1001:1001 -e CCACHE_MAXSIZE=500G -e CCACHE_TEMPDIR=/tmp/ccache -e CCACHE_DIR=/work/ccache -e CCACHE_LOGFILE=/tmp/ccache.log -ti mxnetci/build.centos7_cpu:latest bash
CI_CUDA_COMPUTE_CAPABILITIES='-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70'
CI_CMAKE_CUDA_ARCH='5.2 7.0'
set +x
source /opt/rh/rh-python36/enable
export PATH=/opt/rh/rh-python36/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
PATH=/opt/rh/rh-python36/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
export LD_LIBRARY_PATH=/opt/rh/rh-python36/root/usr/lib64
LD_LIBRARY_PATH=/opt/rh/rh-python36/root/usr/lib64
export MANPATH=/opt/rh/rh-python36/root/usr/share/man:
MANPATH=/opt/rh/rh-python36/root/usr/share/man:
export PKG_CONFIG_PATH=/opt/rh/rh-python36/root/usr/lib64/pkgconfig
PKG_CONFIG_PATH=/opt/rh/rh-python36/root/usr/lib64/pkgconfig
export XDG_DATA_DIRS=/opt/rh/rh-python36/root/usr/share:/usr/local/share:/usr/share
XDG_DATA_DIRS=/opt/rh/rh-python36/root/usr/share:/usr/local/share:/usr/share
cd /work/mxnet
nproc
expr 72 / 4
OMP_NUM_THREADS=18
python -m pytest --verbose tests/python/unittest/test_gluon.py::test_slice_pooling2d_slice_pooling2d
Repeat the above steps for master, and then again with these lines commented out in 3rdparty/intgemm/CMakeLists.txt:
#option(USE_OPENMP "Use OpenMP" OFF)
#if (USE_OPENMP)
# message(STATUS "Compiling with OpenMP")
# find_package(OpenMP)
# if (NOT ${OpenMP_CXX_FOUND})
# message(SEND_ERROR "OpenMP requested but C++ support not found")
# endif()
# add_compile_options(${OpenMP_CXX_FLAGS})
# target_link_libraries(intgemm PUBLIC OpenMP::OpenMP_CXX)
#endif()
The master version takes about 548.22s on a c5.18xlarge, while the version with those lines commented out takes about 58.87s.