Skip to content
This repository was archived by the owner on Nov 17, 2023. It is now read-only.
This repository was archived by the owner on Nov 17, 2023. It is now read-only.

CI slowdown on CentOS CPU related to OpenMP and intgemm #19502

@kpuatamazon

Description

@kpuatamazon

Description

intgemm is a third-party library that I wrote; it is included in MXNet as a submodule. After it was added, unrelated continuous integration tests became slower on CentOS 7 CPU. This has been discussed in the comments on 1393602#commitcomment-43224930.

After losing several hairs, I traced the issue to the OpenMP support in intgemm's CMakeLists.txt:

https://github.com/kpu/intgemm/blob/8f28282c3bd854922da638024d2659be52e892e9/CMakeLists.txt#L47-L56

I think this is causing MXNet to use the slow CentOS OpenMP instead of the bundled support.

Question

What's the best practice for a standalone library that has its own OpenMP support to not step on MXNet's internal support?

To Reproduce

Start with a c5.18xlarge instance running the latest AL2 machine learning image.

To build:

# Always delete the build directory.  This is sneaky and appears to survive.  
rm -rf build; ci/build.py --docker-registry mxnetci --platform centos7_cpu --docker-build-retries 3 --shm-size 500m /work/runtime_functions.sh build_centos7_cpu

To run:

#Running
docker run --cap-add SYS_PTRACE --rm --shm-size=500m -v $HOME/incubator-mxnet:/work/mxnet -v $HOME/incubator-mxnet/build:/work/build -v $HOME/.ccache:/work/ccache -u 1001:1001 -e CCACHE_MAXSIZE=500G -e CCACHE_TEMPDIR=/tmp/ccache -e CCACHE_DIR=/work/ccache -e CCACHE_LOGFILE=/tmp/ccache.log -ti mxnetci/build.centos7_cpu:latest bash
CI_CUDA_COMPUTE_CAPABILITIES='-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_70,code=sm_70'
CI_CMAKE_CUDA_ARCH='5.2 7.0'
set +x
source /opt/rh/rh-python36/enable
export PATH=/opt/rh/rh-python36/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
PATH=/opt/rh/rh-python36/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
export LD_LIBRARY_PATH=/opt/rh/rh-python36/root/usr/lib64
LD_LIBRARY_PATH=/opt/rh/rh-python36/root/usr/lib64
export MANPATH=/opt/rh/rh-python36/root/usr/share/man:
MANPATH=/opt/rh/rh-python36/root/usr/share/man:
export PKG_CONFIG_PATH=/opt/rh/rh-python36/root/usr/lib64/pkgconfig
PKG_CONFIG_PATH=/opt/rh/rh-python36/root/usr/lib64/pkgconfig
export XDG_DATA_DIRS=/opt/rh/rh-python36/root/usr/share:/usr/local/share:/usr/share
XDG_DATA_DIRS=/opt/rh/rh-python36/root/usr/share:/usr/local/share:/usr/share
cd /work/mxnet
nproc
expr 72 / 4
OMP_NUM_THREADS=18
python -m pytest --verbose tests/python/unittest/test_gluon.py::test_slice_pooling2d_slice_pooling2d

Repeat the above steps on master, and then again with the following lines commented out in 3rdparty/intgemm/CMakeLists.txt:

#option(USE_OPENMP "Use OpenMP" OFF)
#if (USE_OPENMP)
#  message(STATUS "Compiling with OpenMP")
#  find_package(OpenMP)
#  if (NOT ${OpenMP_CXX_FOUND})
#    message(SEND_ERROR "OpenMP requested but C++ support not found")
#  endif()
#  add_compile_options(${OpenMP_CXX_FLAGS})
#  target_link_libraries(intgemm PUBLIC OpenMP::OpenMP_CXX)
#endif()

On a c5.18xlarge, the test takes about 548.22s with the master version, but only about 58.87s with the version that has these lines commented out.

cc @mseth10 @access2rohit @leezu

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions