Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ option(ENABLE_MULTI_DEVICE
"Enable building with multi device support (requires NCCL, MPI,...)" ON)
option(ENABLE_UCX "Enable building with UCX (Uniform Communication X) support"
ON)
# Build switches for the open-sourced CUTLASS GEMM kernels (one per kernel
# family, as described by each help string). All of them default to ON.
option(
  USING_OSS_CUTLASS_LOW_LATENCY_GEMM
  "Using open sourced Cutlass low latency gemm kernel"
  ON)
option(
  USING_OSS_CUTLASS_FP4_GEMM
  "Using open sourced Cutlass fp4 gemm kernel"
  ON)
option(
  USING_OSS_CUTLASS_MOE_GEMM
  "Using open sourced Cutlass moe gemm kernel"
  ON)
option(
  USING_OSS_CUTLASS_ALLREDUCE_GEMM
  "Using open sourced Cutlass AR gemm kernel"
  ON)

if(NVTX_DISABLE)
add_compile_definitions("NVTX_DISABLE")
Expand Down
14 changes: 2 additions & 12 deletions cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
* SPDX-License-Identifier: MIT
*
* Licensed under the MIT License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/MIT
*
*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
14 changes: 2 additions & 12 deletions cpp/include/tensorrt_llm/deep_gemm/mma_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
* SPDX-License-Identifier: MIT
*
* Licensed under the MIT License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/MIT
*
*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
14 changes: 2 additions & 12 deletions cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
* SPDX-License-Identifier: MIT
*
* Licensed under the MIT License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/MIT
*
*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
14 changes: 2 additions & 12 deletions cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
* SPDX-License-Identifier: MIT
*
* Licensed under the MIT License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/MIT
*
*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
14 changes: 2 additions & 12 deletions cpp/include/tensorrt_llm/deep_gemm/utils.cuh
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
* SPDX-License-Identifier: MIT
*
* Licensed under the MIT License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/MIT
*
*
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
Expand Down
4 changes: 2 additions & 2 deletions cpp/kernels/xqa/mha_sm90.cu
Original file line number Diff line number Diff line change
Expand Up @@ -937,7 +937,7 @@ CUBIN_EXPORT __global__
#endif

__syncwarp();
// the release semantics of arrive does not work for async consumers like gmma/utcmma. additional fence is
// the release semantics of arrive does not work for async consumers like gmma. additional fence is
// needed.
asm volatile("fence.proxy.async.shared::cta;\n");
unused(xBar.produced.arrive());
Expand Down Expand Up @@ -1298,7 +1298,7 @@ CUBIN_EXPORT __global__
smem.qBar.consumed.arrive_and_wait();
QCvt::store(threadIdx.x, smem.q, f16QData);
#endif
// the release semantics of arrive does not work for async consumers like gmma/utcmma. additional fence is
// the release semantics of arrive does not work for async consumers like gmma. additional fence is
// needed.
asm volatile("fence.proxy.async.shared::cta;\n");
unused(smem.qBar.produced.arrive());
Expand Down
7 changes: 7 additions & 0 deletions cpp/micro_benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,12 @@ function(add_benchmark test_name test_src)
add_dependencies(micro_benchmarks ${test_name})
endfunction()

# Currently only the internal-cutlass library version is supported.
add_benchmark(
  mixtureOfExpertsBackendBenchmark
  mixtureOfExpertsBackendBenchmarkLauncher.cu)

# Temporary open-sourced version. It will be deleted once the open-sourced
# moe_gemm supports MXFP4. Registered only when USING_OSS_CUTLASS_MOE_GEMM is
# enabled.
if(USING_OSS_CUTLASS_MOE_GEMM)
  add_benchmark(
    mixtureOfExpertsBackendBenchmarkOss
    mixtureOfExpertsBackendBenchmarkLauncherOss.cu)
endif()
Loading