NVIDIA · juney-nvidia · Jun 13, 2025 · May 29, 2025 · Jun 10, 2025 · Jun 10, 2025
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -43,6 +43,14 @@ option(ENABLE_MULTI_DEVICE
        "Enable building with multi device support (requires NCCL, MPI,...)" ON)
 option(ENABLE_UCX "Enable building with UCX (Uniform Communication X) support"
        ON)
+option(USING_OSS_CUTLASS_LOW_LATENCY_GEMM
+       "Using open sourced Cutlass low latency gemm kernel" ON)
+option(USING_OSS_CUTLASS_FP4_GEMM "Using open sourced Cutlass fp4 gemm kernel"
+       ON)
+option(USING_OSS_CUTLASS_MOE_GEMM "Using open sourced Cutlass moe gemm kernel"
+       ON)
+option(USING_OSS_CUTLASS_ALLREDUCE_GEMM
+       "Using open sourced Cutlass AR gemm kernel" ON)
 
 if(NVTX_DISABLE)
   add_compile_definitions("NVTX_DISABLE")

diff --git a/cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh b/cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh
@@ -1,21 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License.
- * You may obtain a copy of the License at
- *
- * https://opensource.org/licenses/MIT
- *
- *
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/cpp/include/tensorrt_llm/deep_gemm/mma_utils.cuh b/cpp/include/tensorrt_llm/deep_gemm/mma_utils.cuh
@@ -1,21 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License.
- * You may obtain a copy of the License at
- *
- * https://opensource.org/licenses/MIT
- *
- *
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh b/cpp/include/tensorrt_llm/deep_gemm/scheduler.cuh
@@ -1,21 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License.
- * You may obtain a copy of the License at
- *
- * https://opensource.org/licenses/MIT
- *
- *
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh b/cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh
@@ -1,21 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License.
- * You may obtain a copy of the License at
- *
- * https://opensource.org/licenses/MIT
- *
- *
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/cpp/include/tensorrt_llm/deep_gemm/utils.cuh b/cpp/include/tensorrt_llm/deep_gemm/utils.cuh
@@ -1,21 +1,11 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025 DeepSeek
- * SPDX-License-Identifier: MIT
- *
- * Licensed under the MIT License.
- * You may obtain a copy of the License at
- *
- * https://opensource.org/licenses/MIT
- *
- *
- * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
- * http://www.apache.org/licenses/LICENSE-2.0
+ *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,

diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu
@@ -937,7 +937,7 @@ CUBIN_EXPORT __global__
 #endif
 
             __syncwarp();
-            // the release semantics of arrive does not work for async consumers like gmma/utcmma. additional fence is
+            // the release semantics of arrive does not work for async consumers like gmma. additional fence is
             // needed.
             asm volatile("fence.proxy.async.shared::cta;\n");
             unused(xBar.produced.arrive());
@@ -1298,7 +1298,7 @@ CUBIN_EXPORT __global__
             smem.qBar.consumed.arrive_and_wait();
             QCvt::store(threadIdx.x, smem.q, f16QData);
 #endif
-            // the release semantics of arrive does not work for async consumers like gmma/utcmma. additional fence is
+            // the release semantics of arrive does not work for async consumers like gmma. additional fence is
             // needed.
             asm volatile("fence.proxy.async.shared::cta;\n");
             unused(smem.qBar.produced.arrive());

diff --git a/cpp/micro_benchmarks/CMakeLists.txt b/cpp/micro_benchmarks/CMakeLists.txt
@@ -50,5 +50,12 @@ function(add_benchmark test_name test_src)
   add_dependencies(micro_benchmarks ${test_name})
 endfunction()
 
+# Currently only supports the internal-cutlass lib version
 add_benchmark(mixtureOfExpertsBackendBenchmark
               mixtureOfExpertsBackendBenchmarkLauncher.cu)
+# Temporary open-sourced version. Will be deleted when open-sourced moe_gemm
+# supports MXFP4
+if(USING_OSS_CUTLASS_MOE_GEMM)
+  add_benchmark(mixtureOfExpertsBackendBenchmarkOss
+                mixtureOfExpertsBackendBenchmarkLauncherOss.cu)
+endif()