Skip to content

[SYCL][Driver]Enable multiple third-party targets for SYCL AOT. #18145

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: sycl
Choose a base branch
from
50 changes: 21 additions & 29 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5159,19 +5159,6 @@ class OffloadingActionBuilder final {
: DeviceActionBuilder(C, Args, Inputs, Action::OFK_SYCL, OAB),
SYCLInstallation(C.getDriver()) {}

/// Invoke \p Op exactly once with the bound architecture recorded for \p TC.
///
/// GpuArchList is scanned in order for the first entry whose triple equals
/// TC->getTriple(). When that entry carries a non-null arch, \p Op receives
/// the result of Args.MakeArgString on it; when the arch is null, or when no
/// entry matches the toolchain, \p Op receives nullptr.
void withBoundArchForToolChain(const ToolChain *TC,
                               llvm::function_ref<void(const char *)> Op) {
  const char *BoundArch = nullptr;
  for (auto &TripleAndArch : GpuArchList) {
    if (!(TC->getTriple() == TripleAndArch.first))
      continue;
    // Only the first matching entry is considered.
    if (TripleAndArch.second)
      BoundArch = Args.MakeArgString(TripleAndArch.second);
    break;
  }
  // BoundArch is still null here when no entry matched this toolchain.
  Op(BoundArch);
}

void pushForeignAction(Action *A) override {
// Accept a foreign action from the CudaActionBuilder for compiling CUDA
// sources
Expand Down Expand Up @@ -5454,10 +5441,13 @@ class OffloadingActionBuilder final {
return;

OffloadAction::DeviceDependences Dep;
withBoundArchForToolChain(ToolChains.front(), [&](const char *BoundArch) {
Dep.add(*SYCLLinkBinary, *ToolChains.front(), BoundArch,
Action::OFK_SYCL);
});
for (auto &TripleAndArchPair : GpuArchList) {
if (ToolChains.front()->getTriple() == TripleAndArchPair.first) {
Dep.add(*SYCLLinkBinary, *ToolChains.front(),
TripleAndArchPair.second, Action::OFK_SYCL);
}
}

AL.push_back(C.MakeAction<OffloadAction>(Dep, SYCLLinkBinary->getType()));
SYCLLinkBinary = nullptr;
}
Expand Down Expand Up @@ -5901,9 +5891,7 @@ class OffloadingActionBuilder final {
}
if (SkipWrapper) {
// Wrapper step not requested.
withBoundArchForToolChain(TC, [&](const char *BoundArch) {
addDeps(WrapperInputs.front(), TC, BoundArch);
});
addDeps(WrapperInputs.front(), TC, BoundArch);
continue;
}

Expand All @@ -5917,9 +5905,7 @@ class OffloadingActionBuilder final {
BoundArch != nullptr);
addDeps(DeviceWrappingAction, TC, AddBA ? BoundArch : nullptr);
} else {
withBoundArchForToolChain(TC, [&](const char *BoundArch) {
addDeps(DeviceWrappingAction, TC, BoundArch);
});
addDeps(DeviceWrappingAction, TC, BoundArch);
}
}
}
Expand Down Expand Up @@ -6440,14 +6426,20 @@ class OffloadingActionBuilder final {
SYCLTargetInfoList.emplace_back(*TCIt, nullptr);
} else {
const char *OffloadArch = nullptr;
for (auto &A : GpuArchList) {
if (TT == A.first) {
OffloadArch = A.second;
break;
for (auto &TargetTripleArchPair : GpuArchList) {
if (TT == TargetTripleArchPair.first) {
OffloadArch = TargetTripleArchPair.second;
// Add an arch to the SYCLTargetInfoList
// only if it is not already present in the list.
auto Arch = llvm::find_if(
SYCLTargetInfoList, [&](auto &DeviceTargetInfo) {
return OffloadArch == DeviceTargetInfo.BoundArch;
});

if (Arch == SYCLTargetInfoList.end())
SYCLTargetInfoList.emplace_back(*TCIt, OffloadArch);
}
}
assert(OffloadArch && "Failed to find matching arch.");
SYCLTargetInfoList.emplace_back(*TCIt, OffloadArch);
}
}
}
Expand Down
12 changes: 12 additions & 0 deletions clang/test/Driver/sycl-offload-new-driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,15 @@
// RUN: %clangxx -fsycl -### --offload-new-driver %s 2>&1 \
// RUN: | FileCheck -check-prefix CHECK_NO_DYNAMIC_LINKING %s
// CHECK_NO_DYNAMIC_LINKING-NOT: clang-linker-wrapper{{.*}} "-sycl-allow-device-image-dependencies"

// Check that -fsycl-targets correctly processes multiple NVIDIA
// and AMD GPU targets.
// RUN: %clang -### -fsycl -fsycl-targets=nvidia_gpu_sm_60,nvidia_gpu_sm_70 -nocudalib --offload-new-driver %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHK-MACRO-SM-60,CHK-MACRO-SM-70 %s
// CHK-MACRO-SM-60: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_NVIDIA_GPU_SM_60__"{{.*}}
// CHK-MACRO-SM-70: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_NVIDIA_GPU_SM_70__"{{.*}}
// RUN: %clang -### -fsycl -fsycl-targets=amd_gpu_gfx90a,amd_gpu_gfx90c -fno-sycl-libspirv -nogpulib --offload-new-driver %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHK-MACRO-GFX90A,CHK-MACRO-GFX90C %s
// CHK-MACRO-GFX90A: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_AMD_GPU_GFX90A__"{{.*}}
// CHK-MACRO-GFX90C: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_AMD_GPU_GFX90C__"{{.*}}

12 changes: 12 additions & 0 deletions clang/test/Driver/sycl-offload-old-model.c
Original file line number Diff line number Diff line change
Expand Up @@ -861,3 +861,15 @@
// FSYCL-PREVIEW-BREAKING-CHANGES-DEBUG-CHECK: --dependent-lib=sycl{{[0-9]*}}-previewd
// FSYCL-PREVIEW-BREAKING-CHANGES-DEBUG-CHECK-NOT: -defaultlib:sycl{{[0-9]*}}.lib
// FSYCL-PREVIEW-BREAKING-CHANGES-DEBUG-CHECK-NOT: -defaultlib:sycl{{[0-9]*}}-preview.lib

// Check that -fsycl-targets correctly processes multiple NVIDIA
// and AMD GPU targets.
// RUN: %clang -### -fsycl -fsycl-targets=nvidia_gpu_sm_60,nvidia_gpu_sm_70 -nocudalib --no-offload-new-driver %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHK-MACRO-SM-60,CHK-MACRO-SM-70 %s
// CHK-MACRO-SM-60: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_NVIDIA_GPU_SM_60__"{{.*}}
// CHK-MACRO-SM-70: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_NVIDIA_GPU_SM_70__"{{.*}}
// RUN: %clang -### -fsycl -fsycl-targets=amd_gpu_gfx90a,amd_gpu_gfx90c -fno-sycl-libspirv -nogpulib --no-offload-new-driver %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHK-MACRO-GFX90A,CHK-MACRO-GFX90C %s
// CHK-MACRO-GFX90A: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_AMD_GPU_GFX90A__"{{.*}}
// CHK-MACRO-GFX90C: clang{{.*}} "-fsycl-is-device"{{.*}} "-D__SYCL_TARGET_AMD_GPU_GFX90C__"{{.*}}

4 changes: 2 additions & 2 deletions sycl/doc/UsersManual.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ and not recommended to use in production environment.
currently overrides all the other specified SYCL targets when enabled.)

Special target values specific to Intel, NVIDIA and AMD Processor Graphics
support are accepted, providing a streamlined interface for AOT. Only one of
these values at a time is supported.
support are accepted, providing a streamlined interface for AOT.
A comma-separated list of valid Intel, NVIDIA and AMD Processor Graphics values is supported.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the behavior if user combines "special" target triples with "generic" target triples defined above?

I.e. something like -fsycl-targets=spir64_gen,intel_gpu_ptl_u?

Copy link
Contributor Author

@srividya-sundaram srividya-sundaram Apr 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AlexeySachkov

Ideally, for -fsycl-targets=spir64_gen,intel_gpu_ptl_u, we would provide the target GPU device for the spir64_gen AOT target via
-Xsycl-target-backend=spir64_gen "-device <device-name>".

For the example -fsycl-targets=spir64_gen,intel_gpu_ptl_u, ptl_u is passed as the target device for the SYCL device compilation and the ocloc call. Since no specific GPU device is provided for the spir64_gen AOT target, no device info is passed to the SYCL device compilation step or the ocloc call for that target.

Example:
device = ptl_u

"clang-20" "-cc1" "-triple" "spir64_gen-unknown-unknown" "-fsycl-is-device" "-D__SYCL_TARGET_INTEL_GPU_PTL_U__" ...
"ocloc"  ...."-output_no_suffix" "-spirv_input" "-device" "ptl_u"

No device info passed (fsycl-targets=spir64_gen)

"clang-20" "-cc1" "-triple" "spir64_gen-unknown-unknown" "-fsycl-is-device" ....
"ocloc" .... "-output_no_suffix" "-spirv_input"

* intel_gpu_ptl_u, intel_gpu_30_1_1 - Panther Lake U Intel graphics architecture
* intel_gpu_ptl_h, intel_gpu_30_0_4 - Panther Lake H Intel graphics architecture
* intel_gpu_lnl_m, intel_gpu_20_4_4 - Lunar Lake Intel graphics architecture
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// REQUIRES: hip
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx906 %s -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
// RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -mllvm --amdgpu-oclc-unsafe-fp-atomics=true -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE-FP
// RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa %s -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SAFE
// RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa %s -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE
// RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa %s -mllvm --amdgpu-oclc-unsafe-fp-atomics=true -mllvm --amdgpu-oclc-unsafe-int-atomics=true -S -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-UNSAFE-FP
Comment on lines +2 to +4
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the idea is that these values must be aligned with values from sycl_ext_oneapi_device_architecture (link).
Change in this PR can confuse our customers as there is no "amdgcn-amd-amdhsa" in sycl_ext_oneapi_device_architecture

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dm-vodopyanov

The test files were updated because these lines from lit.cfg.py were adding "-Xsycl-target-backend=amdgcn-amd-amdhsa", "--offload-arch=gfx906" options in addition to the existing driver options in the RUN command in the test files.

This resulted in either:

  1. two similar targets, causing a duplication error (error: Duplicate targets are not allowed), or
  2. two different targets, causing: clang: error: cannot specify -o when generating multiple output files

Additionally, all the matrix related tests have joint_matrix usage, where the joint_matrix API is only supported by the Intel, CUDA and HIP (GFX90A) backends.

Hence I updated the default AMD GPU device to be GFX90A.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, thanks for explanation, LGTM

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't we just drop that lit.cfg.py hardcode for --offload-arch in favor of -fsycl-targets?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I removed the hardcoded check, about 60 tests failed with the error: clang: error: missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=<arch-name>'.
Hence they were not removed.


#include <sycl/sycl.hpp>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// REQUIRES: hip
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amdgcn-amd-amdhsa -S -Xclang -emit-llvm %s -o -| FileCheck %s

#include <sycl/sycl.hpp>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// REQUIRES: hip
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amdgcn-amd-amdhsa -S -Xclang -emit-llvm %s -o -| FileCheck %s

#include <sycl/sycl.hpp>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// REQUIRES: hip
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amdgcn-amd-amdhsa -S -Xclang -emit-llvm %s -o -| FileCheck %s

#include <sycl/sycl.hpp>

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// REQUIRES: hip
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amd_gpu_gfx90a -S -Xclang -emit-llvm %s -o -| FileCheck %s
// RUN: %clangxx -fsycl-device-only -fsycl-targets=amdgcn-amd-amdhsa -S -Xclang -emit-llvm %s -o -| FileCheck %s

#include <sycl/sycl.hpp>

Expand Down
4 changes: 2 additions & 2 deletions sycl/test/lit.cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,10 @@
# For AMD the specific GPU has to be specified with --offload-arch
if not any([f.startswith("--offload-arch") for f in additional_flags]):
# If the offload arch wasn't specified in SYCL_CLANG_EXTRA_FLAGS,
# hardcode it to gfx906, this is fine because only compiler tests
# hardcode it to gfx90a, this is fine because only compiler tests
additional_flags += [
"-Xsycl-target-backend=amdgcn-amd-amdhsa",
"--offload-arch=gfx906",
"--offload-arch=gfx90a",
]

config.sycl_headers_filter = lit_config.params.get("SYCL_HEADERS_FILTER", None)
Expand Down