@@ -150,6 +150,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf);
150
150
151
151
static constexpr uint32_t mul_mat_vec_max_cols = 8;
152
152
153
// GPU architecture families distinguished by get_device_architecture().
enum vk_device_architecture {
    // Non-AMD vendor, an AMD device missing one of the extensions needed for detection,
    // or an AMD device whose subgroup size range matches none of the cases below.
    OTHER = 0,
    // AMD device reporting min and max subgroup size of 64.
    AMD_GCN,
    // AMD device with subgroup sizes 32..64 and wavefrontsPerSimd == 20.
    AMD_RDNA1,
    // AMD device with subgroup sizes 32..64 that matches neither the RDNA1 nor the RDNA3 check.
    AMD_RDNA2,
    // AMD device with subgroup sizes 32..64 that reports accelerated 4x8-bit packed
    // mixed-signedness integer dot product.
    AMD_RDNA3,
};
160
+
161
+ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
162
+ vk::PhysicalDeviceProperties props = device.getProperties();
163
+
164
+ if (props.vendorID == VK_VENDOR_ID_AMD) {
165
+ const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
166
+
167
+ bool amd_shader_core_properties = false;
168
+ bool integer_dot_product = false;
169
+ bool subgroup_size_control = false;
170
+
171
+ for (const auto& properties : ext_props) {
172
+ if (strcmp("VK_AMD_shader_core_properties", properties.extensionName) == 0) {
173
+ amd_shader_core_properties = true;
174
+ } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) {
175
+ integer_dot_product = true;
176
+ } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
177
+ subgroup_size_control = true;
178
+ }
179
+ }
180
+
181
+ if (!amd_shader_core_properties || !integer_dot_product || !subgroup_size_control) {
182
+ return vk_device_architecture::OTHER;
183
+ }
184
+
185
+ vk::PhysicalDeviceProperties2 props2;
186
+ vk::PhysicalDeviceShaderCorePropertiesAMD shader_core_props_amd;
187
+ vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props;
188
+ vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
189
+
190
+ props2.pNext = &shader_core_props_amd;
191
+ shader_core_props_amd.pNext = &integer_dot_props;
192
+ integer_dot_props.pNext = &subgroup_size_control_props;
193
+
194
+ device.getProperties2(&props2);
195
+
196
+ if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 64) {
197
+ return vk_device_architecture::AMD_GCN;
198
+ }
199
+ if (subgroup_size_control_props.maxSubgroupSize == 64 && subgroup_size_control_props.minSubgroupSize == 32) {
200
+ // RDNA
201
+ if (shader_core_props_amd.wavefrontsPerSimd == 20) {
202
+ return vk_device_architecture::AMD_RDNA1;
203
+ }
204
+ if (integer_dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated) {
205
+ return vk_device_architecture::AMD_RDNA3;
206
+ }
207
+ return vk_device_architecture::AMD_RDNA2;
208
+ }
209
+ }
210
+ return vk_device_architecture::OTHER;
211
+ }
212
+
153
213
struct vk_device_struct {
154
214
std::mutex mutex;
155
215
@@ -162,6 +222,7 @@ struct vk_device_struct {
162
222
bool pipeline_robustness;
163
223
vk::Device device;
164
224
uint32_t vendor_id;
225
+ vk_device_architecture architecture;
165
226
vk_queue compute_queue;
166
227
vk_queue transfer_queue;
167
228
bool single_queue;
@@ -1448,6 +1509,73 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
1448
1509
return supported;
1449
1510
}
1450
1511
1512
+ struct GpuPipelineConfig {
1513
+ // GPU architecture identifier.
1514
+ // Example: vk_device_architecture::AMD_GCN
1515
+ vk_device_architecture arch;
1516
+
1517
+ // Mapping of pipeline names to their specific subgroup sizes.
1518
+ // Example: {"soft_max_f32", 64}
1519
+ std::unordered_map<std::string, uint32_t> pipelines;
1520
+
1521
+ // Default subgroup size for this GPU.
1522
+ // Defaults to 0 if not explicitly provided.
1523
+ uint32_t default_subgroup_size = 0;
1524
+ };
1525
+
1526
// Subgroup size overrides for RDNA1 GPUs, keyed by pipeline name.
static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
    { "soft_max",            64 },
    { "im2col",              64 },
    { "argmax",              64 },
    { "mul_mat_vec",         64 },
    { "mul_mat_vec_f16",     32 },
    { "mul_mat_vec_f32_f16", 32 },
};

// Subgroup size overrides for RDNA2 GPUs, keyed by pipeline name.
static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
    { "soft_max", 64 },
    { "im2col",   64 },
};

// Subgroup size used as the per-architecture default in the RDNA configurations.
static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
1539
+
1540
+ // Define configurations for different GPUs.
1541
+ static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
1542
+ {
1543
+ vk_device_architecture::AMD_RDNA1,
1544
+ {
1545
+ rdna1_pipelines,
1546
+ },
1547
+ RDNA_DEFAULT_SUBGROUP_SIZE
1548
+ },
1549
+ {
1550
+ vk_device_architecture::AMD_RDNA2,
1551
+ {
1552
+ rdna2_pipelines,
1553
+ },
1554
+ RDNA_DEFAULT_SUBGROUP_SIZE
1555
+ },
1556
+ };
1557
+
1558
+ static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {
1559
+ for (const auto &config : gpu_pipeline_configs) {
1560
+ if (config.arch == arch) {
1561
+ auto pipIt = config.pipelines.find(pipeline_name);
1562
+ if (pipIt != config.pipelines.end()) {
1563
+ return pipIt->second;
1564
+ }
1565
+ std::vector<std::pair<std::string, uint32_t>> sorted_pipelines(config.pipelines.begin(), config.pipelines.end());
1566
+ std::sort(sorted_pipelines.begin(), sorted_pipelines.end(),
1567
+ [](const auto &a, const auto &b) { return a.first.size() > b.first.size(); });
1568
+ for (const auto &entry : sorted_pipelines) {
1569
+ if (pipeline_name.find(entry.first) != std::string::npos) {
1570
+ return entry.second;
1571
+ }
1572
+ }
1573
+ return config.default_subgroup_size;
1574
+ }
1575
+ }
1576
+ return 0; // If no matching configuration is found
1577
+ }
1578
+
1451
1579
static void ggml_vk_load_shaders(vk_device& device) {
1452
1580
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
1453
1581
@@ -1574,6 +1702,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
1574
1702
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
1575
1703
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
1576
1704
1705
+ if (!require_full_subgroups && required_subgroup_size == 0) {
1706
+ required_subgroup_size = get_subgroup_size(name, device->architecture);
1707
+ }
1708
+
1577
1709
if (!pipeline) {
1578
1710
pipeline = std::make_shared<vk_pipeline_struct>();
1579
1711
pipeline->name = name;
@@ -2250,7 +2382,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
2250
2382
device->need_compiles = false;
2251
2383
}
2252
2384
2253
- static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
2385
+ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch );
2254
2386
2255
2387
static vk_device ggml_vk_get_device(size_t idx) {
2256
2388
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
@@ -2279,6 +2411,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
2279
2411
device->physical_device = physical_devices[dev_num];
2280
2412
const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
2281
2413
2414
+ device->architecture = get_device_architecture(device->physical_device);
2415
+
2282
2416
const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
2283
2417
device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
2284
2418
@@ -2291,7 +2425,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
2291
2425
bool coopmat2_support = false;
2292
2426
device->coopmat_support = false;
2293
2427
2294
- // Check if maintenance4 is supported
2295
2428
for (const auto& properties : ext_props) {
2296
2429
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
2297
2430
maintenance4_support = true;
@@ -2404,7 +2537,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
2404
2537
2405
2538
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
2406
2539
2407
- if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
2540
+ if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture )) {
2408
2541
device->coopmat_support = false;
2409
2542
}
2410
2543
@@ -2782,7 +2915,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
2782
2915
subgroup_props.pNext = &driver_props;
2783
2916
physical_device.getProperties2(&props2);
2784
2917
2785
- const size_t subgroup_size = subgroup_props.subgroupSize;
2918
+ vk_device_architecture arch = get_device_architecture(physical_device);
2919
+ uint32_t default_subgroup_size = get_subgroup_size("", arch);
2920
+ const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
2921
+
2786
2922
const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
2787
2923
2788
2924
bool fp16_storage = false;
@@ -2808,7 +2944,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
2808
2944
}
2809
2945
}
2810
2946
2811
- if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
2947
+ const vk_device_architecture device_architecture = get_device_architecture(physical_device);
2948
+
2949
+ if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) {
2812
2950
coopmat_support = false;
2813
2951
}
2814
2952
@@ -8843,18 +8981,15 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
8843
8981
UNUSED(instance_extensions);
8844
8982
}
8845
8983
8846
- static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
8984
+ static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch ) {
8847
8985
switch (props.vendorID) {
8848
8986
case VK_VENDOR_ID_INTEL:
8849
8987
// Intel drivers don't support coopmat properly yet
8850
8988
return false;
8851
8989
case VK_VENDOR_ID_AMD:
8852
8990
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
8853
8991
// Workaround for AMD proprietary driver reporting support on all GPUs
8854
- const std::string name = props.deviceName;
8855
- return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs
8856
- name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
8857
- name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs
8992
+ return arch == vk_device_architecture::AMD_RDNA3;
8858
8993
}
8859
8994
return true;
8860
8995
default:
0 commit comments