Skip to content

Commit cf2270e

Browse files
daniandtheweb and 0cc4m authored
vulkan: subgroup size tuning (#12087)
* vulkan: subgroup size test
* Vulkan: Add device architecture enum and logic to recognize AMD generations
* vulkan: use new architecture logic to specify subgroup size
* Initial vulkan subgroup size tuning for RDNA3
* vulkan: commonize RDNA subgroup tuning
* vulkan: override subgroup size if required_subgroup_size = 0
* vulkan: disable warp 32 for RDNA3
* vulkan: fine tuned RDNA1 subgroup sizes
* vulkan: adjusted subgroup size map
* vulkan: fixed RDNA2 subgroup map
---------
Co-authored-by: 0cc4m <[email protected]>
1 parent f07690c commit cf2270e

File tree

1 file changed

+145
-10
lines changed

1 file changed

+145
-10
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

+145-10
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,66 @@ static void ggml_vk_destroy_buffer(vk_buffer& buf);
150150

151151
static constexpr uint32_t mul_mat_vec_max_cols = 8;
152152

153+
// Coarse GPU architecture classes produced by get_device_architecture().
// The enumerator order is part of the contract: values are implicitly 0..4.
enum vk_device_architecture {
    OTHER,      // non-AMD devices, and AMD devices that could not be classified
    AMD_GCN,    // AMD device whose min and max subgroup size are both 64
    AMD_RDNA1,  // AMD device with subgroup range 32..64 and wavefrontsPerSimd == 20
    AMD_RDNA2,  // AMD device with subgroup range 32..64 matching neither RDNA1 nor RDNA3
    AMD_RDNA3,  // AMD device with subgroup range 32..64 and accelerated mixed-sign 4x8 packed dot product
};
160+
161+
// Classifies a physical device into a vk_device_architecture bucket.
//
// Only devices whose vendorID is VK_VENDOR_ID_AMD are inspected further; every
// other vendor yields OTHER. For AMD devices the classification needs
// VK_AMD_shader_core_properties, VK_KHR_shader_integer_dot_product and
// VK_EXT_subgroup_size_control to all be advertised; if any is missing the
// result is OTHER as well.
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
    const vk::PhysicalDeviceProperties base_props = device.getProperties();

    if (base_props.vendorID != VK_VENDOR_ID_AMD) {
        return vk_device_architecture::OTHER;
    }

    const std::vector<vk::ExtensionProperties> available = device.enumerateDeviceExtensionProperties();

    bool has_shader_core_props = false;
    bool has_integer_dot       = false;
    bool has_subgroup_control  = false;

    // Extension names are distinct, so at most one flag changes per entry.
    for (const auto& ext : available) {
        has_shader_core_props = has_shader_core_props || strcmp("VK_AMD_shader_core_properties", ext.extensionName) == 0;
        has_integer_dot       = has_integer_dot       || strcmp("VK_KHR_shader_integer_dot_product", ext.extensionName) == 0;
        has_subgroup_control  = has_subgroup_control  || strcmp("VK_EXT_subgroup_size_control", ext.extensionName) == 0;
    }

    if (!(has_shader_core_props && has_integer_dot && has_subgroup_control)) {
        return vk_device_architecture::OTHER;
    }

    // Query all three extension property structs in one getProperties2 call by
    // chaining them: props2 -> shader core -> integer dot -> subgroup size control.
    vk::PhysicalDeviceProperties2 query;
    vk::PhysicalDeviceShaderCorePropertiesAMD core_props;
    vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR dot_props;
    vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_ctl;

    query.pNext      = &core_props;
    core_props.pNext = &dot_props;
    dot_props.pNext  = &subgroup_ctl;

    device.getProperties2(&query);

    // Every recognised architecture reports a maximum subgroup size of 64.
    if (subgroup_ctl.maxSubgroupSize != 64) {
        return vk_device_architecture::OTHER;
    }

    // Fixed 64-wide subgroups are classified as GCN.
    if (subgroup_ctl.minSubgroupSize == 64) {
        return vk_device_architecture::AMD_GCN;
    }

    if (subgroup_ctl.minSubgroupSize != 32) {
        return vk_device_architecture::OTHER;
    }

    // Subgroup range 32..64: the RDNA family. Generations are told apart by
    // wavefrontsPerSimd first, then by mixed-signedness dot product acceleration.
    if (core_props.wavefrontsPerSimd == 20) {
        return vk_device_architecture::AMD_RDNA1;
    }

    return dot_props.integerDotProduct4x8BitPackedMixedSignednessAccelerated
        ? vk_device_architecture::AMD_RDNA3
        : vk_device_architecture::AMD_RDNA2;
}
212+
153213
struct vk_device_struct {
154214
std::mutex mutex;
155215

@@ -162,6 +222,7 @@ struct vk_device_struct {
162222
bool pipeline_robustness;
163223
vk::Device device;
164224
uint32_t vendor_id;
225+
vk_device_architecture architecture;
165226
vk_queue compute_queue;
166227
vk_queue transfer_queue;
167228
bool single_queue;
@@ -1448,6 +1509,73 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
14481509
return supported;
14491510
}
14501511

1512+
struct GpuPipelineConfig {
1513+
// GPU architecture identifier.
1514+
// Example: vk_device_architecture::AMD_GCN
1515+
vk_device_architecture arch;
1516+
1517+
// Mapping of pipeline names to their specific subgroup sizes.
1518+
// Example: {"soft_max_f32", 64}
1519+
std::unordered_map<std::string, uint32_t> pipelines;
1520+
1521+
// Default subgroup size for this GPU.
1522+
// Defaults to 0 if not explicitly provided.
1523+
uint32_t default_subgroup_size = 0;
1524+
};
1525+
1526+
// Pipeline configuration for RDNA1 GPUs.
1527+
static const std::unordered_map<std::string, uint32_t> rdna1_pipelines = {
1528+
{"soft_max", 64}, {"im2col", 64},
1529+
{"argmax", 64}, {"mul_mat_vec", 64},
1530+
{"mul_mat_vec_f16", 32}, {"mul_mat_vec_f32_f16", 32}
1531+
};
1532+
1533+
// Pipeline configuration for RDNA2 GPUs.
1534+
static const std::unordered_map<std::string, uint32_t> rdna2_pipelines = {
1535+
{"soft_max", 64}, {"im2col", 64},
1536+
};
1537+
1538+
static constexpr uint32_t RDNA_DEFAULT_SUBGROUP_SIZE = 32;
1539+
1540+
// Define configurations for different GPUs.
1541+
static std::vector<GpuPipelineConfig> gpu_pipeline_configs = {
1542+
{
1543+
vk_device_architecture::AMD_RDNA1,
1544+
{
1545+
rdna1_pipelines,
1546+
},
1547+
RDNA_DEFAULT_SUBGROUP_SIZE
1548+
},
1549+
{
1550+
vk_device_architecture::AMD_RDNA2,
1551+
{
1552+
rdna2_pipelines,
1553+
},
1554+
RDNA_DEFAULT_SUBGROUP_SIZE
1555+
},
1556+
};
1557+
1558+
static uint32_t get_subgroup_size(const std::string &pipeline_name, const vk_device_architecture &arch) {
1559+
for (const auto &config : gpu_pipeline_configs) {
1560+
if (config.arch == arch) {
1561+
auto pipIt = config.pipelines.find(pipeline_name);
1562+
if (pipIt != config.pipelines.end()) {
1563+
return pipIt->second;
1564+
}
1565+
std::vector<std::pair<std::string, uint32_t>> sorted_pipelines(config.pipelines.begin(), config.pipelines.end());
1566+
std::sort(sorted_pipelines.begin(), sorted_pipelines.end(),
1567+
[](const auto &a, const auto &b) { return a.first.size() > b.first.size(); });
1568+
for (const auto &entry : sorted_pipelines) {
1569+
if (pipeline_name.find(entry.first) != std::string::npos) {
1570+
return entry.second;
1571+
}
1572+
}
1573+
return config.default_subgroup_size;
1574+
}
1575+
}
1576+
return 0; // If no matching configuration is found
1577+
}
1578+
14511579
static void ggml_vk_load_shaders(vk_device& device) {
14521580
VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")");
14531581

@@ -1574,6 +1702,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
15741702
uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
15751703
uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
15761704

1705+
if (!require_full_subgroups && required_subgroup_size == 0) {
1706+
required_subgroup_size = get_subgroup_size(name, device->architecture);
1707+
}
1708+
15771709
if (!pipeline) {
15781710
pipeline = std::make_shared<vk_pipeline_struct>();
15791711
pipeline->name = name;
@@ -2250,7 +2382,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
22502382
device->need_compiles = false;
22512383
}
22522384

2253-
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props);
2385+
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch);
22542386

22552387
static vk_device ggml_vk_get_device(size_t idx) {
22562388
VK_LOG_DEBUG("ggml_vk_get_device(" << idx << ")");
@@ -2279,6 +2411,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
22792411
device->physical_device = physical_devices[dev_num];
22802412
const std::vector<vk::ExtensionProperties> ext_props = device->physical_device.enumerateDeviceExtensionProperties();
22812413

2414+
device->architecture = get_device_architecture(device->physical_device);
2415+
22822416
const char* GGML_VK_PREFER_HOST_MEMORY = getenv("GGML_VK_PREFER_HOST_MEMORY");
22832417
device->prefer_host_memory = GGML_VK_PREFER_HOST_MEMORY != nullptr;
22842418

@@ -2291,7 +2425,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
22912425
bool coopmat2_support = false;
22922426
device->coopmat_support = false;
22932427

2294-
// Check if maintenance4 is supported
22952428
for (const auto& properties : ext_props) {
22962429
if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
22972430
maintenance4_support = true;
@@ -2404,7 +2537,7 @@ static vk_device ggml_vk_get_device(size_t idx) {
24042537

24052538
device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
24062539

2407-
if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props)) {
2540+
if (!ggml_vk_khr_cooperative_matrix_support(device->properties, driver_props, device->architecture)) {
24082541
device->coopmat_support = false;
24092542
}
24102543

@@ -2782,7 +2915,10 @@ static void ggml_vk_print_gpu_info(size_t idx) {
27822915
subgroup_props.pNext = &driver_props;
27832916
physical_device.getProperties2(&props2);
27842917

2785-
const size_t subgroup_size = subgroup_props.subgroupSize;
2918+
vk_device_architecture arch = get_device_architecture(physical_device);
2919+
uint32_t default_subgroup_size = get_subgroup_size("", arch);
2920+
const size_t subgroup_size = (default_subgroup_size != 0) ? default_subgroup_size : subgroup_props.subgroupSize;
2921+
27862922
const bool uma = props2.properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
27872923

27882924
bool fp16_storage = false;
@@ -2808,7 +2944,9 @@ static void ggml_vk_print_gpu_info(size_t idx) {
28082944
}
28092945
}
28102946

2811-
if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props)) {
2947+
const vk_device_architecture device_architecture = get_device_architecture(physical_device);
2948+
2949+
if (!ggml_vk_khr_cooperative_matrix_support(props2.properties, driver_props, device_architecture)) {
28122950
coopmat_support = false;
28132951
}
28142952

@@ -8843,18 +8981,15 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
88438981
UNUSED(instance_extensions);
88448982
}
88458983

8846-
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props) {
8984+
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
88478985
switch (props.vendorID) {
88488986
case VK_VENDOR_ID_INTEL:
88498987
// Intel drivers don't support coopmat properly yet
88508988
return false;
88518989
case VK_VENDOR_ID_AMD:
88528990
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
88538991
// Workaround for AMD proprietary driver reporting support on all GPUs
8854-
const std::string name = props.deviceName;
8855-
return name.rfind("AMD Radeon RX 7", 0) == 0 || name.rfind("AMD Radeon(TM) RX 7", 0) == 0 || // RDNA 3 consumer GPUs
8856-
name.rfind("AMD Radeon PRO W7", 0) == 0 || name.rfind("AMD Radeon(TM) PRO W7", 0) == 0 || // RDNA 3 workstation GPUs
8857-
name.rfind("AMD Radeon 7", 0) == 0 || name.rfind("AMD Radeon(TM) 7", 0) == 0; // RDNA 3 APUs
8992+
return arch == vk_device_architecture::AMD_RDNA3;
88588993
}
88598994
return true;
88608995
default:

0 commit comments

Comments
 (0)