[Feature] Support Tenstorrent's Wormhole accelerators #2573 (#2574)

peterschmidt85 · web-flow · commit d428e742588a · 2025-04-30T11:10:16.000+02:00
diff --git a/runner/internal/shim/docker.go b/runner/internal/shim/docker.go
@@ -151,6 +151,14 @@ func (d *DockerRunner) restoreStateFromContainers(ctx context.Context) error {
 						gpuIDs = append(gpuIDs, device.PathOnHost)
 					}
 				}
+			case host.GpuVendorTenstorrent:
+				for _, device := range containerFull.HostConfig.Resources.Devices {
+					if strings.HasPrefix(device.PathOnHost, "/dev/tenstorrent/") {
+						// Extract the device ID from the path
+						deviceID := strings.TrimPrefix(device.PathOnHost, "/dev/tenstorrent/")
+						gpuIDs = append(gpuIDs, deviceID)
+					}
+				}
 			case host.GpuVendorIntel:
 				for _, envVar := range containerFull.Config.Env {
 					if indices, found := strings.CutPrefix(envVar, "HABANA_VISIBLE_DEVICES="); found {
@@ -1009,6 +1017,7 @@ func configureGpuDevices(hostConfig *container.HostConfig, gpuDevices []GPUDevic
 func configureGpus(config *container.Config, hostConfig *container.HostConfig, vendor host.GpuVendor, ids []string) {
 	// NVIDIA: ids are identifiers reported by nvidia-smi, GPU-<UUID> strings
 	// AMD: ids are DRI render node paths, e.g., /dev/dri/renderD128
+	// Tenstorrent: ids are device indices to be used with /dev/tenstorrent/<id>
 	switch vendor {
 	case host.GpuVendorNvidia:
 		hostConfig.Resources.DeviceRequests = append(
@@ -1051,6 +1060,19 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v
 		// --security-opt=seccomp=unconfined
 		hostConfig.SecurityOpt = append(hostConfig.SecurityOpt, "seccomp=unconfined")
 		// TODO: in addition, for non-root user, --group-add=video, and possibly --group-add=render, are required.
+	case host.GpuVendorTenstorrent:
+		// Tenstorrent: expose each /dev/tenstorrent/<id> node in the container at the same path
+		for _, id := range ids {
+			devicePath := fmt.Sprintf("/dev/tenstorrent/%s", id)
+			hostConfig.Resources.Devices = append(
+				hostConfig.Resources.Devices,
+				container.DeviceMapping{
+					PathOnHost:        devicePath,
+					PathInContainer:   devicePath,
+					CgroupPermissions: "rwm",
+				},
+			)
+		}
 	case host.GpuVendorIntel:
 		// All options are listed here:
 		// https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html
diff --git a/runner/internal/shim/host/gpu.go b/runner/internal/shim/host/gpu.go
@@ -18,13 +18,16 @@ import (
 
 const amdSmiImage = "un1def/amd-smi:6.2.2-0"
 
+const ttSmiImage = "dstackai/tt-smi:latest" // Docker image run by getTenstorrentGpuInfo to query devices via tt-smi
+
 type GpuVendor string
 
 const (
-	GpuVendorNone   GpuVendor = "none"
-	GpuVendorNvidia GpuVendor = "nvidia"
-	GpuVendorAmd    GpuVendor = "amd"
-	GpuVendorIntel  GpuVendor = "intel"
+	GpuVendorNone        GpuVendor = "none"
+	GpuVendorNvidia      GpuVendor = "nvidia"
+	GpuVendorAmd         GpuVendor = "amd"
+	GpuVendorIntel       GpuVendor = "intel"       // GetGpuVendor probes /dev/accel for this vendor
+	GpuVendorTenstorrent GpuVendor = "tenstorrent" // GetGpuVendor probes /dev/tenstorrent for this vendor
 )
 
 type GpuInfo struct {
@@ -57,6 +60,9 @@ func GetGpuVendor() GpuVendor {
 	if _, err := os.Stat("/dev/accel"); !errors.Is(err, os.ErrNotExist) {
 		return GpuVendorIntel
 	}
+	if _, err := os.Stat("/dev/tenstorrent"); !errors.Is(err, os.ErrNotExist) {
+		return GpuVendorTenstorrent
+	}
 	return GpuVendorNone
 }
 
@@ -68,6 +74,8 @@ func GetGpuInfo(ctx context.Context) []GpuInfo {
 		return getAmdGpuInfo(ctx)
 	case GpuVendorIntel:
 		return getIntelGpuInfo(ctx)
+	case GpuVendorTenstorrent:
+		return getTenstorrentGpuInfo(ctx)
 	case GpuVendorNone:
 		return []GpuInfo{}
 	}
@@ -195,6 +203,85 @@ func getAmdGpuInfo(ctx context.Context) []GpuInfo {
 	return gpus
 }
 
+type ttSmiSnapshot struct { // top-level object of the tt-smi JSON output decoded in getTenstorrentGpuInfo
+	DeviceInfo []ttDeviceInfo `json:"device_info"` // entries of the "device_info" array; each becomes one GpuInfo
+}
+
+type ttDeviceInfo struct { // one element of the "device_info" array
+	BoardInfo ttBoardInfo `json:"board_info"` // only "board_info" is decoded from each element
+}
+
+type ttBoardInfo struct { // the "board_info" object of a device entry
+	BoardType string `json:"board_type"` // board name; a trailing " R" or " L" is stripped by getTenstorrentGpuInfo
+	BusID     string `json:"bus_id"`     // copied into GpuInfo.ID
+}
+
+// getTenstorrentGpuInfo runs tt-smi in a container and converts its JSON
+// snapshot into GpuInfo entries; on any failure it logs and returns an empty slice.
+func getTenstorrentGpuInfo(ctx context.Context) []GpuInfo {
+	gpus := []GpuInfo{}
+
+	cmd := execute.ExecTask{
+		Command: "docker",
+		Args: []string{
+			"run",
+			"--rm",
+			"--device", "/dev/tenstorrent",
+			ttSmiImage,
+			"-s",
+		},
+		StreamStdio: false,
+	}
+	res, err := cmd.Execute(ctx)
+	if err != nil {
+		log.Error(ctx, "failed to execute tt-smi", "err", err)
+		return gpus
+	}
+	if res.ExitCode != 0 {
+		log.Error(
+			ctx, "failed to execute tt-smi",
+			"exitcode", res.ExitCode, "stdout", res.Stdout, "stderr", res.Stderr,
+		)
+		return gpus
+	}
+
+	// Named "snapshot" so the local does not shadow the ttSmiSnapshot type.
+	var snapshot ttSmiSnapshot
+	if err := json.Unmarshal([]byte(res.Stdout), &snapshot); err != nil {
+		log.Error(ctx, "cannot read tt-smi json", "err", err)
+		log.Debug(ctx, "tt-smi output", "stdout", res.Stdout)
+		return gpus
+	}
+
+	for i, device := range snapshot.DeviceInfo {
+		// Drop a single trailing " R" or " L" marker from the board type, if any.
+		boardType := strings.TrimSpace(device.BoardInfo.BoardType)
+		name, found := strings.CutSuffix(boardType, " R")
+		if !found {
+			name = strings.TrimSuffix(boardType, " L")
+		}
+
+		// VRAM is looked up by board-name prefix; unrecognized boards report 0.
+		vram := 0
+		switch {
+		case strings.HasPrefix(name, "n150"):
+			vram = 12 * 1024 // 12GB in MiB
+		case strings.HasPrefix(name, "n300"):
+			vram = 24 * 1024 // 24GB in MiB
+		}
+
+		gpus = append(gpus, GpuInfo{
+			Vendor: GpuVendorTenstorrent,
+			Name:   name,
+			Vram:   vram,
+			ID:     device.BoardInfo.BusID,
+			Index:  strconv.Itoa(i), // position of the device in the tt-smi output
+		})
+	}
+
+	return gpus
+}
+
 func getAmdRenderNodePath(bdf string) (string, error) {
 	// amd-smi uses extended BDF Notation with domain: Domain:Bus:Device.Function, e.g., 0000:5f:00.0
 	// udev creates /dev/dri/by-path/pci-<BDF>-render -> ../renderD<N> symlinks
diff --git a/runner/internal/shim/resources.go b/runner/internal/shim/resources.go
@@ -42,6 +42,8 @@ func NewGpuLock(gpus []host.GpuInfo) (*GpuLock, error) {
 				resourceID = gpu.ID
 			case host.GpuVendorAmd:
 				resourceID = gpu.RenderNodePath
+			case host.GpuVendorTenstorrent:
+				resourceID = gpu.Index
 			case host.GpuVendorIntel:
 				resourceID = gpu.Index
 			case host.GpuVendorNone:
diff --git a/src/dstack/_internal/cli/services/configurators/run.py b/src/dstack/_internal/cli/services/configurators/run.py
@@ -52,7 +52,7 @@
 _KNOWN_AMD_GPUS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_AMD_GPUS}
 _KNOWN_NVIDIA_GPUS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_NVIDIA_GPUS}
 _KNOWN_TPU_VERSIONS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_TPUS}
-
+_KNOWN_TENSTORRENT_GPUS = {gpu.name.lower() for gpu in gpuhunt.KNOWN_TENSTORRENT_ACCELERATORS}
 _BIND_ADDRESS_ARG = "bind_address"
 
 logger = get_logger(__name__)
@@ -350,6 +350,7 @@ def validate_gpu_vendor_and_image(self, conf: BaseRunConfiguration) -> None:
         if gpu_spec.count.max == 0:
             return
         has_amd_gpu: bool
+        has_tt_gpu: bool
         vendor = gpu_spec.vendor
         if vendor is None:
             names = gpu_spec.name
@@ -362,6 +363,8 @@ def validate_gpu_vendor_and_image(self, conf: BaseRunConfiguration) -> None:
                         vendors.add(gpuhunt.AcceleratorVendor.NVIDIA)
                     elif name in _KNOWN_AMD_GPUS:
                         vendors.add(gpuhunt.AcceleratorVendor.AMD)
+                    elif name in _KNOWN_TENSTORRENT_GPUS:
+                        vendors.add(gpuhunt.AcceleratorVendor.TENSTORRENT)
                     else:
                         maybe_tpu_version, _, maybe_tpu_cores = name.partition("-")
                         if maybe_tpu_version in _KNOWN_TPU_VERSIONS and maybe_tpu_cores.isdigit():
@@ -380,15 +383,22 @@ def validate_gpu_vendor_and_image(self, conf: BaseRunConfiguration) -> None:
                 # to execute a run on an instance with an AMD accelerator with a default
                 # CUDA image, not a big deal.
                 has_amd_gpu = gpuhunt.AcceleratorVendor.AMD in vendors
+                has_tt_gpu = gpuhunt.AcceleratorVendor.TENSTORRENT in vendors
             else:
                 # If neither gpu.vendor nor gpu.name is set, assume Nvidia.
                 vendor = gpuhunt.AcceleratorVendor.NVIDIA
                 has_amd_gpu = False
+                has_tt_gpu = False
             gpu_spec.vendor = vendor
         else:
             has_amd_gpu = vendor == gpuhunt.AcceleratorVendor.AMD
+            has_tt_gpu = vendor == gpuhunt.AcceleratorVendor.TENSTORRENT
         if has_amd_gpu and conf.image is None:
-            raise ConfigurationError("`image` is required if `resources.gpu.vendor` is AMD.")
+            raise ConfigurationError("`image` is required if `resources.gpu.vendor` is `amd`")
+        if has_tt_gpu and conf.image is None:
+            raise ConfigurationError(
+                "`image` is required if `resources.gpu.vendor` is `tenstorrent`"
+            )
 
 
 class RunWithPortsConfigurator(BaseRunConfigurator):
diff --git a/src/dstack/_internal/core/models/resources.py b/src/dstack/_internal/core/models/resources.py
@@ -246,6 +246,8 @@ def _vendor_from_string(cls, v: str) -> gpuhunt.AcceleratorVendor:
         v = v.lower()
         if v == "tpu":
             return gpuhunt.AcceleratorVendor.GOOGLE
+        if v == "tt":
+            return gpuhunt.AcceleratorVendor.TENSTORRENT
         return gpuhunt.AcceleratorVendor.cast(v)