@@ -18,13 +18,16 @@ import (
18
18
19
19
// Docker image references for the vendor SMI tools.
const (
	// amdSmiImage is the image reference for amd-smi
	// (presumably consumed by getAmdGpuInfo; its use is outside this view).
	amdSmiImage = "un1def/amd-smi:6.2.2-0"
	// ttSmiImage is the image that getTenstorrentGpuInfo passes to `docker run`.
	ttSmiImage = "dstackai/tt-smi:latest"
)
22
+
21
23
// GpuVendor identifies an accelerator vendor by its lowercase string name.
type GpuVendor string

// Known GpuVendor values. GpuVendorNone is the value GetGpuVendor falls back
// to when none of the probed device paths exist.
const (
	GpuVendorNone        = GpuVendor("none")
	GpuVendorNvidia      = GpuVendor("nvidia")
	GpuVendorAmd         = GpuVendor("amd")
	GpuVendorIntel       = GpuVendor("intel")
	GpuVendorTenstorrent = GpuVendor("tenstorrent")
)
29
32
30
33
type GpuInfo struct {
@@ -57,6 +60,9 @@ func GetGpuVendor() GpuVendor {
57
60
if _ , err := os .Stat ("/dev/accel" ); ! errors .Is (err , os .ErrNotExist ) {
58
61
return GpuVendorIntel
59
62
}
63
+ if _ , err := os .Stat ("/dev/tenstorrent" ); ! errors .Is (err , os .ErrNotExist ) {
64
+ return GpuVendorTenstorrent
65
+ }
60
66
return GpuVendorNone
61
67
}
62
68
@@ -68,6 +74,8 @@ func GetGpuInfo(ctx context.Context) []GpuInfo {
68
74
return getAmdGpuInfo (ctx )
69
75
case GpuVendorIntel :
70
76
return getIntelGpuInfo (ctx )
77
+ case GpuVendorTenstorrent :
78
+ return getTenstorrentGpuInfo (ctx )
71
79
case GpuVendorNone :
72
80
return []GpuInfo {}
73
81
}
@@ -195,6 +203,85 @@ func getAmdGpuInfo(ctx context.Context) []GpuInfo {
195
203
return gpus
196
204
}
197
205
206
+ type ttSmiSnapshot struct {
207
+ DeviceInfo []ttDeviceInfo `json:"device_info"`
208
+ }
209
+
210
+ type ttDeviceInfo struct {
211
+ BoardInfo ttBoardInfo `json:"board_info"`
212
+ }
213
+
214
// ttBoardInfo holds the "board_info" fields of a tt-smi device entry that
// getTenstorrentGpuInfo uses.
type ttBoardInfo struct {
	// BoardType is the raw "board_type" string; the caller trims whitespace
	// and a trailing " R"/" L" from it to form the device name.
	BoardType string `json:"board_type"`
	// BusID is the raw "bus_id" string; the caller uses it as the device ID.
	BusID string `json:"bus_id"`
}
218
+
219
+ func getTenstorrentGpuInfo (ctx context.Context ) []GpuInfo {
220
+ gpus := []GpuInfo {}
221
+
222
+ cmd := execute.ExecTask {
223
+ Command : "docker" ,
224
+ Args : []string {
225
+ "run" ,
226
+ "--rm" ,
227
+ "--device" , "/dev/tenstorrent" ,
228
+ ttSmiImage ,
229
+ "-s" ,
230
+ },
231
+ StreamStdio : false ,
232
+ }
233
+ res , err := cmd .Execute (ctx )
234
+ if err != nil {
235
+ log .Error (ctx , "failed to execute tt-smi" , "err" , err )
236
+ return gpus
237
+ }
238
+ if res .ExitCode != 0 {
239
+ log .Error (
240
+ ctx , "failed to execute tt-smi" ,
241
+ "exitcode" , res .ExitCode , "stdout" , res .Stdout , "stderr" , res .Stderr ,
242
+ )
243
+ return gpus
244
+ }
245
+
246
+ var ttSmiSnapshot ttSmiSnapshot
247
+ if err := json .Unmarshal ([]byte (res .Stdout ), & ttSmiSnapshot ); err != nil {
248
+ log .Error (ctx , "cannot read tt-smi json" , "err" , err )
249
+ log .Debug (ctx , "tt-smi output" , "stdout" , res .Stdout )
250
+ return gpus
251
+ }
252
+
253
+ for i , device := range ttSmiSnapshot .DeviceInfo {
254
+ // Extract board type without R/L suffix
255
+ boardType := strings .TrimSpace (device .BoardInfo .BoardType )
256
+ name := boardType
257
+
258
+ // Remove " R" or " L" suffix if present
259
+ if strings .HasSuffix (boardType , " R" ) {
260
+ name = boardType [:len (boardType )- 2 ]
261
+ } else if strings .HasSuffix (boardType , " L" ) {
262
+ name = boardType [:len (boardType )- 2 ]
263
+ }
264
+
265
+ // Determine VRAM based on board type
266
+ vram := 0
267
+ if strings .HasPrefix (name , "n150" ) {
268
+ vram = 12 * 1024 // 12GB in MiB
269
+ } else if strings .HasPrefix (name , "n300" ) {
270
+ vram = 24 * 1024 // 24GB in MiB
271
+ }
272
+
273
+ gpus = append (gpus , GpuInfo {
274
+ Vendor : GpuVendorTenstorrent ,
275
+ Name : name ,
276
+ Vram : vram ,
277
+ ID : device .BoardInfo .BusID ,
278
+ Index : strconv .Itoa (i ),
279
+ })
280
+ }
281
+
282
+ return gpus
283
+ }
284
+
198
285
func getAmdRenderNodePath (bdf string ) (string , error ) {
199
286
// amd-smi uses extended BDF Notation with domain: Domain:Bus:Device.Function, e.g., 0000:5f:00.0
200
287
// udev creates /dev/dri/by-path/pci-<BDF>-render -> ../renderD<N> symlinks
0 commit comments