Skip to content

Commit

Permalink
Correctly expose only the allocated GPU devices to Docker jobs
Browse files Browse the repository at this point in the history
Previously we assumed that there was a simple mapping of:

    /dev/dri/cardN -> /dev/dri/renderD$(N + 128)

However, inspection shows that this is not the case. Instead, we have to
know the PCI address of the card and use the symlinks in /dev/dri/by-path
to look up the correct card and render devices.

So we now collect PCI addresses for AMD and Intel cards. We don't need
to do this for Nvidia cards because they are exposed by a different
mechanism.
  • Loading branch information
simonwo committed Dec 8, 2023
1 parent f6380ca commit e9dd25e
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 22 deletions.
13 changes: 8 additions & 5 deletions pkg/compute/capacity/system/gpu/amd.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"encoding/json"
"io"
"strconv"
"strings"

"github.com/bacalhau-project/bacalhau/pkg/compute/capacity"
"github.com/bacalhau-project/bacalhau/pkg/models"
Expand All @@ -12,19 +13,20 @@ import (
const rocmCommand = "rocm-smi"
const bytesPerMebibyte = 1048576

var rocmArgs = []string{"--showproductname", "--showmeminfo", "vram", "--json"}
var rocmArgs = []string{"--showproductname", "--showbus", "--showmeminfo", "vram", "--json"}

// {"card0": {"VRAM Total Memory (B)": "68702699520", "VRAM Total Used Memory
// (B)": "10960896", "Card series": "Instinct MI210", "Card model": "0x0c34",
// "Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]", "Card SKU":
// "D67301"}}
// {"card0": {"PCI Bus": "0000:E7:00.0", "VRAM Total Memory (B)": "68702699520",
// "VRAM Total Used Memory (B)": "10960896", "Card series": "Instinct MI210",
// "Card model": "0x0c34", "Card vendor": "Advanced Micro Devices, Inc.
// [AMD/ATI]", "Card SKU": "D67301"}}
// rocmGPU holds the per-card fields decoded from the JSON printed by rocm-smi.
// Every value arrives as a JSON string, as in the sample above.
type rocmGPU struct {
// Total VRAM in bytes, as a decimal string (e.g. "68702699520").
TotalMemory string `json:"VRAM Total Memory (B)"`
// VRAM currently in use in bytes, as a decimal string (e.g. "10960896").
UsedMemory string `json:"VRAM Total Used Memory (B)"`
// Product series name (e.g. "Instinct MI210").
Series string `json:"Card series"`
// Card model identifier as reported by the tool (e.g. "0x0c34").
Model string `json:"Card model"`
// Vendor string as reported by the tool (e.g. "Advanced Micro Devices, Inc. [AMD/ATI]").
Vendor string `json:"Card vendor"`
// Card SKU (e.g. "D67301").
SKU string `json:"Card SKU"`
// PCI bus address (e.g. "0000:E7:00.0"). rocm-smi reports the hex letters
// in uppercase; the parser lowercases the value before storing it on the GPU.
PCIAddress string `json:"PCI Bus"`
}

// rocmGPUList is the top-level JSON object printed by rocm-smi: one rocmGPU
// record per card, keyed by the card's name (e.g. "card0").
type rocmGPUList map[string]rocmGPU
Expand All @@ -50,6 +52,7 @@ func parseRocmSMIOutput(output io.Reader) (models.Resources, error) {
}
gpus[index].Memory = memBytes / bytesPerMebibyte // convert to mebibytes
gpus[index].Vendor = models.GPUVendorAMDATI
gpus[index].PCIAddress = strings.ToLower(record.PCIAddress) // hex letters are uppercase
}

return models.Resources{GPU: uint64(len(gpus)), GPUs: gpus}, nil
Expand Down
3 changes: 2 additions & 1 deletion pkg/compute/capacity/system/gpu/amd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

func TestParsingAMDGPUsWithOne(t *testing.T) {
output := strings.NewReader(
`{"card0": {"VRAM Total Memory (B)": "68702699520", ` +
`{"card0": {"PCI Bus": "0000:E7:00.0", "VRAM Total Memory (B)": "68702699520", ` +
`"VRAM Total Used Memory (B)": "10960896", ` +
`"Card series": "Instinct MI210", "Card model": "0x0c34", ` +
`"Card vendor": "Advanced Micro Devices, Inc. [AMD/ATI]", "Card SKU":` +
Expand All @@ -32,6 +32,7 @@ func TestParsingAMDGPUsWithOne(t *testing.T) {
require.Equal(t, uint64(0), gpus[0].Index)
require.Equal(t, "Instinct MI210", gpus[0].Name)
require.Equal(t, uint64(65520), gpus[0].Memory)
require.Equal(t, "0000:e7:00.0", gpus[0].PCIAddress)
}

func TestParsingAMDGPUsWithMany(t *testing.T) {
Expand Down
9 changes: 6 additions & 3 deletions pkg/compute/capacity/system/gpu/intel.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type xpuDeviceList struct {
List []struct {
DeviceID uint64 `json:"device_id"`
DeviceName string `json:"device_name"`
PCIAddress string `json:"pci_bdf_address"`
} `json:"device_list"`
}

Expand Down Expand Up @@ -47,13 +48,14 @@ type xpuDeviceInfo struct {
DeviceID uint64 `json:"device_id"`
DeviceName string `json:"device_name"`
TotalMemory string `json:"memory_physical_size_byte"`
PCIAddress string `json:"pci_bdf_address"`
}

var xpuDeviceInfoProvider = capacity.ToolBasedProvider{
Command: "xpu-smi",
Provides: "Intel GPUs",
// note: Args require a device ID, appended later
Args: []string{"discovery", "--json", "--device"},
Args: []string{"discovery", "--json", "--device"},
Parser: func(output io.Reader) (models.Resources, error) {
var record xpuDeviceInfo
err := json.NewDecoder(output).Decode(&record)
Expand All @@ -69,8 +71,9 @@ var xpuDeviceInfoProvider = capacity.ToolBasedProvider{
gpu := models.GPU{
Index: record.DeviceID,
Name: record.DeviceName,
Vendor: models.GPUVendorIntel,
Memory: parsedMemoryBytes / bytesPerMebibyte,
Vendor: models.GPUVendorIntel,
Memory: parsedMemoryBytes / bytesPerMebibyte,
PCIAddress: record.PCIAddress,
}

return models.Resources{GPU: 1, GPUs: []models.GPU{gpu}}, nil
Expand Down
2 changes: 2 additions & 0 deletions pkg/compute/capacity/system/gpu/intel_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func TestParsingIntelGPUsWithOne(t *testing.T) {
gpu := output.GPUs[0]
require.Equal(t, models.GPUVendorIntel, gpu.Vendor)
require.Equal(t, uint64(0), gpu.Index)
require.Equal(t, "0000:e9:00.0", gpu.PCIAddress)
require.Equal(t, "Intel Corporation Device 56c1 (rev 05)", gpu.Name)
require.Equal(t, uint64(5068), gpu.Memory)

Expand All @@ -87,6 +88,7 @@ func TestParsingIntelGPUsWithMany(t *testing.T) {
for _, gpu := range output.GPUs {
require.Equal(t, models.GPUVendorIntel, gpu.Vendor)
require.Equal(t, uint64(0), gpu.Index)
require.Equal(t, "0000:e9:00.0", gpu.PCIAddress)
require.Equal(t, "Intel Corporation Device 56c1 (rev 05)", gpu.Name)
require.Equal(t, uint64(5068), gpu.Memory)
}
Expand Down
32 changes: 19 additions & 13 deletions pkg/executor/docker/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -371,19 +371,25 @@ func configureDevices(ctx context.Context, resources *models.Resources) ([]conta
fallthrough
case models.GPUVendorIntel:
// https://github.com/openvinotoolkit/docker_ci/blob/master/docs/accelerators.md
for _, gpu := range gpus {
mappings = append(mappings,
container.DeviceMapping{
PathOnHost: fmt.Sprintf("/dev/dri/card%d", gpu.Index),
PathInContainer: fmt.Sprintf("/dev/dri/card%d", gpu.Index),
CgroupPermissions: "rwm",
},
container.DeviceMapping{
PathOnHost: fmt.Sprintf("/dev/dri/renderD%d", (128 + gpu.Index)),
PathInContainer: fmt.Sprintf("/dev/dri/renderD%d", (128 + gpu.Index)),
CgroupPermissions: "rwm",
},
)
paths := lo.FlatMap[models.GPU, string](gpus, func(gpu models.GPU, _ int) []string {
return []string{
filepath.Join("/dev/dri/by-path/", fmt.Sprintf("pci-%s-card", gpu.PCIAddress)),
filepath.Join("/dev/dri/by-path/", fmt.Sprintf("pci-%s-render", gpu.PCIAddress)),
}
})

for _, path := range paths {
// We need to use the PCI address of the GPU to look up the correct devices to expose
absPath, err := filepath.EvalSymlinks(path)
if err != nil {
return nil, nil, errors.Wrapf(err, "could not find attached device for GPU at %q", path)
}

mappings = append(mappings, container.DeviceMapping{
PathOnHost: absPath,
PathInContainer: absPath,
CgroupPermissions: "rwm",
})
}
default:
return nil, nil, fmt.Errorf("job requires GPU from unsupported vendor %q", vendor)
Expand Down
3 changes: 3 additions & 0 deletions pkg/models/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ type GPU struct {
Vendor GPUVendor
// Total GPU memory in mebibytes (MiB)
Memory uint64
// PCI address of the device, in the format AAAA:BB:CC.D
// (domain:bus:device.function), e.g. 0000:e7:00.0.
// Used to look up the device's card and render nodes in /dev/dri/by-path.
PCIAddress string
}

type Resources struct {
Expand Down

0 comments on commit e9dd25e

Please sign in to comment.