From b338987d972a84b0dd86ad06cba17e316503273e Mon Sep 17 00:00:00 2001 From: aditsharma55 Date: Fri, 17 Apr 2026 18:38:53 +0000 Subject: [PATCH] fix: filter correct disk image during boot disk selection --- v1/providers/nebius/instance.go | 280 ++++++++++++++++----------- v1/providers/nebius/instance_test.go | 81 ++++++++ 2 files changed, 244 insertions(+), 117 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index b4747730..1ea3b394 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -18,6 +18,22 @@ const ( platformTypeCPU = "cpu" ) +// image scoring tiers. higher wins. exact-match bonus (200) > max baseline gap +// so explicit requests always override defaults, even for worker-node. +const ( + imageScoreBaseline = 1 + imageScoreWorkerNode = 10 + imageScoreUbuntuGeneric = 50 + imageScoreUbuntu22 = 55 + imageScoreUbuntu24 = 60 + imageScoreUbuntu22Cuda = 80 + imageScoreUbuntu24Cuda12 = 90 + imageScoreUbuntu24Cuda13 = 100 + + imageScoreExactMatchBonus = 200 + imageScoreUbuntuHintBonus = 50 +) + //nolint:gocyclo,funlen // Complex instance creation with resource management func (c *NebiusClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) { // Track created resources for automatic cleanup on failure @@ -1205,47 +1221,8 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri } // First, try to resolve and use image family - if imageFamily, err := c.resolveImageFamily(ctx, attrs.ImageID); err == nil { - publicImagesParent := c.getPublicImagesParent() - - // Skip validation for known-good common families to speed up instance start - knownFamilies := []string{"ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8"} - isKnownFamily := false - for _, known := range knownFamilies { - if imageFamily == known { - isKnownFamily = true - break - } - } - - if isKnownFamily { - // Use known family without validation - baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ - SourceImageFamily: &compute.SourceImageFamily{ - ImageFamily: imageFamily, - ParentId: publicImagesParent, - }, - } - baseReq.Metadata.Labels["image-family"] = imageFamily - return baseReq, nil - } - - // For unknown families, validate first - _, err := c.sdk.Services().Compute().V1().Image().GetLatestByFamily(ctx, &compute.GetImageLatestByFamilyRequest{ - ParentId: publicImagesParent, - ImageFamily: imageFamily, - }) - if err == nil { - // Family works, use it - baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ - SourceImageFamily: &compute.SourceImageFamily{ - ImageFamily: imageFamily, - ParentId: publicImagesParent, - }, - } - baseReq.Metadata.Labels["image-family"] = imageFamily - return baseReq, nil - } + if c.tryApplyImageFamilySource(ctx, baseReq, attrs.ImageID) { + return baseReq, nil } // Family approach failed, try to use a known working public image ID @@ -1262,80 +1239,170 @@ func (c *NebiusClient) buildDiskCreateRequest(ctx context.Context, diskName stri return nil, fmt.Errorf("could not resolve image %s to either a working family or image ID: %w", attrs.ImageID, err) } -// getWorkingPublicImageID gets a working public image ID based on the requested image type -// -//nolint:gocognit,gocyclo // Complex function trying multiple image resolution strategies -func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedImage string) (string, error) { - // Get available public images from the correct region +// tryApplyImageFamilySource attempts to set baseReq's disk source via image-family lookup. +// Returns true if a family-based source was applied (caller should return baseReq). +// Returns false if the caller should fall back to scoring (getWorkingPublicImageID). +func (c *NebiusClient) tryApplyImageFamilySource(ctx context.Context, baseReq *compute.CreateDiskRequest, imageID string) bool { + imageFamily, resolveErr := c.resolveImageFamily(ctx, imageID) + if resolveErr != nil { + return false + } + publicImagesParent := c.getPublicImagesParent() - imagesResp, err := c.sdk.Services().Compute().V1().Image().List(ctx, &compute.ListImagesRequest{ - ParentId: publicImagesParent, + knownFamilies := []string{"ubuntu24.04-cuda13.0", "ubuntu24.04-cuda12", "ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8"} + isKnownFamily := false + for _, known := range knownFamilies { + if imageFamily == known { + isKnownFamily = true + break + } + } + + if isKnownFamily { + applyImageFamilySource(baseReq, imageFamily, publicImagesParent) + return true + } + + latestImage, err := c.sdk.Services().Compute().V1().Image().GetLatestByFamily(ctx, &compute.GetImageLatestByFamilyRequest{ + ParentId: publicImagesParent, + ImageFamily: imageFamily, }) if err != nil { - return "", fmt.Errorf("failed to list public images: %w", err) + return false } - if len(imagesResp.GetItems()) == 0 { - return "", fmt.Errorf("no public images available") + if latestImage.Spec != nil && latestImage.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { + return false } - // Try to find the best match based on the requested image + applyImageFamilySource(baseReq, imageFamily, publicImagesParent) + return true +} + +func applyImageFamilySource(baseReq *compute.CreateDiskRequest, imageFamily, publicImagesParent string) { + baseReq.Spec.Source = &compute.DiskSpec_SourceImageFamily{ + SourceImageFamily: &compute.SourceImageFamily{ + ImageFamily: imageFamily, + ParentId: publicImagesParent, + }, + } + baseReq.Metadata.Labels["image-family"] = imageFamily +} + +// getWorkingPublicImageID gets a working public image ID based on the requested image type. +// It scores every non-ARM64 image and returns the highest-scored one so the result +// is deterministic regardless of the page order returned by the Nebius API. +func (c *NebiusClient) getWorkingPublicImageID(ctx context.Context, requestedImage string) (string, error) { + publicImagesParent := c.getPublicImagesParent() requestedLower := strings.ToLower(requestedImage) - var bestMatch *compute.Image - var fallbackImage *compute.Image + var bestImage *compute.Image + bestScore := -1 + var iterErr error - for _, image := range imagesResp.GetItems() { + // Filter auto-paginates via the SDK. Using List directly only returns the first + // page (small default size), which can omit ubuntu24.04-cuda13.0 entirely. + imageIter := c.sdk.Services().Compute().V1().Image().Filter(ctx, &compute.ListImagesRequest{ + ParentId: publicImagesParent, + PageSize: 988, + }) + imageIter(func(image *compute.Image, err error) bool { + if err != nil { + iterErr = err + return false + } if image.Metadata == nil { - continue + return true + } + if image.Spec != nil && image.Spec.GetCpuArchitecture() == compute.ImageSpec_ARM64 { + return true } - imageName := strings.ToLower(image.Metadata.Name) - - // Set fallback to first available image - if fallbackImage == nil { - fallbackImage = image + if score := scoreImage(image, requestedLower); score > bestScore { + bestScore = score + bestImage = image } + return true + }) - // Look for Ubuntu matches - if strings.Contains(requestedLower, "ubuntu") && strings.Contains(imageName, "ubuntu") { - // Prefer specific version matches - //nolint:gocritic // if-else chain is clearer than switch for version matching logic - if strings.Contains(requestedLower, "24.04") || strings.Contains(requestedLower, "24") { - if strings.Contains(imageName, "ubuntu24.04") { - bestMatch = image - break - } - } else if strings.Contains(requestedLower, "22.04") || strings.Contains(requestedLower, "22") { - if strings.Contains(imageName, "ubuntu22.04") { - bestMatch = image - break - } - } else if strings.Contains(requestedLower, "20.04") || strings.Contains(requestedLower, "20") { - if strings.Contains(imageName, "ubuntu20.04") { - bestMatch = image - break - } - } + if iterErr != nil { + return "", fmt.Errorf("failed to iterate public images: %w", iterErr) + } - // Any Ubuntu image is better than non-Ubuntu - if bestMatch == nil { - bestMatch = image - } - } + if bestImage == nil { + return "", fmt.Errorf("no suitable public image found") } - // Use best match if found, otherwise fallback - selectedImage := bestMatch - if selectedImage == nil { - selectedImage = fallbackImage + winnerFamily := "" + if bestImage.Spec != nil { + winnerFamily = bestImage.Spec.GetImageFamily() } + c.logger.Info(ctx, "getWorkingPublicImageID: selected image", + v1.LogField("id", bestImage.Metadata.Id), + v1.LogField("name", bestImage.Metadata.Name), + v1.LogField("family", winnerFamily), + v1.LogField("score", bestScore)) - if selectedImage == nil { - return "", fmt.Errorf("no suitable public image found") + return bestImage.Metadata.Id, nil +} + +// scoreImage picks the best public image when Nebius returns a messy list. +// default order: ubuntu24+cuda13 > ubuntu24+cuda12 > ubuntu22+cuda12 > +// ubuntu24 > ubuntu22 > other ubuntu > worker-node > rest. +// request bonus layers on top if the caller asked for something specific. +func scoreImage(image *compute.Image, requestedLower string) int { + family := "" + if image.Spec != nil { + family = strings.ToLower(image.Spec.GetImageFamily()) } + name := strings.ToLower(image.Metadata.Name) - return selectedImage.Metadata.Id, nil + return baseImageScore(name, family) + requestMatchBonus(name, family, requestedLower) +} + +// baseImageScore scores based on the image alone. worker-node is checked first +// so an ubuntu24-cuda12.8 worker image doesn't get classified as ubuntu24+cuda12. +func baseImageScore(name, family string) int { + has := func(s string) bool { return strings.Contains(name, s) || strings.Contains(family, s) } + + if has("mk8s-worker") || strings.Contains(name, "worker-node") { + return imageScoreWorkerNode + } + if !has("ubuntu") { + return imageScoreBaseline + } + if has("ubuntu24") { + if has("cuda13") { + return imageScoreUbuntu24Cuda13 + } + if has("cuda12") { + return imageScoreUbuntu24Cuda12 + } + return imageScoreUbuntu24 + } + if has("ubuntu22") { + if has("cuda12") { + return imageScoreUbuntu22Cuda + } + return imageScoreUbuntu22 + } + return imageScoreUbuntuGeneric +} + +// requestMatchBonus: +200 if request is a substring of name/family (big enough +// to beat any baseline gap), +50 as a weak nudge when caller said "ubuntu" but +// nothing matched directly. +func requestMatchBonus(name, family, requestedLower string) int { + if requestedLower == "" { + return 0 + } + if strings.Contains(name, requestedLower) || strings.Contains(family, requestedLower) || requestedLower == family { + return imageScoreExactMatchBonus + } + if strings.Contains(requestedLower, "ubuntu") && (strings.Contains(name, "ubuntu") || strings.Contains(family, "ubuntu")) { + return imageScoreUbuntuHintBonus + } + return 0 } // getPublicImagesParent determines the correct public images parent ID based on project routing code @@ -1583,6 +1650,9 @@ func (c *NebiusClient) parseInstanceType(ctx context.Context, instanceTypeID str func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) (string, error) { // Common Nebius image families - if ImageID matches one of these, use it directly commonFamilies := []string{ + "ubuntu24.04-cuda13.0", + "ubuntu24.04-cuda12", + "ubuntu24.04-driverless", "ubuntu22.04-cuda12", "mk8s-worker-node-v-1-32-ubuntu24.04", "mk8s-worker-node-v-1-32-ubuntu24.04-cuda12.8", @@ -1601,7 +1671,6 @@ func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) ( // If ImageID looks like a family name pattern (contains dots, dashes, no UUIDs) // and doesn't look like a UUID, assume it's a family name if !strings.Contains(imageID, "-") || len(imageID) < 32 { - // Likely a family name, use it directly return imageID, nil } @@ -1610,8 +1679,6 @@ func (c *NebiusClient) resolveImageFamily(ctx context.Context, imageID string) ( Id: imageID, }) if err != nil { - // If we can't get the image, try using the ID as a family name anyway - // This allows for custom family names that don't match our patterns return imageID, nil } @@ -1752,7 +1819,6 @@ func generateCloudInitUserData(publicKey string, firewallRules v1.FirewallRules) script := `#cloud-config packages: - ufw - - iptables-persistent ` // Add SSH key configuration if provided @@ -1764,18 +1830,6 @@ packages: var commands []string - // Fix a systemd race condition: ufw.service and netfilter-persistent.service - // both start in parallel (both are Before=network-pre.target with no mutual - // ordering). Both call iptables-restore concurrently, and with the iptables-nft - // backend the competing nftables transactions cause UFW to fail with - // "iptables-restore: line 4 failed". This drop-in forces UFW to wait for - // netfilter-persistent to finish first. - commands = append(commands, - "sudo mkdir -p /etc/systemd/system/ufw.service.d", - `printf '[Unit]\nAfter=netfilter-persistent.service\n' | sudo tee /etc/systemd/system/ufw.service.d/after-netfilter.conf > /dev/null`, - "sudo systemctl daemon-reload", - ) - // Generate UFW firewall commands (similar to Shadeform's approach) // UFW (Uncomplicated Firewall) is available on Ubuntu/Debian instances commands = append(commands, generateUFWCommands(firewallRules)...) @@ -1784,14 +1838,6 @@ packages: // accessible from the internet by default. commands = append(commands, generateIPTablesCommands()...) - // Save the complete iptables state (UFW chains + DOCKER-USER rules) so it - // survives instance stop/start cycles. Cloud-init runcmd only executes on - // first boot; on subsequent boots netfilter-persistent restores this snapshot, - // then UFW starts after it (due to the drop-in above) and re-applies its rules. - // This provides defense-in-depth: even if UFW fails for any reason, the - // netfilter-persistent snapshot ensures port 22 and DOCKER-USER rules persist. - commands = append(commands, "sudo netfilter-persistent save") - if len(commands) > 0 { // Use runcmd to execute firewall setup commands script += "\nruncmd:\n" diff --git a/v1/providers/nebius/instance_test.go b/v1/providers/nebius/instance_test.go index 389dea26..58ffe5b6 100644 --- a/v1/providers/nebius/instance_test.go +++ b/v1/providers/nebius/instance_test.go @@ -6,6 +6,8 @@ import ( "time" v1 "github.com/brevdev/cloud/v1" + common "github.com/nebius/gosdk/proto/nebius/common/v1" + compute "github.com/nebius/gosdk/proto/nebius/compute/v1" "github.com/stretchr/testify/assert" ) @@ -417,3 +419,82 @@ func TestParseInstanceTypeFormat(t *testing.T) { }) } } + +func makeTestImage(name, family string) *compute.Image { + return &compute.Image{ + Metadata: &common.ResourceMetadata{Name: name}, + Spec: &compute.ImageSpec{ + ImageFamily: family, + CpuArchitecture: compute.ImageSpec_AMD64, + }, + } +} + +func TestBaseImageScore(t *testing.T) { + tests := []struct { + name, family string + want int + }{ + {"ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0", imageScoreUbuntu24Cuda13}, + {"ubuntu24.04-cuda12.8.0.0.12", "ubuntu24.04-cuda12", imageScoreUbuntu24Cuda12}, + {"ubuntu22.04-cuda12.3.0.0.5", "ubuntu22.04-cuda12", imageScoreUbuntu22Cuda}, + {"ubuntu24.04-driverless-20260401", "ubuntu24.04-driverless", imageScoreUbuntu24}, + {"ubuntu22.04-20260401", "ubuntu22.04", imageScoreUbuntu22}, + {"ubuntu20.04-20260401", "ubuntu20.04", imageScoreUbuntuGeneric}, + {"worker-node-v-1-33-ubuntu24.04-cuda12.8-20260403", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8", imageScoreWorkerNode}, + {"debian12-20260301", "debian12", imageScoreBaseline}, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got := baseImageScore(strings.ToLower(tc.name), strings.ToLower(tc.family)) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestRequestMatchBonus(t *testing.T) { + tests := []struct { + desc, name, family, requested string + want int + }{ + {"empty request, no bonus", "ubuntu24.04-cuda13.0", "ubuntu24.04-cuda13.0", "", 0}, + {"exact family match", "ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0", "ubuntu24.04-cuda13.0", imageScoreExactMatchBonus}, + {"substring match in name", "worker-node-v-1-33-ubuntu24.04-cuda12.8", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8", "worker-node", imageScoreExactMatchBonus}, + {"ubuntu hint without exact match", "ubuntu22.04-cuda12", "ubuntu22.04-cuda12", "ubuntu24.04", imageScoreUbuntuHintBonus}, + {"non-ubuntu request, non-matching image, no bonus", "debian12", "debian12", "ubuntu", 0}, + } + for _, tc := range tests { + t.Run(tc.desc, func(t *testing.T) { + got := requestMatchBonus(strings.ToLower(tc.name), strings.ToLower(tc.family), strings.ToLower(tc.requested)) + assert.Equal(t, tc.want, got) + }) + } +} + +func TestScoreImage_prioritizesUbuntu24Cuda13OverWorkerNode(t *testing.T) { + ubuntu24Cuda13 := makeTestImage("ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0") + workerNode := makeTestImage("worker-node-v-1-33-ubuntu24.04-cuda12.8-20260403", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8") + + // Regression guard for BREV-8794 scenario: default deploy (empty request) + // must prefer ubuntu24-cuda13 over any mk8s worker-node image. + assert.Greater(t, scoreImage(ubuntu24Cuda13, ""), scoreImage(workerNode, "")) + + // Request that happens to contain 'ubuntu24.04' must still prefer + // ubuntu24-cuda13 over worker-node (worker image name contains 'ubuntu24.04' + // as a substring, but the baseline score gap keeps it below). + assert.Greater(t, scoreImage(ubuntu24Cuda13, "ubuntu24.04"), scoreImage(workerNode, "ubuntu24.04")) +} + +func TestScoreImage_exactRequestForWorkerNodeWins(t *testing.T) { + ubuntu24Cuda13 := makeTestImage("ubuntu24.04-cuda13.0.0.2.673", "ubuntu24.04-cuda13.0") + workerNode := makeTestImage("worker-node-v-1-33-ubuntu24.04-cuda12.8-20260403", "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8") + + // If a caller explicitly asks for the worker-node family, it must win. + requested := "mk8s-worker-node-v-1-33-ubuntu24.04-cuda12.8" + assert.Greater(t, scoreImage(workerNode, requested), scoreImage(ubuntu24Cuda13, requested)) +} + +func TestScoreImage_nilSpecUsesBaseline(t *testing.T) { + img := &compute.Image{Metadata: &common.ResourceMetadata{Name: "unknown-image"}} + assert.Equal(t, imageScoreBaseline, scoreImage(img, "")) +}