Skip to content

Commit

Permalink
feat: Add E2E Tests for Phi-3 and Tuning (#476)
Browse files Browse the repository at this point in the history
**Reason for Change**:
1. Add E2E tests for Phi-3 and Tuning
2. Uses temporary ACR for uploading e2e results
3. Allows a public preset to be used in private accessmode with private
ACR path
4. Adds Validation check for count > 1 for Tuning

---------

Signed-off-by: Ishaan Sehgal <[email protected]>
  • Loading branch information
ishaansehgal99 committed Jul 8, 2024
1 parent 42bbe92 commit e5e61a7
Show file tree
Hide file tree
Showing 7 changed files with 337 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/preset-image-build-1ES.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ jobs:
fail-fast: false
matrix:
model: ${{fromJson(needs.determine-models.outputs.matrix)}}
max-parallel: 1
max-parallel: 10
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/preset-image-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ jobs:
fail-fast: false
matrix:
model: ${{fromJson(needs.determine-models.outputs.matrix)}}
max-parallel: 1
max-parallel: 10
steps:
- name: Checkout
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
Expand Down
2 changes: 1 addition & 1 deletion examples/fine-tuning/kaito_workspace_tuning_phi_3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ tuning:
urls:
- "https://huggingface.co/datasets/philschmid/dolly-15k-oai-style/resolve/main/data/train-00000-of-00001-54e3756291ca09c6.parquet?download=true"
output:
image: "ACR_REPO_HERE.azurecr.io/ADAPTER_HERE:0.0.1" # Tuning Output ACR Path
image: "ACR_REPO_HERE.azurecr.io/IMAGE_NAME_HERE:0.0.1" # Tuning Output ACR Path
imagePushSecret: ACR_REGISTRY_SECRET_HERE
20 changes: 14 additions & 6 deletions pkg/controllers/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,17 @@ func (c *WorkspaceReconciler) ensureNodePlugins(ctx context.Context, wObj *kaito
}
}

// getPresetName returns the preset name from wObj if available
func getPresetName(wObj *kaitov1alpha1.Workspace) string {
if wObj.Inference != nil && wObj.Inference.Preset != nil {
return string(wObj.Inference.Preset.Name)
}
if wObj.Tuning != nil && wObj.Tuning.Preset != nil {
return string(wObj.Tuning.Preset.Name)
}
return ""
}

func (c *WorkspaceReconciler) ensureService(ctx context.Context, wObj *kaitov1alpha1.Workspace) error {
serviceType := corev1.ServiceTypeClusterIP
wAnnotation := wObj.GetAnnotations()
Expand All @@ -491,18 +502,15 @@ func (c *WorkspaceReconciler) ensureService(ctx context.Context, wObj *kaitov1al
return nil
}

if wObj.Inference != nil && wObj.Inference.Preset != nil {
presetName := string(wObj.Inference.Preset.Name)
if presetName := getPresetName(wObj); presetName != "" {
model := plugin.KaitoModelRegister.MustGet(presetName)
serviceObj := resources.GenerateServiceManifest(ctx, wObj, serviceType, model.SupportDistributedInference())
err = resources.CreateResource(ctx, serviceObj, c.Client)
if err != nil {
if err := resources.CreateResource(ctx, serviceObj, c.Client); err != nil {
return err
}
if model.SupportDistributedInference() {
headlessService := resources.GenerateHeadlessServiceManifest(ctx, wObj)
err = resources.CreateResource(ctx, headlessService, c.Client)
if err != nil {
if err := resources.CreateResource(ctx, headlessService, c.Client); err != nil {
return err
}
}
Expand Down
2 changes: 1 addition & 1 deletion presets/models/falcon/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ func (*falcon7b) GetTuningParameters() *model.PresetParam {
ModelFamilyName: "Falcon",
ImageAccessMode: string(kaitov1alpha1.ModelImageAccessModePublic),
DiskStorageRequirement: "50Gi",
GPUCountRequirement: "2",
GPUCountRequirement: "1",
TotalGPUMemoryRequirement: "16Gi",
PerGPUMemoryRequirement: "16Gi",
TorchRunParams: tuning.DefaultAccelerateParams,
Expand Down
222 changes: 222 additions & 0 deletions test/e2e/preset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,19 @@ package e2e

import (
"fmt"
"log"
"math/rand"
"os"
"path/filepath"
"strconv"
"strings"
"time"

batchv1 "k8s.io/api/batch/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
kaitov1alpha1 "github.com/azure/kaito/api/v1alpha1"
"github.com/azure/kaito/test/e2e/utils"
Expand Down Expand Up @@ -163,6 +171,65 @@ func createCustomWorkspaceWithPresetCustomMode(imageName string, numOfNode int)
return workspaceObj
}

func createPhi3WorkspaceWithPresetPublicMode(numOfNode int) *kaitov1alpha1.Workspace {
workspaceObj := &kaitov1alpha1.Workspace{}
By("Creating a workspace CR with Phi-3-mini-128k-instruct preset public mode", func() {
uniqueID := fmt.Sprint("preset-", rand.Intn(1000))
workspaceObj = utils.GenerateInferenceWorkspaceManifest(uniqueID, namespaceName, "",
numOfNode, "Standard_NC6s_v3", &metav1.LabelSelector{
MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-phi-3-mini-128k-instruct"},
}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, nil, nil, nil)

createAndValidateWorkspace(workspaceObj)
})
return workspaceObj
}

func createCustomTuningConfigMapForE2E() *v1.ConfigMap {
configMap := utils.GenerateE2ETuningConfigMapManifest(namespaceName)

By("Creating a workspace Tuning CR with Falcon-7B preset private mode", func() {
createAndValidateConfigMap(configMap)
})

return configMap
}

func createAndValidateConfigMap(configMap *v1.ConfigMap) {
By("Creating ConfigMap", func() {
Eventually(func() error {
return TestingCluster.KubeClient.Create(ctx, configMap, &client.CreateOptions{})
}, utils.PollTimeout, utils.PollInterval).
Should(Succeed(), "Failed to create ConfigMap %s", configMap.Name)

By("Validating ConfigMap creation", func() {
err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: configMap.Namespace,
Name: configMap.Name,
}, configMap, &client.GetOptions{})
Expect(err).NotTo(HaveOccurred())
})
})
}

func createPhi3TuningWorkspaceWithPresetPublicMode(configMapName string, numOfNode int) (*kaitov1alpha1.Workspace, string) {
workspaceObj := &kaitov1alpha1.Workspace{}
e2eOutputImageName := fmt.Sprintf("adapter-%s-e2e-test", PresetPhi3Mini128kModel)
e2eOutputImageTag := utils.GenerateRandomString()
var uniqueID string
By("Creating a workspace Tuning CR with Phi-3 preset private mode", func() {
uniqueID = fmt.Sprint("preset-", rand.Intn(1000))
outputRegistryUrl := fmt.Sprintf("%s.azurecr.io/%s:%s", azureClusterName, e2eOutputImageName, e2eOutputImageTag)
workspaceObj = utils.GenerateE2ETuningWorkspaceManifest(uniqueID, namespaceName, "",
outputRegistryUrl, numOfNode, "Standard_NC6s_v3", &metav1.LabelSelector{
MatchLabels: map[string]string{"kaito-workspace": "public-preset-e2e-test-tuning-falcon"},
}, nil, PresetPhi3Mini128kModel, kaitov1alpha1.ModelImageAccessModePublic, []string{aiModelsRegistrySecret}, configMapName)

createAndValidateWorkspace(workspaceObj)
})
return workspaceObj, uniqueID
}

func createAndValidateWorkspace(workspaceObj *kaitov1alpha1.Workspace) {
By("Creating workspace", func() {
Eventually(func() error {
Expand All @@ -180,6 +247,31 @@ func createAndValidateWorkspace(workspaceObj *kaitov1alpha1.Workspace) {
})
}

func copySecretToNamespace(secretName, targetNamespace string) error {
originalNamespace := "default"
originalSecret := &v1.Secret{}

// Fetch the original secret from the default namespace
err := TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: originalNamespace,
Name: secretName,
}, originalSecret)
if err != nil {
return fmt.Errorf("failed to get secret %s in namespace %s: %v", secretName, originalNamespace, err)
}

// Create a copy of the secret for the target namespace
newSecret := utils.CopySecret(originalSecret, targetNamespace)

// Create the new secret in the target namespace
err = TestingCluster.KubeClient.Create(ctx, newSecret)
if err != nil {
return fmt.Errorf("failed to create secret %s in namespace %s: %v", secretName, targetNamespace, err)
}

return nil
}

func getAllValidMachines(workspaceObj *kaitov1alpha1.Workspace) (*v1alpha5.MachineList, error) {
machineList := &v1alpha5.MachineList{}
ls := labels.Set{
Expand Down Expand Up @@ -320,6 +412,91 @@ func validateInferenceResource(workspaceObj *kaitov1alpha1.Workspace, expectedRe
})
}

// Logic to validate tuning deployment
func validateTuningResource(workspaceObj *kaitov1alpha1.Workspace) {
By("Checking the tuning resource", func() {
Eventually(func() bool {
var err error
var jobFailed, jobSucceeded int32

job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
Name: workspaceObj.Name,
Namespace: workspaceObj.Namespace,
},
}
err = TestingCluster.KubeClient.Get(ctx, client.ObjectKey{
Namespace: workspaceObj.Namespace,
Name: workspaceObj.Name,
}, job)

if err != nil {
GinkgoWriter.Printf("Error fetching resource: %v\n", err)
return false
}

jobFailed = job.Status.Failed
jobSucceeded = job.Status.Succeeded

if jobFailed > 0 {
GinkgoWriter.Printf("Job '%s' is in a failed state.\n", workspaceObj.Name)
return false
}

if jobSucceeded > 0 {
return true
}

return false
}, 30*time.Minute, utils.PollInterval).Should(BeTrue(), "Failed to wait for Tuning resource to be ready")
})
}

func validateACRTuningResultsUploaded(workspaceObj *kaitov1alpha1.Workspace, jobName string) {
var config *rest.Config
var err error

if os.Getenv("KUBERNETES_SERVICE_HOST") != "" && os.Getenv("KUBERNETES_SERVICE_PORT") != "" {
config, err = rest.InClusterConfig()
if err != nil {
log.Fatalf("Failed to get in-cluster config: %v", err)
}
} else {
// Use kubeconfig file for local development
kubeconfig := filepath.Join(os.Getenv("HOME"), ".kube", "config")
config, err = clientcmd.BuildConfigFromFlags("", kubeconfig)
if err != nil {
log.Fatalf("Failed to load kubeconfig: %v", err)
}
}

coreClient, err := kubernetes.NewForConfig(config)
if err != nil {
log.Fatalf("Failed to create core client: %v", err)
}
namespace := workspaceObj.Namespace
podName, err := utils.GetPodNameForJob(coreClient, namespace, jobName)
if err != nil {
log.Fatalf("Failed to get pod name for job %s: %v", jobName, err)
}

for {
logs, err := utils.GetPodLogs(coreClient, namespace, podName, "docker-sidecar")
if err != nil {
log.Printf("Failed to get logs from pod %s: %v", podName, err)
time.Sleep(10 * time.Second)
continue
}

if strings.Contains(logs, "Upload complete") {
fmt.Println("Upload complete")
break
}

time.Sleep(10 * time.Second) // Poll every 10 seconds
}
}

// Logic to validate workspace readiness
func validateWorkspaceReadiness(workspaceObj *kaitov1alpha1.Workspace) {
By("Checking the workspace status is ready", func() {
Expand Down Expand Up @@ -383,6 +560,7 @@ var aiModelsRegistry string
var aiModelsRegistrySecret string
var supportedModelsYamlPath string
var modelInfo map[string]string
var azureClusterName string

var _ = Describe("Workspace Preset", func() {
BeforeEach(func() {
Expand Down Expand Up @@ -515,4 +693,48 @@ var _ = Describe("Workspace Preset", func() {
validateWorkspaceReadiness(workspaceObj)
})

It("should create a Phi-3-mini-128k-instruct workspace with preset public mode successfully", func() {
numOfNode := 1
workspaceObj := createPhi3WorkspaceWithPresetPublicMode(numOfNode)

defer cleanupResources(workspaceObj)
time.Sleep(30 * time.Second)

validateMachineCreation(workspaceObj, numOfNode)
validateResourceStatus(workspaceObj)

time.Sleep(30 * time.Second)

validateAssociatedService(workspaceObj)

validateInferenceResource(workspaceObj, int32(numOfNode), false)

validateWorkspaceReadiness(workspaceObj)
})

It("should create a workspace for tuning successfully", func() {
numOfNode := 1
err := copySecretToNamespace(aiModelsRegistrySecret, namespaceName)
if err != nil {
log.Fatalf("Error copying secret: %v", err)
}
configMap := createCustomTuningConfigMapForE2E()
workspaceObj, jobName := createPhi3TuningWorkspaceWithPresetPublicMode(configMap.Name, numOfNode)

defer cleanupResources(workspaceObj)
time.Sleep(30 * time.Second)

validateMachineCreation(workspaceObj, numOfNode)
validateResourceStatus(workspaceObj)

time.Sleep(30 * time.Second)

// TODO: Need to check if tuning job uploaded to ACR
validateTuningResource(workspaceObj)

validateACRTuningResultsUploaded(workspaceObj, jobName)

validateWorkspaceReadiness(workspaceObj)
})

})
Loading

0 comments on commit e5e61a7

Please sign in to comment.