Commit 2f71b89
Image cache and balloon on H100s, also temporarily stop people from using A100 (#523)

* Cache H100

* Stop people from using A100

* no cover

* no cover

* update client version
yunfeng-scale committed May 20, 2024
1 parent e207936 commit 2f71b89
Showing 8 changed files with 85 additions and 3 deletions.
50 changes: 50 additions & 0 deletions charts/model-engine/templates/balloon_h100_deployment.yaml
@@ -0,0 +1,50 @@
{{- if not .Values.serviceIdentifier }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Chart.Name }}-balloon-h100
  labels:
    team: infra
    product: common-warm-nodes
spec:
  replicas: {{ .Values.replicaCount.balloonH100 }}
  selector:
    matchLabels:
      app: {{ .Chart.Name }}-balloon-h100
      version: v1
  template:
    metadata:
      labels:
        app: {{ .Chart.Name }}-balloon-h100
        product: common-warm-nodes
        team: infra
        env: {{ .Values.context }}
        version: v1
      annotations:
        sidecar.istio.io/inject: "false"
    spec:
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
        {{- with .Values.balloonNodeSelector }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - image: public.ecr.aws/ubuntu/ubuntu:latest
          imagePullPolicy: IfNotPresent
          name: main
          resources:
            limits:
              memory: 28Gi
              nvidia.com/gpu: 1
              cpu: 4
          command:
            - /bin/bash
            - -c
            - "while true; do sleep 30; done"
      terminationGracePeriodSeconds: 0
      priorityClassName: {{ .Chart.Name }}-low-priority
{{- end }}
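For context: these "balloon" pods do no work (an Ubuntu image sleeping in a loop) but each hold one GPU at the chart's low-priority class (note the `{{ .Chart.Name }}-low-priority` priorityClassName), so the cluster keeps H100 nodes warm; when a real endpoint pod needs the GPU, the scheduler preempts the balloon. A quick way to watch that happen, sketched with the official kubernetes Python client — the namespace and the release name "model-engine" are assumptions, not taken from this commit:

from kubernetes import client, config

# Assumptions: kubeconfig access and a release named "model-engine", so the
# balloon pods carry the label app=model-engine-balloon-h100 (per the
# template above). Adjust the namespace and label for your install.
config.load_kube_config()
v1 = client.CoreV1Api()
pods = v1.list_namespaced_pod(
    namespace="default",
    label_selector="app=model-engine-balloon-h100",
)
for pod in pods.items:
    # A balloon stuck in Pending usually means a real workload took the node.
    print(pod.metadata.name, pod.status.phase)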
1 change: 1 addition & 0 deletions charts/model-engine/values_circleci.yaml
@@ -8,6 +8,7 @@ replicaCount:
  balloonA100: 0
  balloonCpu: 0
  balloonT4: 0
  balloonH100: 0

# tag needs to be set dynamically every time. Usually it is set to the SHA1 hash of the git
# commit from which the image was built.
23 changes: 23 additions & 0 deletions charts/model-engine/values_sample.yaml
@@ -81,6 +81,8 @@ replicaCount:
  balloonCpu: 0
  # balloonT4 is a low priority pod deployment for T4 GPU nodes
  balloonT4: 0
  # balloonH100 is a low priority pod deployment for H100 GPU nodes
  balloonH100: 0

# autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
autoscaling:
@@ -254,6 +256,27 @@ imageCache:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-mig-1g-20gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-1g-20gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-mig-3g-40gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-3g-40gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"

# celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
celeryBrokerType: sqs
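One naming wrinkle worth flagging: the node labels above use a `-mig-1g-20gb` style, while the client-side `gpu_type` strings added further down drop the `mig` segment. A small sketch of the correspondence as inferred from this diff — the mapping is an editorial assumption, not code from the repo:

# Editorial sketch: relates the client-facing gpu_type strings (see the
# llmengine/model.py docstrings below) to the chart's node accelerator
# labels. Inferred from this diff, not taken from the codebase.
GPU_TYPE_TO_NODE_LABEL = {
    "nvidia-hopper-h100": "nvidia-hopper-h100",
    "nvidia-hopper-h100-1g20gb": "nvidia-hopper-h100-mig-1g-20gb",
    "nvidia-hopper-h100-3g40gb": "nvidia-hopper-h100-mig-3g-40gb",
}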
2 changes: 1 addition & 1 deletion clients/python/llmengine/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.0b32"
__version__ = "0.0.0b33"

import os
from typing import Sequence
4 changes: 4 additions & 0 deletions clients/python/llmengine/model.py
@@ -149,6 +149,8 @@ def create(
            - ``nvidia-ampere-a100``
            - ``nvidia-ampere-a100e``
            - ``nvidia-hopper-h100``
            - ``nvidia-hopper-h100-1g20gb``
            - ``nvidia-hopper-h100-3g40gb``
        high_priority (`Optional[bool]`):
            Either ``True`` or ``False``. Enabling this will allow the created
@@ -533,6 +535,8 @@ def update(
            - ``nvidia-ampere-a100``
            - ``nvidia-ampere-a100e``
            - ``nvidia-hopper-h100``
            - ``nvidia-hopper-h100-1g20gb``
            - ``nvidia-hopper-h100-3g40gb``
        high_priority (`Optional[bool]`):
            Either ``True`` or ``False``. Enabling this will allow the created
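With the new strings documented, requesting an H100 (or a MIG slice) from the client looks roughly like this — a hedged sketch where the model name, image tag, and omitted parameters are illustrative assumptions, not values from this diff:

from llmengine import Model

# Illustrative values only; Model.create accepts many more options
# (shards, workers, cpus, memory, etc.) than shown here.
response = Model.create(
    name="llama-2-7b-h100",
    model="llama-2-7b",
    inference_framework_image_tag="latest",  # pin a concrete tag in practice
    gpus=1,
    gpu_type="nvidia-hopper-h100",  # or "nvidia-hopper-h100-1g20gb" / "nvidia-hopper-h100-3g40gb"
)
print(response)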
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scale-llm-engine"
version = "0.0.0.beta32"
version = "0.0.0.beta33"
description = "Scale LLM Engine Python client"
license = "Apache-2.0"
authors = ["Phil Chen <[email protected]>"]
2 changes: 1 addition & 1 deletion clients/python/setup.py
@@ -3,7 +3,7 @@
setup(
    name="scale-llm-engine",
    python_requires=">=3.7",
    version="0.0.0.beta33",
    packages=find_packages(),
    package_data={"llmengine": ["py.typed"]},
)
@@ -879,6 +879,10 @@ async def execute(
            max_workers=request.max_workers,
            endpoint_type=request.endpoint_type,
        )
        if request.gpu_type == GpuType.NVIDIA_AMPERE_A100E:  # pragma: no cover
            raise ObjectHasInvalidValueException(
                "We have migrated A100 usage to H100. Please request for H100 instead!"
            )
        if request.labels is None:
            raise EndpointLabelsException("Endpoint labels cannot be None!")
        validate_labels(request.labels)
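The guard above is small enough to reproduce standalone. A minimal self-contained sketch of its behavior, with stand-in GpuType and exception classes (the real ones live in model_engine_server's domain layer):

from enum import Enum

# Stand-ins for the server's real types, for illustration only.
class GpuType(str, Enum):
    NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"
    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"

class ObjectHasInvalidValueException(Exception):
    pass

def check_gpu_type(gpu_type: GpuType) -> None:
    # Mirrors the new check in execute(): A100E requests fail fast.
    if gpu_type == GpuType.NVIDIA_AMPERE_A100E:
        raise ObjectHasInvalidValueException(
            "We have migrated A100 usage to H100. Please request for H100 instead!"
        )

check_gpu_type(GpuType.NVIDIA_HOPPER_H100)  # accepted
try:
    check_gpu_type(GpuType.NVIDIA_AMPERE_A100E)
except ObjectHasInvalidValueException as exc:
    print(exc)  # rejected with the migration message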
