Commit 2f71b89
Image cache and balloon on H100s, also temporarily stop people from using A100 (#523)

* Cache H100

* Stop people from using A100

* no cover

* no cover

* update client version
yunfeng-scale committed May 20, 2024
1 parent e207936 commit 2f71b89
Showing 8 changed files with 85 additions and 3 deletions.
50 changes: 50 additions & 0 deletions charts/model-engine/templates/balloon_h100_deployment.yaml
@@ -0,0 +1,50 @@
{{- if not .Values.serviceIdentifier }}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ .Chart.Name }}-balloon-h100
  labels:
    team: infra
    product: common-warm-nodes
spec:
  replicas: {{ .Values.replicaCount.balloonH100 }}
  selector:
    matchLabels:
      app: {{ .Chart.Name }}-balloon-h100
      version: v1
  template:
    metadata:
      labels:
        app: {{ .Chart.Name }}-balloon-h100
        product: common-warm-nodes
        team: infra
        env: {{ .Values.context }}
        version: v1
      annotations:
        sidecar.istio.io/inject: "false"
    spec:
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
        {{- with .Values.balloonNodeSelector }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
      containers:
        - image: public.ecr.aws/ubuntu/ubuntu:latest
          imagePullPolicy: IfNotPresent
          name: main
          resources:
            limits:
              memory: 28Gi
              nvidia.com/gpu: 1
              cpu: 4
          command:
            - /bin/bash
            - -c
            - "while true; do sleep 30; done"
      terminationGracePeriodSeconds: 0
      priorityClassName: {{ .Chart.Name }}-low-priority
{{- end }}
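For context: these "balloon" pods do no work (an Ubuntu image sleeping in a loop) but each hold one GPU at the chart's low-priority class (note the `{{ .Chart.Name }}-low-priority` priorityClassName), so the cluster keeps H100 nodes warm; when a real endpoint pod needs the GPU, the scheduler preempts the balloon. A quick way to watch that happen, sketched with the official kubernetes Python client — the namespace and the release name "model-engine" are assumptions, not taken from this commit:

from kubernetes import client, config

# Assumptions: kubeconfig access and a release named "model-engine", so the
# balloon pods carry the label app=model-engine-balloon-h100 (per the
# template above). Adjust the namespace and label for your install.
config.load_kube_config()
v1 = client.CoreV1Api()
pods = v1.list_namespaced_pod(
    namespace="default",
    label_selector="app=model-engine-balloon-h100",
)
for pod in pods.items:
    # A balloon stuck in Pending usually means a real workload took the node.
    print(pod.metadata.name, pod.status.phase)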
1 change: 1 addition & 0 deletions charts/model-engine/values_circleci.yaml
@@ -8,6 +8,7 @@ replicaCount:
  balloonA100: 0
  balloonCpu: 0
  balloonT4: 0
  balloonH100: 0

# tag needs to be set dynamically every time. Usually it is set to the SHA1 hash of the git
# commit from which the image was built.
23 changes: 23 additions & 0 deletions charts/model-engine/values_sample.yaml
@@ -81,6 +81,8 @@ replicaCount:
  balloonCpu: 0
  # balloonT4 is a low priority pod deployment for T4 GPU nodes
  balloonT4: 0
  # balloonH100 is a low priority pod deployment for H100 GPU nodes
  balloonH100: 0

# autoscaling is the autoscaling configuration for LLM Engine server deployments (e.g. gateway, cache, and builder deployments)
autoscaling:
@@ -254,6 +256,27 @@ imageCache:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-mig-1g-20gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-1g-20gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"
    - name: h100-mig-3g-40gb
      nodeSelector:
        k8s.amazonaws.com/accelerator: nvidia-hopper-h100-mig-3g-40gb
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Exists"
          effect: "NoSchedule"

# celeryBrokerType specifies the celery broker type for async endpoints, either "sqs" or "elasticache"
celeryBrokerType: sqs
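One naming wrinkle worth flagging: the node labels above use a `-mig-1g-20gb` style, while the client-side `gpu_type` strings added further down drop the `mig` segment. A small sketch of the correspondence as inferred from this diff — the mapping is an editorial assumption, not code from the repo:

# Editorial sketch: relates the client-facing gpu_type strings (see the
# llmengine/model.py docstrings below) to the chart's node accelerator
# labels. Inferred from this diff, not taken from the codebase.
GPU_TYPE_TO_NODE_LABEL = {
    "nvidia-hopper-h100": "nvidia-hopper-h100",
    "nvidia-hopper-h100-1g20gb": "nvidia-hopper-h100-mig-1g-20gb",
    "nvidia-hopper-h100-3g40gb": "nvidia-hopper-h100-mig-3g-40gb",
}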
2 changes: 1 addition & 1 deletion clients/python/llmengine/__init__.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.0b32"
__version__ = "0.0.0b33"

import os
from typing import Sequence
4 changes: 4 additions & 0 deletions clients/python/llmengine/model.py
@@ -149,6 +149,8 @@ def create(
            - ``nvidia-ampere-a100``
            - ``nvidia-ampere-a100e``
            - ``nvidia-hopper-h100``
            - ``nvidia-hopper-h100-1g20gb``
            - ``nvidia-hopper-h100-3g40gb``
        high_priority (`Optional[bool]`):
            Either ``True`` or ``False``. Enabling this will allow the created
@@ -533,6 +535,8 @@ def update(
            - ``nvidia-ampere-a100``
            - ``nvidia-ampere-a100e``
            - ``nvidia-hopper-h100``
            - ``nvidia-hopper-h100-1g20gb``
            - ``nvidia-hopper-h100-3g40gb``
        high_priority (`Optional[bool]`):
            Either ``True`` or ``False``. Enabling this will allow the created
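With the new strings documented, requesting an H100 (or a MIG slice) from the client looks roughly like this — a hedged sketch where the model name, image tag, and omitted parameters are illustrative assumptions, not values from this diff:

from llmengine import Model

# Illustrative values only; Model.create accepts many more options
# (shards, workers, cpus, memory, etc.) than shown here.
response = Model.create(
    name="llama-2-7b-h100",
    model="llama-2-7b",
    inference_framework_image_tag="latest",  # pin a concrete tag in practice
    gpus=1,
    gpu_type="nvidia-hopper-h100",  # or "nvidia-hopper-h100-1g20gb" / "nvidia-hopper-h100-3g40gb"
)
print(response)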
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scale-llm-engine"
version = "0.0.0.beta32"
version = "0.0.0.beta33"
description = "Scale LLM Engine Python client"
license = "Apache-2.0"
authors = ["Phil Chen <[email protected]>"]
2 changes: 1 addition & 1 deletion clients/python/setup.py
@@ -3,7 +3,7 @@
setup(
    name="scale-llm-engine",
    python_requires=">=3.7",
    version="0.0.0.beta33",
    packages=find_packages(),
    package_data={"llmengine": ["py.typed"]},
)
@@ -879,6 +879,10 @@ async def execute(
            max_workers=request.max_workers,
            endpoint_type=request.endpoint_type,
        )
        if request.gpu_type == GpuType.NVIDIA_AMPERE_A100E:  # pragma: no cover
            raise ObjectHasInvalidValueException(
                "We have migrated A100 usage to H100. Please request for H100 instead!"
            )
        if request.labels is None:
            raise EndpointLabelsException("Endpoint labels cannot be None!")
        validate_labels(request.labels)
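The guard above is small enough to reproduce standalone. A minimal self-contained sketch of its behavior, with stand-in GpuType and exception classes (the real ones live in model_engine_server's domain layer):

from enum import Enum

# Stand-ins for the server's real types, for illustration only.
class GpuType(str, Enum):
    NVIDIA_AMPERE_A100E = "nvidia-ampere-a100e"
    NVIDIA_HOPPER_H100 = "nvidia-hopper-h100"

class ObjectHasInvalidValueException(Exception):
    pass

def check_gpu_type(gpu_type: GpuType) -> None:
    # Mirrors the new check in execute(): A100E requests fail fast.
    if gpu_type == GpuType.NVIDIA_AMPERE_A100E:
        raise ObjectHasInvalidValueException(
            "We have migrated A100 usage to H100. Please request for H100 instead!"
        )

check_gpu_type(GpuType.NVIDIA_HOPPER_H100)  # accepted
try:
    check_gpu_type(GpuType.NVIDIA_AMPERE_A100E)
except ObjectHasInvalidValueException as exc:
    print(exc)  # rejected with the migration message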
