Skip to content

Commit

Permalink
feat: Add Deployments for E2E Tests (#60)
Browse files Browse the repository at this point in the history
  • Loading branch information
ishaansehgal99 committed Oct 13, 2023
1 parent 64daef4 commit 4801723
Show file tree
Hide file tree
Showing 36 changed files with 460 additions and 267 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@ hack/tools/bin/*

# presets

pkg/presets/llama-2/weights
pkg/presets/llama-2-chat/weights
presets/llama-2/weights
presets/llama-2-chat/weights
presets/falcon/weights
4 changes: 2 additions & 2 deletions docker/presets/falcon/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ WORKDIR /workspace/huggingface
# First, copy just the requirements.txt file and install dependencies
# This is done before copying the code to utilize Docker's layer caching and
# avoid reinstalling dependencies unless the requirements file changes.
COPY pkg/presets/falcon/requirements.txt ./requirements.txt
COPY presets/falcon/requirements.txt ./requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire 'presets/falcon' folder to the working directory
COPY pkg/presets/falcon .
COPY presets/falcon .
2 changes: 1 addition & 1 deletion docker/presets/llama-2-chat/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ RUN pip install -e .
RUN pip install fastapi pydantic
RUN pip install 'uvicorn[standard]'

ADD pkg/presets/llama-2-chat /workspace/llama/llama-2-chat
ADD presets/llama-2-chat /workspace/llama/llama-2-chat
2 changes: 1 addition & 1 deletion docker/presets/llama-2/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ RUN pip install -e .
RUN pip install fastapi pydantic
RUN pip install 'uvicorn[standard]'

ADD pkg/presets/llama-2 /workspace/llama/llama-2
ADD presets/llama-2 /workspace/llama/llama-2
File renamed without changes.
File renamed without changes.
72 changes: 0 additions & 72 deletions pkg/presets/convert/llama-2-13b-chat.yaml

This file was deleted.

63 changes: 0 additions & 63 deletions pkg/presets/convert/llama-2-7b-chat.yaml

This file was deleted.

61 changes: 0 additions & 61 deletions pkg/presets/convert/llama-2-7b.yaml

This file was deleted.

37 changes: 0 additions & 37 deletions pkg/presets/convert/pod-2GPU.yaml

This file was deleted.

28 changes: 0 additions & 28 deletions pkg/presets/convert/pod.yaml

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
14 changes: 14 additions & 0 deletions presets/k8s/falcon-7b-instruct/falcon-7b-instruct-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
# Service exposing the falcon-7b-instruct StatefulSet's pod 0 on an external
# LoadBalancer IP, forwarding port 80 to the inference server's port 5000.
apiVersion: v1
kind: Service
metadata:
  name: falcon-7b-instruct
spec:
  selector:
    app: falcon
    # Pin the Service to the single pod created by the StatefulSet.
    statefulset.kubernetes.io/pod-name: falcon-7b-instruct-0
  ports:
    - protocol: TCP
      port: 80
      targetPort: 5000
  type: LoadBalancer
  # Publish the endpoint before the readiness probe passes — the readiness
  # probe in the matching StatefulSet allows a slow startup, and e2e tooling
  # presumably needs the address early. TODO(review): confirm this is intended.
  publishNotReadyAddresses: true
51 changes: 51 additions & 0 deletions presets/k8s/falcon-7b-instruct/falcon-7b-instruct-statefulset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# StatefulSet running a single replica of the falcon-7b-instruct inference
# server. Paired with the falcon-7b-instruct Service, which selects pod 0 by
# its statefulset pod-name label.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: falcon-7b-instruct
spec:
  replicas: 1
  selector:
    matchLabels:
      app: falcon
  podManagementPolicy: Parallel
  template:
    metadata:
      labels:
        app: falcon
    spec:
      containers:
        - name: falcon-container
          # REPO_HERE / TAG_HERE are placeholders substituted before apply.
          image: REPO_HERE.azurecr.io/falcon-7b-instruct:TAG_HERE
          command:
            - /bin/sh
            - -c
            - accelerate launch --config_file config.yaml --num_processes 1 --num_machines 1 --use_deepspeed --machine_rank 0 --gpu_ids all inference-api.py
          # NOTE(review): no resources.limits entry for nvidia.com/gpu is
          # declared even though the pod tolerates GPU taints and launches
          # with --gpu_ids all — confirm the node pool exposes GPUs to the
          # container without an explicit device-plugin resource request.
          livenessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 600 # 10 Min
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /healthz
              port: 5000
            initialDelaySeconds: 30
            periodSeconds: 10
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm for the container above.
        - name: dshm
          emptyDir:
            medium: Memory
      tolerations:
        - effect: NoSchedule
          key: sku
          operator: Equal
          value: gpu
        - effect: NoSchedule
          key: nvidia.com/gpu
          operator: Exists
      nodeSelector:
        # presumably the GPU node-pool name — verify against cluster setup
        pool: on7binstruct
15 changes: 15 additions & 0 deletions presets/k8s/falcon-7b/falcon-7b-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Service exposing the falcon-7b StatefulSet's pod 0 on an external
# LoadBalancer IP, forwarding port 80 to the inference server's port 5000.
apiVersion: v1
kind: Service
metadata:
  name: falcon-7b
spec:
  selector:
    app: falcon
    # Pin the Service to the single pod created by the StatefulSet.
    statefulset.kubernetes.io/pod-name: falcon-7b-0
  ports:
    - protocol: TCP
      port: 80
      targetPort: 5000
  type: LoadBalancer
  # Publish the endpoint before the readiness probe passes — presumably so
  # e2e tooling can obtain the address during slow model startup.
  # TODO(review): confirm this is intended.
  publishNotReadyAddresses: true

Loading

0 comments on commit 4801723

Please sign in to comment.