-
Notifications
You must be signed in to change notification settings - Fork 50
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Generalize SQS region #355
Conversation
@@ -496,9 +496,9 @@ def get_endpoint_resource_arguments_from_request( | |||
|
|||
change_cause_message = ( | |||
f"Deployment at {datetime.utcnow()} UTC. " | |||
f"Using deployment constructed from model bundle ID: {model_bundle.id}, " |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: why does this need to change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With the colons, this was breaking the YAML file formatting. I'm not sure why this only happened in fed environments, but this was the error:
{"message": "Could not load yaml string: apiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: launch-endpoint-id-end-ck2vn71afno003spo0o0\n namespace: scale-deploy\n labels:\n user_id: 5f31ab8b46106def94fabe14\n team: federal\n product: launch\n created_by: 5f31ab8b46106def94fabe14\n owner: 5f31ab8b46106def94fabe14\n env: scalegov-launch\n managed-by: model-engine\n use_scale_launch_endpoint_network_policy: \"true\"\n tags.datadoghq.com/env: scalegov-launch\n tags.datadoghq.com/version: latest\n tags.datadoghq.com/service: endpoint-test_docker_bundle-6\n endpoint_id: end_ck2vn71afno003spo0o0\n endpoint_name: endpoint-test_docker_bundle-6\n annotations:\n celery.scaleml.autoscaler/queue: launch-endpoint-id-end_ck2vn71afno003spo0o0\n celery.scaleml.autoscaler/broker: sqs-message-broker-master\n celery.scaleml.autoscaler/taskVisibility: \"VISIBILITY_24H\"\n celery.scaleml.autoscaler/perWorker: \"1\"\n celery.scaleml.autoscaler/minWorkers: \"0\"\n celery.scaleml.autoscaler/maxWorkers: \"1\"\nspec:\n strategy:\n type: RollingUpdate\n rollingUpdate:\n maxSurge: 1\n maxUnavailable: 0\n replicas: 0\n selector:\n matchLabels:\n app: launch-endpoint-id-end-ck2vn71afno003spo0o0\n version: v1\n template:\n metadata:\n labels:\n app: launch-endpoint-id-end-ck2vn71afno003spo0o0\n user_id: 5f31ab8b46106def94fabe14\n team: federal\n product: launch\n created_by: 5f31ab8b46106def94fabe14\n owner: 5f31ab8b46106def94fabe14\n env: scalegov-launch\n managed-by: model-engine\n use_scale_launch_endpoint_network_policy: \"true\"\n tags.datadoghq.com/env: scalegov-launch\n tags.datadoghq.com/version: latest\n tags.datadoghq.com/service: endpoint-test_docker_bundle-6\n endpoint_id: end_ck2vn71afno003spo0o0\n endpoint_name: endpoint-test_docker_bundle-6\n sidecar.istio.io/inject: \"false\" # TODO: switch to scuttle\n version: v1\n annotations:\n ad.datadoghq.com/main.logs: '[{\"service\": \"endpoint-test_docker_bundle-6\", \"source\": \"python\"}]'\n kubernetes.io/change-cause: 
\"Deployment at 2023-09-16 19:01:48.672892 UTC. Using deployment constructed from model bundle ID: bun_ck2it5fr1n4003jcvah0, model bundle name: test_docker_bundle, endpoint ID: end_ck2vn71afno003spo0o0\"\n spec:\n affinity:\n podAffinity:\n preferredDuringSchedulingIgnoredDuringExecution:\n - weight: 1\n podAffinityTerm:\n labelSelector:\n matchExpressions:\n - key: app\n operator: In\n values:\n - launch-endpoint-id-end-ck2vn71afno003spo0o0\n topologyKey: kubernetes.io/hostname\n - weight: 100\n podAffinityTerm:\n labelSelector:\n matchExpressions:\n - key: c0513b6bd0bcb817cb31730726a297c6\n operator: In\n values:\n - \"True\"\n topologyKey: kubernetes.io/hostname\n terminationGracePeriodSeconds: 600\n serviceAccount: launch\n k8s.amazonaws.com/accelerator: nvidia-ampere-a10\n tolerations:\n - key: \"nvidia.com/gpu\"\n operator: \"Exists\"\n effect: \"NoSchedule\"\n priorityClassName: model-engine-default-priority\n containers:\n - name: celery-forwarder\n image: 871961284711.dkr.ecr.us-gov-west-1.amazonaws.com/launch/model-engine:latest\n imagePullPolicy: IfNotPresent\n command:\n - /usr/bin/dumb-init\n - --\n - ddtrace-run\n - python\n - -m\n - model_engine_server.inference.forwarding.celery_forwarder\n - --config\n - /workspace/model-engine/model_engine_server/inference/configs/service--forwarder.yaml\n - --queue\n - \"launch-endpoint-id-end_ck2vn71afno003spo0o0\"\n - --task-visibility\n - \"VISIBILITY_24H\"\n - --set\n - \"forwarder.model.args.predict_route=/predict\"\n - --set\n - \"forwarder.model.args.healthcheck_route=/readyz\"\n - --sqs-url\n - \"https://sqs.us-gov-west-1.amazonaws.com/871961284711/launch-endpoint-id-end_ck2vn71afno003spo0o0\"\n - --num-workers\n - \"1\"\n env:\n - name: DATADOG_TRACE_ENABLED\n value: \"false\"\n - name: DD_SERVICE\n value: \"endpoint-test_docker_bundle-6\"\n - name: DD_ENV\n value: scalegov-launch\n - name: DD_VERSION\n value: \"latest\"\n - name: DD_AGENT_HOST\n valueFrom:\n fieldRef:\n fieldPath: status.hostIP\n - 
name: AWS_PROFILE\n value: \"ml-worker\"\n - name: RESULTS_S3_BUCKET\n value: \"scale-ml\"\n - name: BASE_PATH\n value: \"/workspace\"\n - name: ML_INFRA_SERVICES_CONFIG_PATH\n value: \"/workspace/model-engine/model_engine_server/core/configs/config.yaml\"\n - name: CELERY_QUEUE\n value: \"launch-endpoint-id-end_ck2vn71afno003spo0o0\"\n - name: CELERY_TASK_VISIBILITY\n value: \"VISIBILITY_24H\"\n - name: S3_BUCKET\n value: \"scale-ml\"\n resources:\n requests:\n cpu: 0.1\n memory: \"100M\"\n ephemeral-storage: \"100M\"\n limits:\n cpu: 0.5\n memory: 1Gi\n ephemeral-storage: 1G\n \n \n volumeMounts:\n - name: config-volume\n mountPath: /root/.aws/config\n subPath: config\n - name: user-config\n mountPath: /workspace/user_config\n subPath: raw_data\n - name: endpoint-config\n mountPath: /workspace/endpoint_config\n subPath: raw_data\n - name: infra-service-config-volume\n mountPath: /workspace/model-engine/model_engine_server/core/configs\n - name: main\n securityContext:\n capabilities:\n drop:\n - all\n image: https://scale-ml.s3.us-gov-west-1.amazonaws.com/tmp/test_launch_docker_images/testEndpoint.tgz:test1\n imagePullPolicy: IfNotPresent\n command: ['dumb-init', '--', 'uvicorn', 'path.to.your.server.file:app', '--port', '5005', '--host', '::']\n env: []\n readinessProbe:\n httpGet:\n path: /readyz\n port: 5005\n initialDelaySeconds: 120\n periodSeconds: 5\n resources:\n requests:\n cpu: 4\n memory: 30Gi\n ephemeral-storage: \"40Gi\"\n limits:\n nvidia.com/gpu: 1\n cpu: 4\n memory: 30Gi\n ephemeral-storage: \"40Gi\"\n volumeMounts:\n - name: config-volume\n mountPath: /root/.aws/config\n subPath: config\n - mountPath: /dev/shm\n name: dshm\n - name: infra-service-config-volume\n mountPath: /infra-config\n # LIRA: For compatibility with runnable image converted from artifactlike bundle\n - name: config-volume\n mountPath: /home/scalelaunch/.aws/config\n subPath: config\n - name: user-config\n mountPath: /app/user_config\n subPath: raw_data\n - name: 
endpoint-config\n mountPath: /app/endpoint_config\n subPath: raw_data\n ports:\n - containerPort: 5005\n name: http\n # Workaround for https://github.com/kubernetes-sigs/external-dns/pull/1185\n securityContext:\n fsGroup: 65534\n volumes:\n - name: config-volume\n configMap:\n name: default-config \n - name: user-config\n configMap:\n name: launch-endpoint-id-end-ck2vn71afno003spo0o0\n - name: endpoint-config\n configMap:\n name: launch-endpoint-id-end-ck2vn71afno003spo0o0-endpoint-config\n - name: dshm\n emptyDir:\n medium: Memory\n - name: infra-service-config-volume\n configMap:\n name: model-engine-service-config\n items:\n - key: infra_service_config\n path: config.yaml", "time": "2023-09-16T19:01:48.684275", "exc_info": "Traceback (most recent call last):\n File \"/workspace/model-engine/model_engine_server/infra/gateways/resources/k8s_endpoint_resource_delegate.py\", line 198, in load_k8s_yaml\n yaml_obj = yaml.safe_load(yaml_str)\n File \"/usr/local/lib/python3.8/site-packages/yaml/__init__.py\", line 125, in safe_load\n return load(stream, SafeLoader)\n File \"/usr/local/lib/python3.8/site-packages/yaml/__init__.py\", line 81, in load\n return loader.get_single_data()\n File \"/usr/local/lib/python3.8/site-packages/yaml/constructor.py\", line 49, in get_single_data\n node = self.get_single_node()\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 36, in get_single_node\n document = self.compose_document()\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 55, in compose_document\n node = self.compose_node(None, None)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 84, in compose_node\n node = self.compose_mapping_node(anchor)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 133, in compose_mapping_node\n item_value = self.compose_node(node, item_key)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 84, in compose_node\n node = 
self.compose_mapping_node(anchor)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 133, in compose_mapping_node\n item_value = self.compose_node(node, item_key)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 84, in compose_node\n node = self.compose_mapping_node(anchor)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 133, in compose_mapping_node\n item_value = self.compose_node(node, item_key)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 84, in compose_node\n node = self.compose_mapping_node(anchor)\n File \"/usr/local/lib/python3.8/site-packages/yaml/composer.py\", line 127, in compose_mapping_node\n while not self.check_event(MappingEndEvent):\n File \"/usr/local/lib/python3.8/site-packages/yaml/parser.py\", line 98, in check_event\n self.current_event = self.state()\n File \"/usr/local/lib/python3.8/site-packages/yaml/parser.py\", line 428, in parse_block_mapping_key\n if self.check_token(KeyToken):\n File \"/usr/local/lib/python3.8/site-packages/yaml/scanner.py\", line 116, in check_token\n self.fetch_more_tokens()\n File \"/usr/local/lib/python3.8/site-packages/yaml/scanner.py\", line 223, in fetch_more_tokens\n return self.fetch_value()\n File \"/usr/local/lib/python3.8/site-packages/yaml/scanner.py\", line 577, in fetch_value\n raise ScannerError(None, None,\nyaml.scanner.ScannerError: mapping values are not allowed here\n in \"<unicode string>\", line 84, column 38:\n ... k8s.amazonaws.com/accelerator: nvidia-ampere-a10\n ^", "level": "ERROR", "name": "k8s_endpoint_resource_delegate", "lineno": 200, "pathname": "/workspace/model-engine/model_engine_server/infra/gateways/resources/k8s_endpoint_resource_delegate.py", "dd.trace_id": 14096351640769680099, "dd.span_id": 17176190717685371483, "dd.service": "model-engine-endpoint-builder", "dd.env": "scalegov-launch", "dd.version": "latest"}
@@ -238,7 +238,9 @@ def add_datadog_env_to_main_container(deployment_template: Dict[str, Any]) -> No | |||
[ | |||
{ | |||
"name": "DD_TRACE_ENABLED", | |||
"value": "false" if CIRCLECI else "true", | |||
"value": "false" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: is this related to a different pr?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We missed adding this in 2603b9b; happy to create a separate PR if that's cleaner.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks! It'd be great if you put this change into a separate follow-up PR!
can you run the pre-commit hooks? I think unit tests are failing due to an |
Generalize SQS region so it's not hardcoded to us-west-2.