Initial commit

ewkoch · ewkoch · commit 21555e89882b · 2024-10-23T16:24:20.000-04:00
diff --git a/charts/model-engine/templates/_helpers.tpl b/charts/model-engine/templates/_helpers.tpl
@@ -345,6 +345,13 @@ env:
   - name: REDIS_AUTH_TOKEN
     value: {{ .Values.redis.auth }}
   {{- end }}
+  {{- if .Values.redis.authSecret }}
+  - name: REDIS_AUTH_TOKEN
+    valueFrom:
+      secretKeyRef:
+        name: {{ .Values.redis.authSecret | quote }}  # quoted so digit-only or boolean-looking names stay strings
+        key: auth_token
+  {{- end }}
   {{- if .Values.azure}}
   - name: AZURE_IDENTITY_NAME
     value: {{ .Values.azure.identity_name }}
diff --git a/charts/model-engine/templates/aws_config_map.yaml b/charts/model-engine/templates/aws_config_map.yaml
@@ -20,6 +20,9 @@ data:
     [profile {{ $profileName }}]
     role_arn = {{ index $annotations "eks.amazonaws.com/role-arn" }}
     web_identity_token_file = /var/run/secrets/eks.amazonaws.com/serviceaccount/token
+    [profile {{ $.Values.serviceAccount.sqsProfileName }}]
+    role_arn = {{ index $annotations "eks.amazonaws.com/role-arn" }}
+    web_identity_token_file = /var/run/secrets/eks.amazonaws.com/serviceaccount/token
 ---
 {{- end }}
 {{- end }}
diff --git a/charts/model-engine/templates/inference_framework_config.yaml b/charts/model-engine/templates/inference_framework_config.yaml
@@ -2,17 +2,18 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
   name: {{ include "modelEngine.fullname" . }}-inference-framework-latest-config
+  namespace: {{ .Release.Namespace }}
   labels:
     product: common
     team: infra
   annotations:
-    "helm.sh/hook": pre-install
+    "helm.sh/hook": pre-install,pre-upgrade
     "helm.sh/hook-weight": "-2"
 data:
   deepspeed: "latest"
   text_generation_inference: "latest"
-  vllm: "latest"
-  vllm_batch: "latest"
-  vllm_batch_v2: "latest"
+  vllm: {{ .Values.vllmTag | default "latest" | quote }}  # falls back to "latest" when vllmTag is unset
+  vllm_batch: {{ .Values.vllmTag | default "latest" | quote }}
+  vllm_batch_v2: {{ .Values.vllmTag | default "latest" | quote }}
   lightllm: "latest"
   tensorrt_llm: "latest"
diff --git a/charts/model-engine/templates/istio-virtualservice.yaml b/charts/model-engine/templates/istio-virtualservice.yaml
@@ -12,13 +12,9 @@ metadata:
   {{- end }}
 spec:
   hosts:
-  {{- range .Values.virtualservice.hostDomains }}
-    - "{{ $fullName }}.{{ . }}"
-  {{- end }}
+    - "model-engine.{{ $.Values.global.networking.internalDomain }}"
   gateways:
-  {{- range .Values.virtualservice.gateways }}
-    - {{ . | quote }}
-  {{- end }}
+    - {{ $.Values.global.networking.internalGateway | quote }}
   http:
     - route:
         - destination:
diff --git a/charts/model-engine/templates/service_account_inference.yaml b/charts/model-engine/templates/service_account_inference.yaml
diff --git a/charts/model-engine/templates/service_config_map.yaml b/charts/model-engine/templates/service_config_map.yaml
@@ -3,6 +3,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
   name: {{ include "modelEngine.fullname" . }}-service-config
+  namespace: {{ .Release.Namespace }}
   labels:
     {{- include "modelEngine.labels" . | nindent 4 }}
   annotations:
@@ -11,46 +12,110 @@ metadata:
 data:
   launch_service_config: |-
     dd_trace_enabled: {{ .Values.dd_trace_enabled | default false | quote }}
+
+    # Config to know where model-engine is running
     gateway_namespace: {{ .Release.Namespace | quote }}
-    {{- with .Values.config.values.launch }}
-    {{- range $key, $value := . }}
-    {{ $key }}: {{ $value | quote }}
-    {{- end }}
-    {{- end }}
-  infra_service_config: |-
-    env: {{ .Values.context | quote }}
-    {{- with .Values.config.values.infra }}
-    {{- range $key, $value := . }}
-    {{ $key }}: {{ $value | quote }}
-    {{- end }}
-    {{- end }}
 
----
+    # Config for Scale-hosted Hosted Model Inference in the prod cluster, plus assorted configuration notes.
+    # NOTE: If you add or change values in this file that must apply to all clusters, make the same change in
+    # every service_config_{env}.yaml file as well.
 
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: {{ include "modelEngine.fullname" . }}-service-config
-  namespace: {{ .Values.config.values.launch.endpoint_namespace }}
-  labels:
-    {{- include "modelEngine.labels" . | nindent 4 }}
-  annotations:
-    "helm.sh/hook": pre-install,pre-upgrade
-    "helm.sh/hook-weight": "-2"
-data:
-  launch_service_config: |-
-    dd_trace_enabled: {{ .Values.dd_trace_enabled | default false | quote }}
-    gateway_namespace: {{ .Release.Namespace | quote }}
-    {{- with .Values.config.values.launch }}
-    {{- range $key, $value := . }}
-    {{ $key }}: {{ $value | quote }}
-    {{- end }}
-    {{- end }}
+    # Config for scale-hosted Hosted Model Inference in the prod cluster, see `service_config` for more details
+    model_primitive_host: model-server.{{ .Release.Namespace }}.svc.cluster.local
+
+    # Endpoint config
+    # K8s namespace the endpoints will be created in
+    endpoint_namespace: {{ .Release.Namespace | quote }}
+
+    # Asynchronous endpoints
+    sqs_profile: {{ $.Values.serviceAccount.sqsProfileName | quote }}
+    sqs_queue_policy_template: |-
+      {
+          "Version": "2012-10-17",
+          "Id": "__default_policy_ID",
+          "Statement": [
+          {
+              "Sid": "__owner_statement",
+              "Effect": "Allow",
+              "Principal": {
+              "AWS": "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:root"
+              },
+              "Action": "sqs:*",
+              "Resource": "arn:{{ .Values.aws.partition }}:sqs:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:${queue_name}"
+          },
+          {
+              "Effect": "Allow",
+              "Principal": {
+              "AWS": "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:role/{{ $.Values.serviceAccount.sqsProfileName }}"
+              },
+              "Action": "sqs:*",
+              "Resource": "arn:{{ .Values.aws.partition }}:sqs:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:${queue_name}"
+          },
+          {
+              "Effect": "Allow",
+              "Principal": {
+              "AWS": "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:role/ml_hosted_model_inference"
+              },
+              "Action": "sqs:*",
+              "Resource": "arn:{{ .Values.aws.partition }}:sqs:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:${queue_name}"
+          }
+          ]
+      }
+
+    sqs_queue_tag_template: |-
+      {
+          "infra.scale.com/product": "{{ .Values.productTag }}",
+          "infra.scale.com/team": "${team}",
+          "infra.scale.com/contact": "{{ .Values.contactEmail }}",
+          "infra.scale.com/customer": "AllCustomers",
+          "infra.scale.com/financialOwner": "{{ .Values.contactEmail}}",
+          "Launch-Endpoint-Id": "${endpoint_id}",
+          "Launch-Endpoint-Name": "${endpoint_name}",
+          "Launch-Endpoint-Created-By": "${endpoint_created_by}"
+      }
+
+    # Billing (the ARN partition follows .Values.aws.partition, like every other ARN in this config)
+    billing_queue_arn: arn:{{ .Values.aws.partition }}:events:{{ .Values.aws.region }}:{{ .Values.aws.accountId }}:event-bus/money
+
+    # Redis requires auth, so a plain redis URL would not work; cache_url is pulled from the AWS secret named by cache_redis_aws_secret_name.
+    cache_redis_aws_secret_name: "{{ .Values.secrets.redisAwsSecretName }}"
+
+    cloud_file_llm_fine_tune_repository: "s3://{{ .Values.aws.s3Bucket }}/hosted-model-inference/llm-ft-job-repository/prod"
+
+    # dd_trace_enabled is rendered once, at the top of launch_service_config, from .Values.dd_trace_enabled (a second copy here would be a duplicate key).
+    istio_enabled: true
+    sensitive_log_mode: true
+    tgi_repository: "text-generation-inference"
+    vllm_repository: "vllm"
+    lightllm_repository: "lightllm"
+    tensorrt_llm_repository: "tensorrt-llm"
+    batch_inference_vllm_repository: "llm-engine/batch-infer-vllm"
+    user_inference_base_repository: "launch/inference"
+    user_inference_pytorch_repository: "hosted-model-inference/async-pytorch"
+    user_inference_tensorflow_repository: "hosted-model-inference/async-tensorflow-cpu"
+    docker_image_layer_cache_repository: "kaniko-cache"
+
+    # S3 access
+    hf_user_fine_tuned_weights_prefix: "s3://{{ .Values.aws.s3Bucket }}/hosted-model-inference/fine_tuned_weights"
   infra_service_config: |-
     env: {{ .Values.context | quote }}
-    {{- with .Values.config.values.infra }}
-    {{- range $key, $value := . }}
-    {{ $key }}: {{ $value | quote }}
-    {{- end }}
-    {{- end }}
+    cloud_provider: "aws"
+    # env is rendered once, on the first line of infra_service_config, from .Values.context (a second copy here would be a duplicate key).
+    k8s_cluster_name: "usgw1-prod"
+    dns_host_domain: "model-engine.ml-serving.{{ $.Values.global.networking.internalDomain }}"
+    default_region: "{{ .Values.aws.region }}"
+    ml_account_id: "{{ .Values.aws.accountId }}"
+    docker_repo_prefix: "{{ .Values.aws.accountId }}.dkr.ecr.{{ .Values.aws.region }}.amazonaws.com"
+    redis_host: "{{ .Values.redis.hostname }}"
+    s3_bucket: "{{ .Values.aws.s3Bucket }}"
+    profile_ml_worker: "ml-worker"
+    profile_ml_inference_worker: "ml-worker"
+    identity_service_url: "{{ .Values.identityServiceUrl }}"
+    firehose_role_arn: "arn:{{ .Values.aws.partition }}:iam::{{ .Values.aws.accountId }}:role/firehose-stream-logging-role"
+    firehose_stream_name: "{{ .Values.firehoseStreamName }}"
+    db_engine_pool_size: 20
+    db_engine_max_overflow: 10
+    db_engine_echo: false
+    db_engine_echo_pool: true
+    db_engine_disconnect_strategy: "pessimistic"
 {{- end }}
diff --git a/charts/model-engine/templates/service_template_config_map.yaml b/charts/model-engine/templates/service_template_config_map.yaml
@@ -95,14 +95,17 @@ data:
             {{- toYaml . | nindent 12 }}
           {{- end }}
           {{- if eq $device "gpu" }}
-          {{- if empty $node_selector }}
-          nodeSelector:
-          {{- end }}
-            k8s.amazonaws.com/accelerator: ${GPU_TYPE}
+          {{- /* GPU accelerator nodeSelector is disabled. It is kept as a template comment
+               because Helm still evaluates if/end actions placed inside YAML "#" comments.
+               Previously rendered (the nodeSelector header only when $node_selector was empty):
+                 nodeSelector / k8s.amazonaws.com/accelerator: ${GPU_TYPE} */}}
           tolerations:
             - key: "nvidia.com/gpu"
               operator: "Exists"
               effect: "NoSchedule"
+            - key: "gpu_a100_multi"
+              operator: "Exists"
+              effect: "NoSchedule"
           {{- end }}
           priorityClassName: ${PRIORITY}
           containers:
@@ -522,6 +525,7 @@ data:
         loadBalancer:
           simple: LEAST_REQUEST
   {{- end }}
+  {{- if and (.Capabilities.APIVersions.Has "autoscaling.k8s.io/v1") (.Values.autoscaling.vertical.enabled) }}
   vertical-pod-autoscaler.yaml: |-
     apiVersion: "autoscaling.k8s.io/v1"
     kind: VerticalPodAutoscaler
@@ -548,6 +552,7 @@ data:
               cpu: ${CPUS}
               memory: ${MEMORY}
             controlledResources: ["cpu", "memory"]
+  {{- end }}
   pod-disruption-budget.yaml: |-
     apiVersion: policy/v1
     kind: PodDisruptionBudget
@@ -675,14 +680,17 @@ data:
             {{- toYaml . | nindent 12 }}
           {{- end }}
           {{- if eq $device "gpu" }}
-          {{- if empty $node_selector }}
-          nodeSelector:
-          {{- end }}
-            k8s.amazonaws.com/accelerator: ${GPU_TYPE}
+          {{- /* GPU accelerator nodeSelector is disabled. It is kept as a template comment
+               because Helm still evaluates if/end actions placed inside YAML "#" comments.
+               Previously rendered (the nodeSelector header only when $node_selector was empty):
+                 nodeSelector / k8s.amazonaws.com/accelerator: ${GPU_TYPE} */}}
           tolerations:
             - key: "nvidia.com/gpu"
               operator: "Exists"
               effect: "NoSchedule"
+            - key: "gpu_a100_multi"
+              operator: "Exists"
+              effect: "NoSchedule"
           {{- end }}
           {{- if $service_template_service_account_name }}
           serviceAccountName: {{ $service_template_service_account_name }}
diff --git a/charts/model-engine/values.yaml b/charts/model-engine/values.yaml
@@ -5,7 +5,7 @@ redis:
   auth:
 db:
   runDbInitScript: false
-balloonNodeSelector:
-  node-lifecycle: normal
-nodeSelector:
-  node-lifecycle: normal
+# balloonNodeSelector:
+#   node-lifecycle: normal
+# nodeSelector:
+#   node-lifecycle: normal