## 一、为什么需要容器化

### 1.1 容器化优势

```text
裸机部署:
  环境依赖复杂,版本冲突
  扩缩容困难,手动运维
  资源隔离差,互相影响

容器化部署:
  环境一致性,开箱即用
  弹性扩缩容,自动运维
  资源隔离,互不影响
  版本管理,灰度发布
```

### 1.2 CANN 容器架构

```text
┌──────────────────────────────────────┐
│           Kubernetes 集群            │
├──────────────────────────────────────┤
│  ┌─────────────┐  ┌─────────────┐    │
│  │   Pod 0     │  │   Pod 1     │    │
│  │ ┌─────────┐ │  │ ┌─────────┐ │    │
│  │ │推理服务 │ │  │ │推理服务 │ │    │
│  │ └─────────┘ │  │ └─────────┘ │    │
│  │ /dev/davinci0│  │ /dev/davinci1│    │
│  └─────────────┘  └─────────────┘    │
├──────────────────────────────────────┤
│           宿主机 (Host OS)           │
│     ┌──────────────────────────┐     │
│     │  CANN 驱动 + Docker NPU  │     │
│     └──────────────────────────┘     │
└──────────────────────────────────────┘
```

## 二、Docker 环境搭建

### 2.1 基础镜像

```dockerfile
# Dockerfile.cann-base
FROM ubuntu:20.04

# 安装基础依赖
RUN apt-get update && apt-get install -y \
    build-essential \
    cmake \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# 安装 CANN
COPY Ascend-cann-toolkit_8.2.RC1_linux-aarch64.run /tmp/
RUN /tmp/Ascend-cann-toolkit_8.2.RC1_linux-aarch64.run --install --quiet \
    && rm /tmp/Ascend-cann-toolkit_8.2.RC1_linux-aarch64.run

# 设置环境变量
ENV ASCEND_HOME=/usr/local/Ascend
ENV PATH=${ASCEND_HOME}/ascend-toolkit/bin:${ASCEND_HOME}/nnae/bin:${PATH}
ENV LD_LIBRARY_PATH=${ASCEND_HOME}/ascend-toolkit/lib64:${ASCEND_HOME}/driver/lib64:${LD_LIBRARY_PATH}

# 验证安装
RUN npu-smi info

WORKDIR /workspace
```

### 2.2 构建推理镜像

```dockerfile
# Dockerfile.inference
FROM cann-base:latest

# 安装 Python
RUN apt-get update && apt-get install -y python3 python3-pip && \
    pip3 install --upgrade pip

# 安装依赖
COPY requirements.txt /tmp/
RUN pip3 install -r /tmp/requirements.txt

# 复制模型和代码
COPY model/ /workspace/model/
COPY src/ /workspace/src/

# 暴露端口
EXPOSE 50051

# 启动命令
CMD ["python3", "/workspace/src/server.py", "--model", "/workspace/model/model.om"]
```

### 2.3 构建与运行

```bash
# 构建基础镜像
docker build -f Dockerfile.cann-base -t cann-base:latest .

# 构建推理镜像
docker build -f Dockerfile.inference -t inference-server:latest .
```
```bash
# 运行容器 (需要 NPU 设备)
docker run -d \
  --name inference \
  --device /dev/davinci0 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/Ascend:/usr/local/Ascend \
  -p 50051:50051 \
  inference-server:latest
```

## 三、NPU 设备透传

### 3.1 设备挂载参数

```bash
# 必须挂载的设备:
#   --device /dev/davinci0          NPU 设备节点
#   --device /dev/davinci_manager   NPU 管理器
#   --device /dev/devmm_svm         设备内存管理
#   --device /dev/hisi_hdc          HDC 通信
#   -v /usr/local/Ascend            CANN 驱动
docker run -d \
  --device /dev/davinci0 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/Ascend:/usr/local/Ascend \
  inference-server:latest

# 挂载所有 NPU (多卡)
docker run -d \
  --device /dev/davinci0 \
  --device /dev/davinci1 \
  --device /dev/davinci2 \
  --device /dev/davinci3 \
  --device /dev/davinci_manager \
  --device /dev/devmm_svm \
  --device /dev/hisi_hdc \
  -v /usr/local/Ascend:/usr/local/Ascend \
  inference-server:latest
```

### 3.2 Docker Compose 配置

```yaml
# docker-compose.yml
version: '3.8'

services:
  inference:
    build:
      context: .
      dockerfile: Dockerfile.inference
    container_name: inference-server
    restart: unless-stopped
    ports:
      - "50051:50051"
    devices:
      - /dev/davinci0:/dev/davinci0
      - /dev/davinci_manager:/dev/davinci_manager
      - /dev/devmm_svm:/dev/devmm_svm
      - /dev/hisi_hdc:/dev/hisi_hdc
    volumes:
      - /usr/local/Ascend:/usr/local/Ascend
      - ./model:/workspace/model
    environment:
      - ASCEND_HOME=/usr/local/Ascend
      - PYTHONUNBUFFERED=1
    deploy:
      resources:
        limits:
          memory: 16G
        reservations:
          memory: 8G

  nginx:
    image: nginx:latest
    ports:
      - "80:80"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf
    depends_on:
      - inference
```

## 四、Kubernetes 部署

### 4.1 NPU Device Plugin

```yaml
# npu-device-plugin.yml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: ascend-npu-device-plugin
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: ascend-npu-device-plugin
  template:
    metadata:
      labels:
        name: ascend-npu-device-plugin
    spec:
      tolerations:
        - key: CriticalAddonsOnly
          operator: Exists
      priorityClassName: system-node-critical
      containers:
        - name: npu-device-plugin
          image: ascend-k8sdeviceplugin/amd64-npu-plugin:latest
          imagePullPolicy: IfNotPresent
          securityContext:
            privileged: true
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: davinci
              mountPath: /dev/davinci
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
        - name: davinci
          hostPath:
            path: /dev/davinci
```

### 4.2 推理服务 Deployment

```yaml
# inference-deployment.yml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: inference-server
  namespace: default
spec:
  replicas: 3
  selector:
    matchLabels:
      app: inference-server
  template:
    metadata:
      labels:
        app: inference-server
    spec:
      containers:
        - name: inference
          image: inference-server:latest
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 50051
              name: grpc
          resources:
            limits:
              huawei.com/npu: 1  # 请求 1 张 NPU
            requests:
              huawei.com/npu: 1
          volumeMounts:
            - name: model-volume
              mountPath: /workspace/model
          readinessProbe:
            grpc:
              port: 50051
            initialDelaySeconds: 10
            periodSeconds: 5
          livenessProbe:
            grpc:
              port: 50051
            initialDelaySeconds: 15
            periodSeconds: 10
      volumes:
        - name: model-volume
          persistentVolumeClaim:
            claimName: model-pvc
---
apiVersion: v1
kind: Service
metadata:
  name: inference-service
  namespace: default
spec:
  selector:
    app: inference-server
  ports:
    - name: grpc
      port: 50051
      targetPort: 50051
  type: ClusterIP
```

### 4.3 自动扩缩容

```yaml
# hpa.yml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: inference-hpa
  namespace: default
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: inference-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Pods
      pods:
        metric:
          name: inference_queue_size
        target:
          type: AverageValue
          averageValue: "10"
  behavior:
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
        - type: Pods
          value: 2
          periodSeconds: 60
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Pods
          value: 1
          periodSeconds: 120
```

## 五、资源限制与隔离

### 5.1 NPU 资源配额

```yaml
# resource-quota.yml
apiVersion: v1
kind: ResourceQuota
metadata:
  name: npu-quota
  namespace: inference
spec:
  hard:
    requests.huawei.com/npu: "8"
    limits.huawei.com/npu: "8"
    requests.cpu: "32"
    limits.cpu: "64"
    requests.memory: "128Gi"
    limits.memory: "256Gi"
```

### 5.2 Pod 资源限制

```yaml
# pod-with-limits.yml
apiVersion: v1
kind: Pod
metadata:
  name: inference-pod
spec:
  containers:
    - name: inference
      image: inference-server:latest
      resources:
        limits:
          huawei.com/npu: 1
          cpu: "8"
          memory: "32Gi"
        requests:
          huawei.com/npu: 1
          cpu: "4"
          memory: "16Gi"
```

## 六、监控与日志

### 6.1 Prometheus 指标暴露

```python
import time

from prometheus_client import Counter, Histogram, Gauge, start_http_server

# 定义指标
INFERENCE_REQUESTS = Counter(
    'inference_requests_total',
    'Total inference requests',
    ['model_name', 'status']
)

INFERENCE_LATENCY = Histogram(
    'inference_latency_seconds',
    'Inference latency',
    ['model_name'],
    buckets=[0.01, 0.05, 0.1, 0.5, 1.0, 2.0, 5.0]
)

NPU_MEMORY_USAGE = Gauge(
    'npu_memory_usage_bytes',
    'NPU memory usage',
    ['device']
)

# 在推理中记录指标
def predict_with_metrics(model_name, input_data):
    start = time.time()
    try:
        output = server.Inference(input_data)
        latency = time.time() - start
        INFERENCE_REQUESTS.labels(model_name=model_name, status='success').inc()
        INFERENCE_LATENCY.labels(model_name=model_name).observe(latency)
        return output
    except Exception as e:
        INFERENCE_REQUESTS.labels(model_name=model_name, status='error').inc()
        raise

# 启动指标服务器
start_http_server(8000)
```

### 6.2 Grafana 看板

```json
{
  "dashboard": {
    "title": "CANN Inference Dashboard",
    "panels": [
      {
        "title": "QPS",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(inference_requests_total[5m])",
            "legendFormat": "{{model_name}}"
          }
        ]
      },
      {
        "title": "P99 Latency",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.99, rate(inference_latency_seconds_bucket[5m]))",
            "legendFormat": "{{model_name}}"
          }
        ]
      },
      {
        "title": "NPU Memory",
        "type": "graph",
        "targets": [
          {
            "expr": "npu_memory_usage_bytes",
            "legendFormat": "{{device}}"
          }
        ]
      }
    ]
  }
}
```

## 七、常见问题

| 问题 | 原因 | 解决方案 |
| --- | --- | --- |
| 容器内 NPU 不可用 | 设备未挂载 | 添加 --device 参数 |
| NPU 驱动版本不匹配 | 镜像与宿主机驱动版本不一致 | 使用相同版本的 CANN |
| 推理性能下降 | 容器资源限制太严 | 增加 CPU/内存限制 |
| Pod 无法调度 | NPU 资源不足 | 扩容集群或减少副本数 |
| OOM Killed | 显存/内存超限 | 增加资源限制或优化模型 |
## 相关仓库

- ascend-docker - 昇腾 Docker 工具: https://gitee.com/ascend/ascend-docker
- k8s-device-plugin - K8s NPU 插件: https://gitee.com/ascend/k8s-device-plugin
- ascend-operator - K8s Ascend Operator: https://gitee.com/ascend/ascend-operator