Qwen3.5-122B-A10B 昇腾910B单机部署
·
一、环境依赖
| 组件 | 版本 |
| --- | --- |
| vllm-ascend镜像 | quay.io/ascend/vllm-ascend:v0.17.0rc1 |
| 驱动版本 | 25.5.1 |
二、启动容器
示例:
# Start a detached, privileged container on the host network with the Ascend
# driver/firmware directories and the model weights bind-mounted. The model
# directory on the host is exposed inside the container as /data.
#
# Changes from the original snippet:
#   * --privileged=true was passed twice; it is now given once.
#   * The image was referenced by a local image ID (0fa7e4550d22), which only
#     resolves on the machine where it was pulled. The image tag listed in the
#     environment section is used instead.
#     NOTE(review): confirm that ID corresponds to this tag; if you use a
#     different local image, substitute it on the last line.
docker run -itd --name test0402 \
  --shm-size=500g \
  --privileged=true \
  --net=host \
  -v /var/queue_schedule:/var/queue_schedule \
  -v /etc/ascend_install.info:/etc/ascend_install.info \
  -v /usr/local/sbin:/usr/local/sbin \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
  -v /data/models/Qwen/Qwen3___5-122B-A10B:/data \
  --device=/dev/davinci_manager:/dev/davinci_manager \
  --device=/dev/hisi_hdc:/dev/hisi_hdc \
  --entrypoint=bash \
  quay.io/ascend/vllm-ascend:v0.17.0rc1
三、容器内拉起vllm服务
# Environment for the vLLM server process; run these in the same shell that
# will execute `vllm serve`.

# Expose eight NPUs (indices 0-7) to the process.
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"

# The original exported PYTORCH_NPU_ALLOC_CONF twice in a row, so the second
# value (max_split_size_mb:256) silently replaced the first
# (expandable_segments:True). Both options are now set in one comma-separated
# assignment.
# NOTE(review): confirm against the torch_npu documentation that these two
# options may be combined; if not, keep only the one you intend.
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True,max_split_size_mb:256"

# HCCL settings. HCCL_IF_IP is this host's address; change it to match your
# machine (the same address is used for --host when starting the server).
export HCCL_IF_IP="10.14.10.222"
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_BUFFSIZE=1024

export OMP_NUM_THREADS=1

# Preload jemalloc. Append any pre-existing LD_PRELOAD only when it is
# non-empty, so the value never ends with a stray ':'.
export LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libjemalloc.so.2${LD_PRELOAD:+:${LD_PRELOAD}}"

export TASK_QUEUE_ENABLE=1

# vllm-ascend feature switches.
export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
export VLLM_ASCEND_ENABLE_NZ=1
# Launch the vLLM server for the model mounted at /data.
# The options are collected in a bash array so related flags can be grouped
# and annotated; the resulting argument list and order are the same as a
# single long `vllm serve ...` command line.
vllm_args=(
  # Served model name and listen address (the curl example targets these).
  --served-model-name "qwen3.5-122b-a10b"
  --host 10.14.10.222
  --port 5678

  # Parallelism and capacity limits.
  --tensor-parallel-size 8
  --max-model-len 128000
  --max-num-batched-tokens 8192
  --max-num-seqs 128
  --gpu-memory-utilization 0.95

  # Graph capture configuration (JSON passed through verbatim).
  --compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY", "cudagraph_capture_sizes":[1,4,8,12,16,24,32,48,56,64]}'

  --trust-remote-code

  # Tool calling.
  --enable-auto-tool-choice
  --tool-call-parser qwen3_coder

  --async-scheduling

  # Multimodal input handling.
  # NOTE(review): a media path of `/` covers the whole container filesystem;
  # consider narrowing it outside of test setups.
  --allowed-local-media-path /
  --mm_processor_cache_type="shm"
  --mm-processor-cache-gb 0

  # Speculative decoding configuration (JSON passed through verbatim).
  --speculative-config '{"num_speculative_tokens": 3, "method":"qwen3_5_mtp", "enforce_eager": true}'

  # Extra platform options (JSON passed through verbatim).
  --additional-config '{"enable_cpu_binding":true, "multistream_overlap_shared_expert": true, "enable_weight_nz_layout":true}'
)
vllm serve /data "${vllm_args[@]}"
四、curl请求
# Send a streaming chat-completion request to the server's
# /v1/chat/completions endpoint. The host and port must match the --host and
# --port values used when starting the server.
curl \
  --header "Accept: application/json" \
  --header "Content-type: application/json" \
  --request POST \
  --data '
{
"model":"qwen3.5-122b-a10b",
"messages": [
{
"role": "system",
"content": "思考过程和输出结果都用中文"
},
{
"role": "user",
"content": "你是谁"
}
],
"max_tokens": 32768,
"presence_penalty":1.5,
"repetition_penalty":1.0,
"top_k": 20,
"top_p": 0.8,
"temperature": 0.7,
"chat_template_kwargs": {"enable_thinking": false},
"stream": true
}' \
  http://10.14.10.222:5678/v1/chat/completions
更多推荐


所有评论(0)