临时Web服务
Python3
# 在当前目录启动服务器(端口8000)
python3 -m http.server
# 指定端口
python3 -m http.server 8080
# 绑定到所有网络接口
python3 -m http.server 8000 --bind 0.0.0.0
# 指定目录
python3 -m http.server --directory /path/to/dir
Python2
# 在当前目录启动服务器(端口8000)
python -m SimpleHTTPServer 8000
https://my.feishu.cn/wiki/KQbbw0aQ9iEYSxkOehOceNFLnde?from=from_copylink
https://my.feishu.cn/wiki/KQbbw0aQ9iEYSxkOehOceNFLnde?from=from_copylink
vllm启动命令:
master:
# Master node: start the vLLM API server for the 2-way data-parallel
# deployment. On this node local_ip and node0_ip are the same address.
nic_name="enp61s0f2"     # NIC name handed to the GLOO/TP/HCCL socket settings
local_ip="10.91.10.151"  # this node's IP, exported as HCCL_IF_IP
node0_ip="10.91.10.151"  # address passed to --data-parallel-address

# Bind the communication libraries to this node's IP and NIC.
export HCCL_IF_IP="$local_ip"
export GLOO_SOCKET_IFNAME="$nic_name"
export TP_SOCKET_IFNAME="$nic_name"
export HCCL_SOCKET_IFNAME="$nic_name"

# Runtime tuning knobs, values kept exactly as in the original notes.
# NOTE(review): their precise semantics are defined by the CANN/HCCL and
# vLLM-Ascend documentation, not by this snippet - confirm before changing.
export HCCL_OP_EXPANSION_MODE="AIV"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=200
export VLLM_ASCEND_ENABLE_MLAPO=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
export HCCL_CONNECT_TIMEOUT=120
export HCCL_INTRA_PCIE_ENABLE=1
export HCCL_INTRA_ROCE_ENABLE=0
export ACL_OP_INIT_MODE=1

# Serve the model directory on 0.0.0.0:9001 under the name DeepSeek-V3.2,
# with 2 data-parallel ranks in total (1 on this node) and tensor-parallel
# size 8. No --data-parallel-start-rank is given here (the slave passes 1).
# stdout and stderr are appended to /root/vllm.log and the process is sent
# to the background, so this snippet returns immediately.
vllm serve /data/vllm-ascend/DeepSeek-V3___2-W8A8 \
  --host 0.0.0.0 \
  --port 9001 \
  --data-parallel-size 2 \
  --data-parallel-size-local 1 \
  --data-parallel-address "$node0_ip" \
  --data-parallel-rpc-port 13389 \
  --tensor-parallel-size 8 \
  --quantization ascend \
  --seed 1024 \
  --served-model-name DeepSeek-V3.2 \
  --enable-expert-parallel \
  --max-num-seqs 1 \
  --max-model-len 32768 \
  --max-num-batched-tokens 4096 \
  --trust-remote-code \
  --no-enable-prefix-caching \
  --gpu-memory-utilization 0.96 \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6]}' \
  --speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}' >> /root/vllm.log 2>&1 &
slave:
# Slave node: start a headless vLLM process that joins the 2-way
# data-parallel deployment whose rendezvous address is node0_ip.
nic_name="enp61s0f2"     # NIC name handed to the GLOO/TP/HCCL socket settings
local_ip="10.91.10.152"  # this node's IP, exported as HCCL_IF_IP
node0_ip="10.91.10.151"  # address passed to --data-parallel-address

# Bind the communication libraries to this node's IP and NIC.
export HCCL_IF_IP="$local_ip"
export GLOO_SOCKET_IFNAME="$nic_name"
export TP_SOCKET_IFNAME="$nic_name"
export HCCL_SOCKET_IFNAME="$nic_name"

# Runtime tuning knobs, values kept exactly as in the original notes.
# NOTE(review): their precise semantics are defined by the CANN/HCCL and
# vLLM-Ascend documentation, not by this snippet - confirm before changing.
export HCCL_OP_EXPANSION_MODE="AIV"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=200
export VLLM_ASCEND_ENABLE_MLAPO=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export VLLM_ASCEND_ENABLE_FLASHCOMM1=0
export HCCL_CONNECT_TIMEOUT=120
export HCCL_INTRA_PCIE_ENABLE=1
export HCCL_INTRA_ROCE_ENABLE=0
export ACL_OP_INIT_MODE=1

# Same model and parallel layout as the master command, plus --headless and
# --data-parallel-start-rank 1 (this node hosts 1 of the 2 data-parallel
# ranks, starting at rank 1).
# NOTE(review): --host/--port are passed together with --headless; confirm
# whether a headless node actually uses them.
# stdout and stderr are appended to /root/vllm.log and the process is sent
# to the background, so this snippet returns immediately.
vllm serve /data/vllm-ascend/DeepSeek-V3___2-W8A8 \
  --host 0.0.0.0 \
  --port 9001 \
  --headless \
  --data-parallel-size 2 \
  --data-parallel-size-local 1 \
  --data-parallel-start-rank 1 \
  --data-parallel-address "$node0_ip" \
  --data-parallel-rpc-port 13389 \
  --tensor-parallel-size 8 \
  --quantization ascend \
  --seed 1024 \
  --served-model-name DeepSeek-V3.2 \
  --enable-expert-parallel \
  --max-num-seqs 1 \
  --max-model-len 32768 \
  --max-num-batched-tokens 4096 \
  --trust-remote-code \
  --no-enable-prefix-caching \
  --gpu-memory-utilization 0.96 \
  --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "cudagraph_capture_sizes":[3,6]}' \
  --speculative-config '{"num_speculative_tokens": 2, "method": "deepseek_mtp"}' >> /root/vllm.log 2>&1 &
本站的资源来自转载或站长的原创,按照CC BY-NC-SA 3.0 CN 协议发布和共享。转载或引用本站文章应遵循相同协议。
如果有侵犯版权的资源,请尽快联系站长,我们会删除有争议的资源。


