使用llama.cpp部署RAG及语言大模型
·
使用llama.cpp部署RAG及语言大模型
话不多说,直接上干货。下面是一份完整的 docker-compose.yml,包含重排序(reranker)、向量嵌入(embedding)、Qwen3 对话模型以及 LobeChat 前端四个服务:
# Docker Compose stack: three llama.cpp server containers (reranker,
# embedding, chat LLM) plus a LobeChat web UI.
# All llama.cpp containers listen on 8080 internally and are published on
# distinct host ports (8081 / 8082 / 8084). GGUF model files are read from
# ./models on the host, mounted at /models in each container.
services:
  # Reranking endpoint serving bge-reranker-v2-m3.
  llama-reranker:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llama-reranker
    ports:
      - "8081:8080"
    volumes:
      - ./models:/models  # mount the local model directory
    environment:
      LLAMA_ARG_ALIAS: bge-reranker-v2-m3
      # The GGUF models can be downloaded directly from Hugging Face.
      LLAMA_ARG_MODEL: /models/bge-reranker-v2-m3-Q4_K_M.gguf
      LLAMA_ARG_RERANKING: "1"
    # The API key defaults to the placeholder "aabbss"; set
    # LLAMA_RERANKER_API_KEY in the shell or an .env file to override it
    # instead of committing a real secret to this file.
    command: >-
      --api-key ${LLAMA_RERANKER_API_KEY:-aabbss}
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # Embedding endpoint serving bge-large-zh-v1.5.
  bge-embedding:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llama-embedding
    ports:
      - "8082:8080"
    volumes:
      - ./models:/models
    environment:
      LLAMA_ARG_MODEL: /models/bge-large-zh-v1.5.Q4_K_M.gguf
      LLAMA_ARG_HOST: 0.0.0.0
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_CTX_SIZE: "512"
      # NOTE(review): the plain `server` tag is presumably the CPU build, in
      # which case GPU offload needs a GPU image variant and a device
      # reservation to take effect -- confirm.
      LLAMA_ARG_N_GPU_LAYERS: "35"
      LLAMA_ARG_THREADS: "4"
      LLAMA_ARG_EMBEDDINGS: "1"
    # An --api-key can be added to the command here as well.
    command: >-
      --alias bge-large-zh-v1.5
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 50s

  # Chat LLM endpoint serving Qwen3-4B.
  qwen:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: qwen
    ports:
      - "8084:8080"
    volumes:
      - ./models:/models
    # GPU offload (e.g. --n-gpu-layers 35) can be set in the command or via
    # the environment, tuned to the available VRAM.
    command: >-
      --model /models/Qwen3-4B-Q5_K_M.gguf
      --ctx-size 1024
      --threads 10
      --alias Qwen3-4B
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 40s
      timeout: 5s
      retries: 3
      start_period: 90s

  # LobeChat web UI, published on host port 3210.
  lobe-chat:
    image: lobehub/lobe-chat:latest
    container_name: lobe-chat
    ports:
      - "3210:3210"
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost:3210/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
更多推荐




所有评论(0)