使用llama.cpp部署RAG及语言大模型

疯子杨_Alston

268人浏览 · 2025-12-23 10:16:02

疯子杨_Alston · 2025-12-23 10:16:02 发布

使用llama.cpp部署RAG及语言大模型

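话不多说，直接上干货：下面是一份 docker-compose.yml，用 llama.cpp 的 server 镜像分别部署重排序（reranker）、向量化（embedding）和对话（Qwen3-4B）三个模型服务，并附带 LobeChat 聊天界面。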

services:
  # Reranking service: llama.cpp server loading the bge-reranker-v2-m3 GGUF,
  # published on host port 8081.
  llama-reranker:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llama-reranker
    ports:
      - "8081:8080"
    volumes:
      # Mount the local model directory into the container.
      - ./models:/models
    environment:
      LLAMA_ARG_ALIAS: bge-reranker-v2-m3
      # The model files can be downloaded directly from Hugging Face.
      LLAMA_ARG_MODEL: /models/bge-reranker-v2-m3-Q4_K_M.gguf
      LLAMA_ARG_RERANKING: "1"
    # The API key is taken from the LLAMA_API_KEY variable (shell environment
    # or .env file next to this compose file) so a real secret does not have
    # to be written into this file. The original sample key stays as the
    # fallback so existing setups keep working; override it for real use.
    # List form keeps a key containing spaces from being word-split.
    command:
      - "--api-key"
      - "${LLAMA_API_KEY:-aabbss}"
    restart: unless-stopped
    healthcheck:
      # Probes the server's /health route from inside the container.
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 30s

  # Embedding service: llama.cpp server loading the bge-large-zh-v1.5 GGUF,
  # published on host port 8082.
  bge-embedding:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: llama-embedding
    ports:
      - "8082:8080"
    volumes:
      - ./models:/models
    # All server options are passed as LLAMA_ARG_* variables, matching the
    # llama-reranker service; values are quoted so they stay strings.
    environment:
      LLAMA_ARG_ALIAS: bge-large-zh-v1.5
      LLAMA_ARG_MODEL: /models/bge-large-zh-v1.5.Q4_K_M.gguf
      LLAMA_ARG_HOST: "0.0.0.0"
      LLAMA_ARG_PORT: "8080"
      LLAMA_ARG_CTX_SIZE: "512"
      # NOTE(review): this service uses the plain `server` image tag and
      # reserves no GPU device; GPU offload presumably needs a GPU-enabled
      # image variant plus device access to take effect - confirm.
      LLAMA_ARG_N_GPU_LAYERS: "35"
      LLAMA_ARG_THREADS: "4"
      LLAMA_ARG_EMBEDDINGS: "1"
    # An API key can optionally be required by adding a `command:` entry
    # with `--api-key <key>`.
    restart: unless-stopped
    healthcheck:
      # Probes the server's /health route from inside the container.
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 50s
      
  # Chat model service: llama.cpp server loading the Qwen3-4B GGUF,
  # published on host port 8084.
  qwen:
    image: ghcr.io/ggml-org/llama.cpp:server
    container_name: qwen
    restart: unless-stopped
    ports:
      - "8084:8080"
    volumes:
      - ./models:/models
    # GPU offload (--n_gpu_layers 35) can be added here or through the
    # environment; size it to the available VRAM.
    command:
      - "--model"
      - "/models/Qwen3-4B-Q5_K_M.gguf"
      - "--ctx-size"
      - "1024"
      - "--threads"
      - "10"
      - "--alias"
      - "Qwen3-4B"
    healthcheck:
      # Probes the server's /health route from inside the container.
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://localhost:8080/health"
      interval: 40s
      timeout: 5s
      retries: 3
      start_period: 90s

  # LobeChat web UI, published on host port 3210.
  lobe-chat:
    image: lobehub/lobe-chat:latest
    container_name: lobe-chat
    restart: unless-stopped
    ports:
      - "3210:3210"
    healthcheck:
      # NOTE(review): assumes the image ships `wget` and serves /healthz -
      # confirm against the image in use.
      test:
        - "CMD"
        - "wget"
        - "--quiet"
        - "--tries=1"
        - "--spider"
        - "http://localhost:3210/healthz"
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s