# NOTE: the top-level `version` key is obsolete under the Compose
# Specification; Docker Compose v2 treats it as informative only and prints a
# warning. It is only meaningful to legacy docker-compose v1 tooling.
version: '3.8'

services:
  # Single service built from the Dockerfile in this directory; it listens on
  # port 8000 and mounts the GGUF file below.
  qwen3-codex-server:
    build: .
    ports:
      # host:container
      - "8000:8000"
    volumes:
      # Mount the model file read-only. The host file must exist before
      # `up`; with this short bind-mount syntax Docker creates a directory at
      # a missing source path.
      - ./Qwen3-4B-Function-Calling-Pro.gguf:/app/Qwen3-4B-Function-Calling-Pro.gguf:ro
    environment:
      # Hide all GPUs from CUDA so the server runs in CPU-only mode.
      # Mapping form is used deliberately: in list form
      # (`- CUDA_VISIBLE_DEVICES=""`) the quotes are passed through literally
      # and the variable ends up set to the two characters `""` rather than
      # to an empty string.
      CUDA_VISIBLE_DEVICES: ""
    restart: unless-stopped
    healthcheck:
      # Runs inside the container, so `curl` must be present in the image
      # (NOTE(review): confirm the Dockerfile installs it).
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      # Failures during this initial window do not count towards `retries`.
      start_period: 40s
    deploy:
      resources:
        limits:
          # Upper bound on container memory.
          memory: 8G
        reservations:
          # Amount of memory reserved for the container.
          memory: 6G