---
# Docker Compose definition for the qwen3-codex-server service.
version: '3.8'

services:
  qwen3-codex-server:
    # Image is built from the Dockerfile in this directory.
    build: .
    ports:
      - "8000:8000"
    volumes:
      # Model weights are bind-mounted read-only from the host.
      - ./Qwen3-4B-Function-Calling-Pro.gguf:/app/Qwen3-4B-Function-Calling-Pro.gguf:ro
    environment:
      # Disable CUDA for CPU-only mode. Mapping syntax is used so the value
      # is a truly empty string; in list syntax (`- VAR=""`) the quote
      # characters would be passed through as part of the value.
      CUDA_VISIBLE_DEVICES: ""
    restart: unless-stopped
    healthcheck:
      # NOTE(review): requires `curl` inside the image and a /health
      # endpoint served on port 8000 - confirm against the Dockerfile/app.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    deploy:
      resources:
        limits:
          memory: 8G
        reservations:
          memory: 6G