# ═══════════════════════════════════════════════════════════
# MAC — MBM AI Cloud | Local Server Configuration
# ═══════════════════════════════════════════════════════════
# Copy this to .env: cp .env.example .env
# Then run: docker compose up -d
#
# Format: one KEY=value per line. Comments go on their own line,
# never after a value (some env-file parsers keep trailing text
# as part of the value).
# ═══════════════════════════════════════════════════════════

# ── App ───────────────────────────────────────────────────
MAC_ENV=development
MAC_HOST=0.0.0.0
MAC_PORT=8000
MAC_DEBUG=false
# Placeholder — replace with a long random string before real use.
MAC_SECRET_KEY=change-me-to-random-string
# NOTE(review): ["*"] presumably allows every origin — confirm and
# restrict to known origins outside local development.
MAC_CORS_ORIGINS=["*"]
# Uvicorn worker processes
MAC_WORKERS=4

# ── Database (PostgreSQL — persistent storage) ────────────
# Placeholder credentials — change the user/password for real deployments.
DATABASE_URL=postgresql+asyncpg://mac:mac_password@localhost:5432/mac_db
PGADMIN_PORT=5050
PGADMIN_DEFAULT_EMAIL=admin@mbm.local
# Placeholder — change before exposing pgAdmin.
PGADMIN_DEFAULT_PASSWORD=ChangeThisStrongPassword!

# ── Redis (rate limiting & caching) ──────────────────────
REDIS_URL=redis://localhost:6379/0

# ── JWT Auth ──────────────────────────────────────────────
# Placeholder — replace with a long random string before real use.
JWT_SECRET_KEY=change-me-jwt-secret-random-string
JWT_ALGORITHM=HS256
# Access-token lifetime in minutes (1440 = 24 hours)
JWT_ACCESS_TOKEN_EXPIRE_MINUTES=1440

# ── vLLM Local GPU Inference ─────────────────────────────
# Each model runs its own vLLM instance on a separate port.
# Docker Compose sets these automatically via service names.
VLLM_BASE_URL=http://localhost:8001
VLLM_SPEED_URL=http://localhost:8001
VLLM_CODE_URL=http://localhost:8002
VLLM_REASONING_URL=http://localhost:8003
VLLM_INTELLIGENCE_URL=http://localhost:8004
VLLM_API_KEY=
# HTTP timeout (seconds) for LLM requests
VLLM_TIMEOUT=120
# Timeout for model health checks
# (presumably seconds, like VLLM_TIMEOUT — confirm)
VLLM_HEALTH_TIMEOUT=5

# ── Model Registry ────────────────────────────────────────
# Override the entire model list with a JSON array (leave empty for defaults)
# Each object needs: id, name, served_name, url_key, category,
#   parameters, context_length, capabilities (list), specialty.
MAC_MODELS_JSON=

# Only enable specific models from the built-in list (comma-separated IDs)
# Example: MAC_ENABLED_MODELS=qwen2.5:7b,qwen2.5-coder:7b
MAC_ENABLED_MODELS=

# Which model ID the "auto" keyword falls back to (empty = first code model)
MAC_AUTO_FALLBACK=

# Default max_tokens when the client doesn't specify
MAC_DEFAULT_MAX_TOKENS=2048

# ── Docker Compose vLLM Tuning ────────────────────────────
# Adjust these to match your GPU VRAM. 24GB GPU example:
#   Speed (7B) ≈ 5GB, Code (7B) ≈ 5GB, Reason (14B) ≈ 9GB → 19GB total
VLLM_SPEED_MODEL=Qwen/Qwen2.5-7B-Instruct
VLLM_SPEED_PORT=8001
VLLM_SPEED_GPU_MEM=0.22
VLLM_SPEED_MAX_LEN=8192

VLLM_CODE_MODEL=Qwen/Qwen2.5-Coder-7B-Instruct
VLLM_CODE_PORT=8002
VLLM_CODE_GPU_MEM=0.22
VLLM_CODE_MAX_LEN=8192

VLLM_REASON_MODEL=deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
VLLM_REASON_PORT=8003
VLLM_REASON_GPU_MEM=0.35
VLLM_REASON_MAX_LEN=8192

# Allowed values: auto | float16 | bfloat16
VLLM_DTYPE=auto

# Intelligence slot (uncomment vllm-intel in docker-compose.yml first)
# VLLM_INTEL_MODEL=google/gemma-3-27b-it
# VLLM_INTEL_PORT=8004
# VLLM_INTEL_GPU_MEM=0.45
# VLLM_INTEL_MAX_LEN=4096

# ── Whisper / Speech-to-Text ─────────────────────────────
# Uncomment the whisper service in docker-compose.yml first.
# Uses OpenAI-compatible /v1/audio/transcriptions endpoint.
WHISPER_URL=http://localhost:8005
WHISPER_MODEL=Systran/faster-whisper-small
WHISPER_TIMEOUT=300

# ── Text-to-Speech ───────────────────────────────────────
# Uncomment the tts service in docker-compose.yml first.
# Uses OpenAI-compatible /v1/audio/speech endpoint.
TTS_URL=http://localhost:8006
TTS_MODEL=default
TTS_TIMEOUT=120

# ── Embeddings ────────────────────────────────────────────
# Optional separate embedding server. Leave empty to use VLLM_BASE_URL.
EMBEDDING_URL=
EMBEDDING_MODEL=nomic-embed-text
EMBEDDING_TIMEOUT=60

# ── Rate Limits ───────────────────────────────────────────
RATE_LIMIT_REQUESTS_PER_HOUR=100
RATE_LIMIT_TOKENS_PER_DAY=50000

# ── Qdrant (Vector DB for RAG) ───────────────────────────
QDRANT_URL=http://localhost:6333

# ── SearXNG (Web Search) ─────────────────────────────────
SEARXNG_URL=http://localhost:8888